wufan committed on
Commit
3257a37
·
verified ·
1 Parent(s): a8003ea

Update modules/latex2bbox_color.py

Browse files
Files changed (1) hide show
  1. modules/latex2bbox_color.py +22 -1
modules/latex2bbox_color.py CHANGED
@@ -154,7 +154,27 @@ def extrac_bbox_from_color_image(image_path, color_list):
154
 
155
  def contains_chinese(text):
156
  # 匹配中文字符的正则表达式范围
157
- return re.search(r'[\u4e00-\u9fff]', text) is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  def latex2bbox_color(input_arg):
160
  latex, basename, output_path, temp_dir, total_color_list = input_arg
@@ -164,6 +184,7 @@ def latex2bbox_color(input_arg):
164
  if contains_chinese(latex):
165
  template = formular_template_zh
166
  latex = latex.replace(",", ", ").replace(":", ": ").replace(";", "; ")
 
167
  else:
168
  template = formular_template
169
  output_bbox_path = os.path.join(output_path, 'bbox', basename+'.jsonl')
 
154
 
155
def contains_chinese(text):
    """Report whether ``text`` contains at least one Chinese character.

    A character counts as Chinese when it lies in U+4E00-U+9FFF or
    U+3400-U+4DBF, the two ranges listed in the character class below.

    Args:
        text: The string to scan.

    Returns:
        True if any character of ``text`` is in one of the ranges,
        otherwise False.
    """
    # Regex character class matching Chinese characters.
    first_hit = re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text)
    # A match object is always truthy, so this is True exactly on a hit.
    return bool(first_hit)
158
+
159
def wrap_chinese_in_text(latex_text):
    """Wrap every run of Chinese characters in ``\\text{...}``.

    A run is a maximal sequence of consecutive characters in U+4E00-U+9FFF
    or U+3400-U+4DBF. A run is left untouched only when it is immediately
    preceded by ``\\text{`` and immediately followed by ``}``, i.e. when it
    already fills a ``\\text{...}`` group on its own. Every other run is
    wrapped, including runs that share a ``\\text{...}`` group with other
    characters.

    Args:
        latex_text: The LaTeX source string to process.

    Returns:
        A new string with the qualifying runs wrapped; all other text is
        unchanged.
    """
    opener = '\\text{'
    opener_len = len(opener)
    # One or more consecutive Chinese characters.
    run_pattern = r'[\u4e00-\u9fff\u3400-\u4dbf]' + '+'

    def _wrap_run(hit):
        run = hit.group(0)
        begin = hit.start()
        finish = hit.end()
        # Is the run already enclosed as \text{<run>}? The opener must sit
        # directly before the run (so the run starts at least opener_len
        # characters in) and a closing brace directly after it.
        has_opener = begin >= opener_len and latex_text.startswith(
            opener, begin - opener_len
        )
        if has_opener and latex_text.startswith('}', finish):
            return run
        return f'\\text{{{run}}}'

    # Replace every run of consecutive Chinese characters.
    return re.sub(run_pattern, _wrap_run, latex_text)
178
 
179
  def latex2bbox_color(input_arg):
180
  latex, basename, output_path, temp_dir, total_color_list = input_arg
 
184
  if contains_chinese(latex):
185
  template = formular_template_zh
186
  latex = latex.replace(",", ", ").replace(":", ": ").replace(";", "; ")
187
+ latex = wrap_chinese_in_text(latex)
188
  else:
189
  template = formular_template
190
  output_bbox_path = os.path.join(output_path, 'bbox', basename+'.jsonl')