Spaces:
Running
Running
Update modules/latex2bbox_color.py
Browse files- modules/latex2bbox_color.py +22 -1
modules/latex2bbox_color.py
CHANGED
|
@@ -154,7 +154,27 @@ def extrac_bbox_from_color_image(image_path, color_list):
|
|
| 154 |
|
| 155 |
def contains_chinese(text):
    """Return True if *text* has at least one CJK Unified Ideograph.

    Only the basic block U+4E00..U+9FFF is checked.
    """
    # A match object is always truthy, so bool() mirrors "is not None".
    return bool(re.search(r'[\u4e00-\u9fff]', text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
def latex2bbox_color(input_arg):
|
| 160 |
latex, basename, output_path, temp_dir, total_color_list = input_arg
|
|
@@ -164,6 +184,7 @@ def latex2bbox_color(input_arg):
|
|
| 164 |
if contains_chinese(latex):
|
| 165 |
template = formular_template_zh
|
| 166 |
latex = latex.replace(",", ", ").replace(":", ": ").replace(";", "; ")
|
|
|
|
| 167 |
else:
|
| 168 |
template = formular_template
|
| 169 |
output_bbox_path = os.path.join(output_path, 'bbox', basename+'.jsonl')
|
|
|
|
| 154 |
|
| 155 |
def contains_chinese(text):
    """Return True if *text* has at least one Chinese ideograph.

    Checks the CJK Unified Ideographs block (U+4E00..U+9FFF) and
    Extension A (U+3400..U+4DBF).
    """
    # A match object is always truthy, so bool() mirrors "is not None".
    return bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text))
|
| 158 |
+
|
| 159 |
+
def wrap_chinese_in_text(latex_text):
    r"""Wrap every run of Chinese characters in ``\text{...}``.

    A run is left untouched when it already sits anywhere inside a
    ``\text{...}`` group, not only when it is the group's entire content.
    So ``\text{速度 v}`` stays as is instead of becoming
    ``\text{\text{速度} v}``.

    Args:
        latex_text: LaTeX source string to process.

    Returns:
        The string with each unwrapped run of consecutive Chinese
        characters (U+4E00..U+9FFF, U+3400..U+4DBF) replaced by
        ``\text{<run>}``. All other characters are returned unchanged.
    """
    token_pattern = re.compile(
        r'\\text\{'        # opening of a \text{...} group
        r'|\\[A-Za-z]+'    # any other control word (\frac, \textbf, ...)
        r'|\\.'            # control symbol (\\, \{, \}, ...): never a group
        r'|[{}]'           # plain group delimiters
        r'|[\u4e00-\u9fff\u3400-\u4dbf]+',  # run of consecutive Chinese chars
        re.DOTALL,
    )
    # One entry per currently open brace group: True when the group was
    # opened by \text{, False for any other "{".
    open_groups = []

    def rewrite(match):
        token = match.group(0)
        if token == '\\text{':
            open_groups.append(True)
        elif token == '{':
            open_groups.append(False)
        elif token == '}':
            # Tolerate unbalanced input: ignore a stray closing brace.
            if open_groups:
                open_groups.pop()
        elif token[0] != '\\' and not any(open_groups):
            # A Chinese run that is not enclosed by any \text{...} group.
            return f'\\text{{{token}}}'
        return token

    # Characters that match no alternative are copied through by re.sub.
    return token_pattern.sub(rewrite, latex_text)
|
| 178 |
|
| 179 |
def latex2bbox_color(input_arg):
|
| 180 |
latex, basename, output_path, temp_dir, total_color_list = input_arg
|
|
|
|
| 184 |
if contains_chinese(latex):
|
| 185 |
template = formular_template_zh
|
| 186 |
latex = latex.replace(",", ", ").replace(":", ": ").replace(";", "; ")
|
| 187 |
+
latex = wrap_chinese_in_text(latex)
|
| 188 |
else:
|
| 189 |
template = formular_template
|
| 190 |
output_bbox_path = os.path.join(output_path, 'bbox', basename+'.jsonl')
|