Spaces:

SynaptechX
/

ImgTextParser

Sleeping

App Files Files Community

nihuajian committed on Aug 21

Commit

fef840c

verified ·

1 Parent(s): 621012b

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -6

app.py CHANGED Viewed

@@ -101,20 +101,35 @@ def clean_formula_output(text):
 def clean_text_output(text):
     """Clean raw model output, keeping only the recognized text lines.

     Markdown code-fence markers are removed, every line is stripped of
     surrounding whitespace, and blank lines as well as lines that start
     with a known explanatory phrase are dropped.

     Args:
         text: Raw text to clean.

     Returns:
         The remaining lines joined with newline characters.
     """
     # Phrases that introduce explanatory sentences rather than recognized text.
     explanatory_prefixes = (
         '图片中的文字', '识别结果', '文字内容', '根据图片', '这张图片', '该图片',
     )
     # Remove code-fence markers; the tagged form goes first so the bare
     # fence replacement does not leave a stray "text" behind.
     cleaned_text = text.replace('```text', '').replace('```', '').strip()
     stripped_lines = (raw_line.strip() for raw_line in cleaned_text.split('\n'))
     # str.startswith accepts a tuple, so a single call tests every prefix.
     text_lines = [
         line for line in stripped_lines
         if line and not line.startswith(explanatory_prefixes)
     ]
     return '\n'.join(text_lines)

 def clean_text_output(text):
     """Clean raw model output, keeping only the recognized text.

     Processing steps:
       1. Remove Markdown code-fence markers.
       2. Strip each line and drop blank lines.
       3. Drop lines that start with a known explanatory phrase.
       4. If a line begins with a short label (at most 10 characters before
          the first colon, containing a known label keyword), keep only the
          text after the colon. Both the full-width colon and the ASCII
          colon are treated as label separators. A label with nothing after
          it is dropped instead of producing an empty output line.

     Args:
         text: Raw text to clean. A falsy value (None or '') yields ''.

     Returns:
         The cleaned lines joined with newline characters.
     """
     if not text:
         return ''
     # Phrases that introduce explanatory sentences rather than recognized text.
     explanatory_prefixes = (
         '图片中的文字', '识别结果', '文字内容', '根据图片', '这张图片', '该图片',
     )
     # Words that identify the part before the colon as a label.
     label_keywords = (
         '标题', '正文', '内容', '文本', '题目', '段落', '文字', '主题', '副标题',
     )
     max_label_length = 10  # labels are usually short
     # Remove code-fence markers; the tagged form goes first so the bare
     # fence replacement does not leave a stray "text" behind.
     cleaned_text = text.replace('```text', '').replace('```', '').strip()
     text_lines = []
     for raw_line in cleaned_text.split('\n'):
         line = raw_line.strip()
         if not line or line.startswith(explanatory_prefixes):
             continue
         # The earliest colon of either form is the candidate label separator.
         colon_positions = [
             pos for pos in (line.find('：'), line.find(':')) if pos != -1
         ]
         if colon_positions:
             split_at = min(colon_positions)
             label = line[:split_at]
             content = line[split_at + 1:].strip()
             if len(label) <= max_label_length and any(
                 keyword in label for keyword in label_keywords
             ):
                 # Keep only the text after the label; a bare label has
                 # nothing worth emitting.
                 if content:
                     text_lines.append(content)
                 continue
         # Not a label line: keep it unchanged.
         text_lines.append(line)
     return '\n'.join(text_lines)