[init] update application file
- .gitignore +72 -0
- app.py +481 -0
- examples/page_1.pdf +3 -0
- examples/page_2.pdf +3 -0
- examples/page_3.jpeg +3 -0
- examples/page_4.png +3 -0
- examples/page_5.jpg +3 -0
- examples/page_6.jpg +3 -0
- header.html +447 -0
- inference_hugg.py +287 -0
- pyproject.toml +16 -0
- requirements.txt +14 -0
- static/styles.css +306 -0
- utils/markdown_utils.py +442 -0
- utils/utils.py +367 -0
.gitignore
ADDED
@@ -0,0 +1,72 @@
+# Python-related
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Environment files
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Editor files
+.vscode/
+.idea/
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
+
+# Logs and databases
+*.log
+*.sqlite
+*.db
+
+# OS files
+.DS_Store
+Thumbs.db
+
+# Test artifacts
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Output files
+*.csv
+*.json
+*.xlsx
+# *.pdf
+out/
+output/
+
+# Jupyter notebooks
+.ipynb_checkpoints
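One detail worth noticing: the *.pdf pattern is commented out, presumably so that the example PDFs under examples/ can be committed (via Git LFS, see below), while generated outputs such as *.csv, *.json, and *.xlsx stay ignored.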
app.py
ADDED
@@ -0,0 +1,481 @@
+import os
+import tempfile
+import time
+import uuid
+
+import cv2
+import gradio as gr
+import pymupdf
+import spaces
+import torch
+from gradio_pdf import PDF
+from loguru import logger
+from PIL import Image
+from transformers import AutoProcessor, VisionEncoderDecoderModel
+
+from utils.utils import prepare_image, parse_layout_string, process_coordinates, ImageDimensions
+
+# Load the external CSS file
+def load_css():
+    css_path = os.path.join(os.path.dirname(__file__), "static", "styles.css")
+    if os.path.exists(css_path):
+        with open(css_path, "r", encoding="utf-8") as f:
+            return f.read()
+    return ""
+
+# Global variables holding the model
+model = None
+processor = None
+tokenizer = None
+
+# Initialize the model automatically
+@spaces.GPU
+def initialize_model():
+    """Initialize the Hugging Face model"""
+    global model, processor, tokenizer
+
+    if model is None:
+        logger.info("Loading DOLPHIN model...")
+        model_id = "ByteDance/Dolphin"
+
+        # Load the processor and model
+        processor = AutoProcessor.from_pretrained(model_id)
+        model = VisionEncoderDecoderModel.from_pretrained(model_id)
+        model.eval()
+
+        # Set device and precision
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model.to(device)
+        model = model.half()  # Use half precision
+
+        # Set the tokenizer
+        tokenizer = processor.tokenizer
+
+        logger.info(f"Model loaded successfully on {device}")
+
+    return "Model ready"
+
+# Initialize the model automatically at startup
+logger.info("Initializing model at startup...")
+try:
+    initialize_model()
+    logger.info("Model initialization completed")
+except Exception as e:
+    logger.error(f"Model initialization failed: {e}")
+    # The model will be re-initialized on first use
+
+# Model inference function
+@spaces.GPU
+def model_chat(prompt, image):
+    """Run inference with the model"""
+    global model, processor, tokenizer
+
+    # Make sure the model is initialized
+    if model is None:
+        initialize_model()
+
+    # Check whether this is a batch
+    is_batch = isinstance(image, list)
+
+    if not is_batch:
+        images = [image]
+        prompts = [prompt]
+    else:
+        images = image
+        prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
+
+    # Prepare the images
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    batch_inputs = processor(images, return_tensors="pt", padding=True)
+    batch_pixel_values = batch_inputs.pixel_values.half().to(device)
+
+    # Prepare the prompts
+    prompts = [f"<s>{p} <Answer/>" for p in prompts]
+    batch_prompt_inputs = tokenizer(
+        prompts,
+        add_special_tokens=False,
+        return_tensors="pt"
+    )
+
+    batch_prompt_ids = batch_prompt_inputs.input_ids.to(device)
+    batch_attention_mask = batch_prompt_inputs.attention_mask.to(device)
+
+    # Generate text
+    outputs = model.generate(
+        pixel_values=batch_pixel_values,
+        decoder_input_ids=batch_prompt_ids,
+        decoder_attention_mask=batch_attention_mask,
+        min_length=1,
+        max_length=4096,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        use_cache=True,
+        bad_words_ids=[[tokenizer.unk_token_id]],
+        return_dict_in_generate=True,
+        do_sample=False,
+        num_beams=1,
+        repetition_penalty=1.1
+    )
+
+    # Process the output
+    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)
+
+    # Strip the prompt text from the output
+    results = []
+    for i, sequence in enumerate(sequences):
+        cleaned = sequence.replace(prompts[i], "").replace("<pad>", "").replace("</s>", "").strip()
+        results.append(cleaned)
+
+    # Return a single result or the batch of results
+    if not is_batch:
+        return results[0]
+    return results
+
+# Process a batch of elements
+@spaces.GPU
+def process_element_batch(elements, prompt, max_batch_size=16):
+    """Process a batch of elements of the same type"""
+    results = []
+
+    # Determine the batch size
+    batch_size = min(len(elements), max_batch_size)
+
+    # Process in batches
+    for i in range(0, len(elements), batch_size):
+        batch_elements = elements[i:i+batch_size]
+        crops_list = [elem["crop"] for elem in batch_elements]
+
+        # Use the same prompt for all elements
+        prompts_list = [prompt] * len(crops_list)
+
+        # Batch inference
+        batch_results = model_chat(prompts_list, crops_list)
+
+        # Collect the results
+        for j, result in enumerate(batch_results):
+            elem = batch_elements[j]
+            results.append({
+                "label": elem["label"],
+                "bbox": elem["bbox"],
+                "text": result.strip(),
+                "reading_order": elem["reading_order"],
+            })
+
+    return results
+
+# Clean up temporary files
+def cleanup_temp_file(file_path):
+    """Safely delete a temporary file"""
+    try:
+        if file_path and os.path.exists(file_path):
+            os.unlink(file_path)
+    except Exception as e:
+        logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
+
+def to_pdf(file_path):
+    """Convert the input file to PDF format"""
+    if file_path is None:
+        return None
+
+    with pymupdf.open(file_path) as f:
+        if f.is_pdf:
+            return file_path
+        else:
+            pdf_bytes = f.convert_to_pdf()
+            # Use a temporary file rather than saving to disk
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
+                tmp_file.write(pdf_bytes)
+                return tmp_file.name
+
+@spaces.GPU(duration=120)
+def process_document(file_path):
+    """Main document-processing function, integrating the full inference logic"""
+    if file_path is None:
+        return "", "", {}, {}
+
+    start_time = time.time()
+    original_file_path = file_path
+
+    # Make sure the model is initialized
+    if model is None:
+        initialize_model()
+
+    # Convert to PDF (if needed)
+    converted_file_path = to_pdf(file_path)
+    temp_file_created = converted_file_path != original_file_path
+
+    try:
+        logger.info(f"Processing document: {file_path}")
+
+        # Process the page
+        recognition_results = process_page(converted_file_path)
+
+        # Generate the Markdown content
+        md_content = generate_markdown(recognition_results)
+
+        # Compute the processing time
+        processing_time = time.time() - start_time
+
+        debug_info = {
+            "original_file": original_file_path,
+            "converted_file": converted_file_path,
+            "temp_file_created": temp_file_created,
+            "status": "success",
+            "processing_time": f"{processing_time:.2f}s",
+            "total_elements": len(recognition_results)
+        }
+
+        processing_data = {
+            "pages": [{"elements": recognition_results}],
+            "total_elements": len(recognition_results),
+            "processing_time": f"{processing_time:.2f}s"
+        }
+
+        logger.info(f"Document processed successfully in {processing_time:.2f}s")
+        return md_content, md_content, processing_data, debug_info
+
+    except Exception as e:
+        logger.error(f"Error processing document: {str(e)}")
+        error_info = {
+            "original_file": original_file_path,
+            "converted_file": converted_file_path,
+            "temp_file_created": temp_file_created,
+            "status": "error",
+            "error": str(e)
+        }
+        return f"# 处理错误\n\n处理文档时发生错误: {str(e)}", "", {}, error_info
+
+    finally:
+        # Clean up the temporary file
+        if temp_file_created:
+            cleanup_temp_file(converted_file_path)
+
+def process_page(image_path):
+    """Process a single document page"""
+    # Stage 1: page-level layout parsing
+    pil_image = Image.open(image_path).convert("RGB")
+    layout_output = model_chat("Parse the reading order of this document.", pil_image)
+
+    # Stage 2: element-level content parsing
+    padded_image, dims = prepare_image(pil_image)
+    recognition_results = process_elements(layout_output, padded_image, dims)
+
+    return recognition_results
+
+def process_elements(layout_results, padded_image, dims, max_batch_size=16):
+    """Parse all document elements"""
+    layout_results = parse_layout_string(layout_results)
+
+    # Store the different element types separately
+    text_elements = []  # Text elements
+    table_elements = []  # Table elements
+    figure_results = []  # Figure elements (no processing needed)
+    previous_box = None
+    reading_order = 0
+
+    # Collect the elements to process, grouped by type
+    for bbox, label in layout_results:
+        try:
+            # Adjust the coordinates
+            x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = process_coordinates(
+                bbox, padded_image, dims, previous_box
+            )
+
+            # Crop and parse the element
+            cropped = padded_image[y1:y2, x1:x2]
+            if cropped.size > 0:
+                if label == "fig":
+                    # For figure regions, add an empty text result directly
+                    figure_results.append(
+                        {
+                            "label": label,
+                            "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                            "text": "",
+                            "reading_order": reading_order,
+                        }
+                    )
+                else:
+                    # Prepare the element for parsing
+                    pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
+                    element_info = {
+                        "crop": pil_crop,
+                        "label": label,
+                        "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                        "reading_order": reading_order,
+                    }
+
+                    # Group by type
+                    if label == "tab":
+                        table_elements.append(element_info)
+                    else:  # Text elements
+                        text_elements.append(element_info)
+
+                reading_order += 1
+
+        except Exception as e:
+            logger.error(f"Error processing bbox with label {label}: {str(e)}")
+            continue
+
+    # Initialize the results list
+    recognition_results = figure_results.copy()
+
+    # Process the text elements (in batches)
+    if text_elements:
+        text_results = process_element_batch(text_elements, "Read text in the image.", max_batch_size)
+        recognition_results.extend(text_results)
+
+    # Process the table elements (in batches)
+    if table_elements:
+        table_results = process_element_batch(table_elements, "Parse the table in the image.", max_batch_size)
+        recognition_results.extend(table_results)
+
+    # Sort by reading order
+    recognition_results.sort(key=lambda x: x.get("reading_order", 0))
+
+    return recognition_results
+
+def generate_markdown(recognition_results):
+    """Generate Markdown content from the recognition results"""
+    markdown_parts = []
+
+    for result in recognition_results:
+        text = result.get("text", "").strip()
+        label = result.get("label", "")
+
+        if text:
+            if label == "tab":
+                # Table content
+                markdown_parts.append(f"\n{text}\n")
+            else:
+                # Regular text content
+                markdown_parts.append(text)
+
+    return "\n\n".join(markdown_parts)
+
+# LaTeX rendering configuration
+latex_delimiters = [
+    {"left": "$$", "right": "$$", "display": True},
+    {"left": "$", "right": "$", "display": False},
+    {"left": "\\[", "right": "\\]", "display": True},
+    {"left": "\\(", "right": "\\)", "display": False},
+]

+# Load the custom CSS
+custom_css = load_css()
+
+# Read the page header
+with open("header.html", "r", encoding="utf-8") as file:
+    header = file.read()
+
+# Build the Gradio interface
+with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
+    gr.HTML(header)
+
+    with gr.Row():
+        # Sidebar: file upload and controls
+        with gr.Column(scale=1, elem_classes="sidebar"):
+            # File upload component
+            file = gr.File(
+                label="Choose PDF or image file",
+                file_types=[".pdf", ".png", ".jpeg", ".jpg"],
+                elem_id="file-upload"
+            )
+
+            gr.HTML("选择文件后,点击处理按钮开始解析<br>After selecting the file, click the Process button to start parsing")
+
+            with gr.Row(elem_classes="action-buttons"):
+                submit_btn = gr.Button("处理文档/Process Document", variant="primary")
+                clear_btn = gr.ClearButton(value="清空/Clear")
+
+            # Processing status display
+            status_display = gr.Textbox(
+                label="Processing Status",
+                value="Ready to process documents",
+                interactive=False,
+                max_lines=2
+            )
+
+            # Example files
+            example_root = os.path.join(os.path.dirname(__file__), "examples")
+            if os.path.exists(example_root):
+                gr.HTML("示例文件/Example Files")
+                example_files = [
+                    os.path.join(example_root, f)
+                    for f in os.listdir(example_root)
+                    if not f.endswith(".py")
+                ]
+
+                examples = gr.Examples(
+                    examples=example_files,
+                    inputs=file,
+                    examples_per_page=10,
+                    elem_id="example-files"
+                )
+
+        # Main content area
+        with gr.Column(scale=7):
+            with gr.Row(elem_classes="main-content"):
+                # Preview panel
+                with gr.Column(scale=1, elem_classes="preview-panel"):
+                    gr.HTML("文件预览/Preview")
+                    pdf_show = PDF(label="", interactive=False, visible=True, height=600)
+                    debug_output = gr.JSON(label="Debug Info", height=100)
+
+                # Output panel
+                with gr.Column(scale=1, elem_classes="output-panel"):
+                    with gr.Tabs():
+                        with gr.Tab("Markdown [Render]"):
+                            md_render = gr.Markdown(
+                                label="",
+                                height=700,
+                                show_copy_button=True,
+                                latex_delimiters=latex_delimiters,
+                                line_breaks=True,
+                            )
+                        with gr.Tab("Markdown [Content]"):
+                            md_content = gr.TextArea(lines=30, show_copy_button=True)
+                        with gr.Tab("Processing Data"):
+                            json_output = gr.JSON(label="", height=700)
+
+    # Event handling
+    file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
+
+    # Document processing
+    def process_with_status(file_path):
+        """Process the document and update the status"""
+        if file_path is None:
+            return "", "", {}, {}, "Please select a file first"
+
+        # Set the status to "processing"
+        status = "Processing document..."
+
+        # Run the document processing
+        md_render_result, md_content_result, json_result, debug_result = process_document(file_path)
+
+        # Update the completion status
+        if "错误" in md_render_result:
+            status = "Processing failed - see debug info"
+        else:
+            status = "Processing completed successfully"
+
+        return md_render_result, md_content_result, json_result, debug_result, status
+
+    submit_btn.click(
+        fn=process_with_status,
+        inputs=[file],
+        outputs=[md_render, md_content, json_output, debug_output, status_display],
+    )
+
+    # Clear all content
+    def reset_all():
+        return None, None, "", "", {}, {}, "Ready to process documents"
+
+    clear_btn.click(
+        fn=reset_all,
+        inputs=[],
+        outputs=[file, pdf_show, md_render, md_content, json_output, debug_output, status_display]
+    )
+
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
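app.py wires Dolphin's two-stage, analyze-then-parse pipeline into the Gradio UI: process_page first asks the model for the page layout and reading order, then crops every detected element and parses text and table crops in parallel batches via process_element_batch. The same functions can be driven headlessly; the following is a minimal sketch, an assumption rather than part of this commit (note that importing app.py runs initialize_model() and builds the Blocks UI at import time, so all the files in this commit must be present):

    from app import initialize_model, process_page, generate_markdown

    initialize_model()                               # loads ByteDance/Dolphin once
    elements = process_page("examples/page_3.jpeg")  # stage 1 layout + stage 2 content
    print(generate_markdown(elements))               # flatten parsed elements into Markdown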
examples/page_1.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8984e6b0bffa46e13809b4969e2be559df89e2cf9d6b3d7fb1a78f25aed8e570
+size 1523572
examples/page_2.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4f4785470676739e2998f04bfc8daaf2e7ae227bf374614f07821ec5a315143
+size 1478409
examples/page_3.jpeg
ADDED
Git LFS Details
examples/page_4.png
ADDED
Git LFS Details
examples/page_5.jpg
ADDED
Git LFS Details
examples/page_6.jpg
ADDED
Git LFS Details
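All six example assets are stored with Git LFS, so the repository itself only carries small pointer files (spec version, sha256 oid, and byte size, as shown for the two PDFs above). When cloning the Space locally, run git lfs install once and git lfs pull inside the clone to materialize the real PDFs and images; otherwise the examples gallery in app.py will point at pointer text files rather than documents.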
header.html
ADDED
@@ -0,0 +1,447 @@
+<!DOCTYPE html>
+<html lang="en" style="color-scheme: light;">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta name="color-scheme" content="light">
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
+    <style>
+        :root {
+            /* Primary palette */
+            --primary-color: #dceaf6;
+            --primary-light: #f8f9fa;
+            --primary-dark: #9ec9e3;
+
+            /* Accent palette */
+            --accent-color: #bfe2f8;
+            --accent-light: #dceaf6;
+
+            /* Background colors */
+            --bg-color: #e8eff5;
+            --card-bg: #ffffff;
+
+            /* Text colors */
+            --dark-text: #2b2d42;
+            --light-text: #f8f9fa;
+            --muted-text: rgba(43, 45, 66, 0.7);
+
+            /* Borders and shadows */
+            --border-color: rgba(168, 168, 168, 0.432);
+            --card-shadow: 0 4px 20px rgba(104, 104, 104, 0.1);
+
+            /* Interaction states */
+            --hover-bg: rgba(255, 255, 255, 0.5);
+            --active-color: #bfe2f8;
+        }
+
+        .header-container {
+            display: flex;
+            flex-direction: row;
+            justify-content: space-between;
+            align-items: flex-start;
+            background: linear-gradient(135deg,
+                #e4deff 0%,
+                #d8f7ff 100%
+            );
+            padding: 1.8rem;
+            border-radius: 12px;
+            margin-bottom: 1.5rem;
+            box-shadow: var(--card-shadow);
+            position: relative;
+            overflow: hidden;
+        }
+
+        .header-container::before {
+            content: '';
+            position: absolute;
+            top: 0;
+            left: 0;
+            right: 0;
+            bottom: 0;
+            background: linear-gradient(135deg,
+                rgba(255, 255, 255, 0.2) 0%,
+                rgba(255, 255, 255, 0) 100%
+            );
+            pointer-events: none;
+        }
+
+        .header-content {
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            text-align: center;
+            max-width: 100%;
+            width: 100%;
+        }
+
+        .header-buttons {
+            display: none;
+        }
+
+        .logo-title-container {
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            margin-bottom: 1.5rem;
+            max-width: 100%;
+            text-align: center;
+        }
+
+        .logo {
+            width: 350px;
+            height: auto;
+            margin-bottom: 1rem;
+            margin-right: 0;
+        }
+
+        .header-title {
+            font-size: 2.2rem;
+            font-weight: 700;
+            color: var(--dark-text);
+            margin: 0;
+            font-family: 'Poppins', 'Segoe UI', sans-serif;
+            line-height: 1.2;
+            text-align: center;
+            max-width: 100%;
+        }
+
+        .header-subtitle {
+            font-size: 1.1rem;
+            color: var(--muted-text);
+            margin: 0 0 1.5rem 0;
+            line-height: 1.6;
+            max-width: 100%;
+            text-align: center;
+            margin-left: auto;
+            margin-right: auto;
+        }
+
+        .link-button {
+            display: flex;
+            align-items: center;
+            padding: 0.7rem 1.2rem;
+            background-color: var(--hover-bg);
+            border-radius: 8px;
+            color: var(--dark-text) !important;
+            text-decoration: none !important;
+            font-weight: 700;
+            font-size: 1.1rem;
+            transition: all 0.3s ease;
+            backdrop-filter: blur(5px);
+            border: 1px solid var(--border-color);
+            width: 100%;
+            margin-bottom: 0.5rem;
+        }
+
+        .link-button:hover {
+            background-color: var(--hover-bg);
+            transform: translateY(-2px);
+            box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
+            text-decoration: none !important;
+            color: var(--dark-text) !important;
+        }
+
+        .link-button i {
+            margin-right: 0.8rem;
+            font-size: 1.2rem;
+            color: var(--primary-dark);
+            min-width: 20px;
+            text-align: center;
+        }
+
+        .link-button * {
+            text-decoration: none !important;
+            color: inherit !important;
+        }
+
+        .feature-grid {
+            display: flex;
+            flex-direction: row;
+            align-items: flex-start;
+            justify-content: center;
+            margin-top: 1.5rem;
+            width: 100%;
+            margin-left: auto;
+            margin-right: auto;
+        }
+
+        .feature-card {
+            flex: 1;
+            padding: 1rem 1rem;
+            background-color: transparent;
+            border: none;
+            box-shadow: none;
+            transition: none;
+            text-align: center;
+            position: relative;
+        }
+
+        .feature-card:hover {
+            transform: none;
+            box-shadow: none;
+        }
+
+        .feature-separator {
+            width: 1px;
+            align-self: stretch;
+            background-color: var(--border-color);
+            margin: 0 1rem;
+        }
+
+        .feature-icon {
+            font-size: 2rem;
+            color: var(--primary-dark);
+            margin-bottom: 1rem;
+        }
+
+        .feature-title {
+            font-weight: 600;
+            color: var(--dark-text);
+            margin-bottom: 0.8rem;
+            font-size: 1.2rem;
+        }
+
+        .feature-desc {
+            font-size: 0.85rem;
+            color: var(--muted-text);
+            line-height: 1.5;
+        }
+
+        /* New navigation button styles */
+        .nav-buttons {
+            display: flex;
+            flex-direction: row;
+            align-items: center;
+            justify-content: center;
+            margin-top: 1rem;
+            margin-bottom: 2rem;
+            background-color: rgba(255, 255, 255, 0.7);
+            border-radius: 12px;
+            border: 1px solid var(--border-color);
+            padding: 0.5rem 1rem;
+            max-width: none;
+            width: auto;
+            align-self: center;
+            margin-left: auto;
+            margin-right: auto;
+        }
+
+        .nav-link {
+            display: flex;
+            align-items: center;
+            padding: 0.5rem 1rem;
+            color: var(--dark-text) !important;
+            text-decoration: none !important;
+            font-weight: 600;
+            font-size: 1rem;
+            transition: all 0.3s ease;
+        }
+
+        .nav-link:hover {
+            transform: translateY(-3px);
+            color: var(--primary-dark) !important;
+            background-color: rgba(255, 255, 255, 0.8);
+        }
+
+        .nav-link i {
+            margin-right: 0.5rem;
+            font-size: 1.1rem;
+            color: var(--primary-dark);
+        }
+
+        .nav-separator {
+            height: 20px;
+            width: 1px;
+            background-color: var(--border-color);
+            margin: 0 0.5rem;
+        }
+
+        @media (max-width: 960px) {
+            .header-container {
+                flex-direction: column;
+                padding: 1.5rem;
+            }
+
+            .header-content {
+                max-width: 100%;
+                margin-bottom: 2rem;
+            }
+
+            .header-buttons {
+                width: 100%;
+                margin-left: 0;
+            }
+
+            .logo-title-container {
+                flex-direction: column;
+                align-items: center;
+            }
+
+            .logo {
+                width: 250px;
+                margin-bottom: 1rem;
+                margin-right: 0;
+            }
+
+            .header-title {
+                font-size: 1.8rem;
+            }
+
+            .feature-grid {
+                flex-direction: column;
+            }
+
+            .feature-card {
+                width: 100%;
+                padding: 1rem 0;
+            }
+
+            .feature-separator {
+                width: 100%;
+                height: 1px;
+                margin: 0.5rem 0;
+            }
+
+            .nav-buttons {
+                flex-wrap: wrap;
+                justify-content: center;
+            }
+
+            .nav-link {
+                padding: 0.5rem;
+                font-size: 0.9rem;
+            }
+
+            .nav-separator {
+                display: none;
+            }
+
+            .feature-desc {
+                font-size: 0.9rem;
+            }
+        }
+
+        /* Styles that disable dark mode */
+        @media (prefers-color-scheme: dark) {
+            /* Force light-mode colors */
+            .header-container,
+            .header-content,
+            .logo-title-container,
+            .header-title,
+            .header-subtitle,
+            .nav-buttons,
+            .nav-link,
+            .feature-grid,
+            .feature-card,
+            .feature-title,
+            .feature-desc,
+            body,
+            * {
+                color-scheme: light !important; /* force the light color scheme */
+                color: var(--dark-text) !important; /* force dark text */
+                background-color: initial; /* keep the original background color */
+            }
+        }
+
+        /* Global override to make sure all text uses the colors specified here */
+        body, p, h1, h2, h3, h4, h5, h6, span, div, a {
+            color: var(--dark-text) !important;
+        }
+
+        .feature-desc {
+            color: var(--muted-text) !important;
+        }
+
+        /* Make sure icon colors are not affected by dark mode either */
+        .feature-icon i, .nav-link i {
+            color: var(--primary-dark) !important;
+        }
+
+        /* Nav link hover effect */
+        .nav-link:hover {
+            color: var(--primary-dark) !important;
+        }
+    </style>
+</head>
+<body>
+    <div class="header-container">
+        <div class="header-content">
+            <div class="logo-title-container">
+                <img src="https://raw.githubusercontent.com/bytedance/Dolphin/master/assets/dolphin.png" alt="Dolphin Logo" class="logo">
+                <h1 class="header-title">Document Image Parsing via Heterogeneous Anchor Prompting</h1>
+            </div>
+
+            <p class="header-subtitle">
+                A novel multimodal document image parsing model, following an analyze-then-parse paradigm for parallel decoding
+                <!-- <br>
+                Stage 1: Comprehensive page-level layout analysis by generating element sequence in natural reading order.
+                <br>
+                Stage 2: Efficient parallel parsing of document elements using heterogeneous anchors and task-specific prompts. -->
+            </p>
+
+            <!-- New navigation buttons -->
+            <div class="nav-buttons">
+                <!-- <a href="https://mineru.org.cn/home?source=huggingface" class="nav-link">
+                    <i class="fas fa-home"></i> 主页/Homepage
+                </a> -->
+                <!-- <div class="nav-separator"></div> -->
+                <a href="https://arxiv.org/abs/2505.14059" class="nav-link">
+                    <i class="fas fa-file-alt"></i> 论文/Paper
+                </a>
+                <div class="nav-separator"></div>
+                <a href="https://huggingface.co/ByteDance/Dolphin" class="nav-link">
+                    <i class="fas fa-cube"></i> 模型/Model
+                </a>
+                <div class="nav-separator"></div>
+                <a href="https://github.com/bytedance/Dolphin" class="nav-link">
+                    <i class="fas fa-code"></i> 代码/Code
+                </a>
+                <div class="nav-separator"></div>
+                <a href="https://opensource.org/licenses/MIT" class="nav-link">
+                    <i class="fas fa-balance-scale"></i> 许可证/License
+                </a>
+            </div>
+
+            <div class="feature-grid">
+                <div class="feature-card">
+                    <div class="feature-icon"><i class="fas fa-file-import"></i></div>
+                    <div class="feature-title">支持格式/Support Format</div>
+                    <div class="feature-desc">支持多页PDF、单页图像<br>Multi-page PDF, single document image (JPEG/PNG)</div>
+                </div>
+
+                <div class="feature-separator"></div>
+
+                <div class="feature-card">
+                    <div class="feature-icon"><i class="fas fa-feather-alt"></i></div>
+                    <div class="feature-title">轻量级模型/Lightweight Model</div>
+                    <div class="feature-desc">Dolphin模型参数量322M,高效易部署<br>Lightweight (322M) and efficient, easy to deploy</div>
+                </div>
+
+                <div class="feature-separator"></div>
+
+                <div class="feature-card">
+                    <div class="feature-icon"><i class="fas fa-tasks"></i></div>
+                    <div class="feature-title">并行解析/Parallel Parsing</div>
+                    <div class="feature-desc">Dolphin并行解析多个文本块<br>Parsing several text blocks in a batch for speed up</div>
+                </div>
+
+                <div class="feature-separator"></div>
+
+                <div class="feature-card">
+                    <div class="feature-icon"><i class="fas fa-superscript"></i></div>
+                    <div class="feature-title">公式和表格/Formula and Table</div>
+                    <div class="feature-desc">支持公式(LaTeX格式)、表格(HTML格式)输出<br>Support formulas (LaTeX format) and tables (HTML format)</div>
+                </div>
+            </div>
+
+            <!-- Disclaimer -->
+            <p style="
+                font-size: 0.8rem;
+                color: var(--muted-text) !important;
+                margin-top: 1.5rem;
+                text-align: center;
+            ">内容由 AI 生成,请仔细甄别</p>
+        </div>
+    </div>
+</body>
+</html>
inference_hugg.py
ADDED
@@ -0,0 +1,287 @@
+"""
+Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+SPDX-License-Identifier: MIT
+"""
+
+import argparse
+import glob
+import os
+
+import cv2
+import torch
+from PIL import Image
+from transformers import AutoProcessor, VisionEncoderDecoderModel
+
+from utils.utils import *
+
+
+class DOLPHIN:
+    def __init__(self, model_id_or_path):
+        """Initialize the Hugging Face model
+
+        Args:
+            model_id_or_path: Path to local model or Hugging Face model ID
+        """
+        # Load model from local path or Hugging Face hub
+        self.processor = AutoProcessor.from_pretrained(model_id_or_path)
+        self.model = VisionEncoderDecoderModel.from_pretrained(model_id_or_path)
+        self.model.eval()
+
+        # Set device and precision
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model.to(self.device)
+        self.model = self.model.half()  # Always use half precision by default
+
+        # set tokenizer
+        self.tokenizer = self.processor.tokenizer
+
+    def chat(self, prompt, image):
+        """Process an image or batch of images with the given prompt(s)
+
+        Args:
+            prompt: Text prompt or list of prompts to guide the model
+            image: PIL Image or list of PIL Images to process
+
+        Returns:
+            Generated text or list of texts from the model
+        """
+        # Check if we're dealing with a batch
+        is_batch = isinstance(image, list)
+
+        if not is_batch:
+            # Single image, wrap it in a list for consistent processing
+            images = [image]
+            prompts = [prompt]
+        else:
+            # Batch of images
+            images = image
+            prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
+
+        # Prepare image
+        batch_inputs = self.processor(images, return_tensors="pt", padding=True)
+        batch_pixel_values = batch_inputs.pixel_values.half().to(self.device)
+
+        # Prepare prompt
+        prompts = [f"<s>{p} <Answer/>" for p in prompts]
+        batch_prompt_inputs = self.tokenizer(
+            prompts,
+            add_special_tokens=False,
+            return_tensors="pt"
+        )
+
+        batch_prompt_ids = batch_prompt_inputs.input_ids.to(self.device)
+        batch_attention_mask = batch_prompt_inputs.attention_mask.to(self.device)
+
+        # Generate text
+        outputs = self.model.generate(
+            pixel_values=batch_pixel_values,
+            decoder_input_ids=batch_prompt_ids,
+            decoder_attention_mask=batch_attention_mask,
+            min_length=1,
+            max_length=4096,
+            pad_token_id=self.tokenizer.pad_token_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+            use_cache=True,
+            bad_words_ids=[[self.tokenizer.unk_token_id]],
+            return_dict_in_generate=True,
+            do_sample=False,
+            num_beams=1,
+            repetition_penalty=1.1
+        )
+
+        # Process output
+        sequences = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)
+
+        # Clean prompt text from output
+        results = []
+        for i, sequence in enumerate(sequences):
+            cleaned = sequence.replace(prompts[i], "").replace("<pad>", "").replace("</s>", "").strip()
+            results.append(cleaned)
+
+        # Return a single result for single image input
+        if not is_batch:
+            return results[0]
+        return results
+
+
+def process_page(image_path, model, save_dir, max_batch_size=None):
+    """Parse document images with two stages"""
+    # Stage 1: Page-level layout and reading order parsing
+    pil_image = Image.open(image_path).convert("RGB")
+    layout_output = model.chat("Parse the reading order of this document.", pil_image)
+
+    # Stage 2: Element-level content parsing
+    padded_image, dims = prepare_image(pil_image)
+    recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)
+
+    # Save outputs
+    json_path = save_outputs(recognition_results, image_path, save_dir)
+
+    return json_path, recognition_results
+
+
+def process_elements(layout_results, padded_image, dims, model, max_batch_size=None):
+    """Parse all document elements with parallel decoding"""
+    layout_results = parse_layout_string(layout_results)
+
+    # Store text and table elements separately
+    text_elements = []  # Text elements
+    table_elements = []  # Table elements
+    figure_results = []  # Image elements (no processing needed)
+    previous_box = None
+    reading_order = 0
+
+    # Collect elements to process and group by type
+    for bbox, label in layout_results:
+        try:
+            # Adjust coordinates
+            x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = process_coordinates(
+                bbox, padded_image, dims, previous_box
+            )
+
+            # Crop and parse element
+            cropped = padded_image[y1:y2, x1:x2]
+            if cropped.size > 0:
+                if label == "fig":
+                    # For figure regions, add empty text result immediately
+                    figure_results.append(
+                        {
+                            "label": label,
+                            "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                            "text": "",
+                            "reading_order": reading_order,
+                        }
+                    )
+                else:
+                    # Prepare element for parsing
+                    pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
+                    element_info = {
+                        "crop": pil_crop,
+                        "label": label,
+                        "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                        "reading_order": reading_order,
+                    }
+
+                    # Group by type
+                    if label == "tab":
+                        table_elements.append(element_info)
+                    else:  # Text elements
+                        text_elements.append(element_info)
+
+                reading_order += 1
+
+        except Exception as e:
+            print(f"Error processing bbox with label {label}: {str(e)}")
+            continue
+
+    # Initialize results list
+    recognition_results = figure_results.copy()
+
+    # Process text elements (in batches)
+    if text_elements:
+        text_results = process_element_batch(text_elements, model, "Read text in the image.", max_batch_size)
+        recognition_results.extend(text_results)
+
+    # Process table elements (in batches)
+    if table_elements:
+        table_results = process_element_batch(table_elements, model, "Parse the table in the image.", max_batch_size)
+        recognition_results.extend(table_results)
+
+    # Sort elements by reading order
+    recognition_results.sort(key=lambda x: x.get("reading_order", 0))
+
+    return recognition_results
+
+
+def process_element_batch(elements, model, prompt, max_batch_size=None):
+    """Process elements of the same type in batches"""
+    results = []
+
+    # Determine batch size
+    batch_size = len(elements)
+    if max_batch_size is not None and max_batch_size > 0:
+        batch_size = min(batch_size, max_batch_size)
+
+    # Process in batches
+    for i in range(0, len(elements), batch_size):
+        batch_elements = elements[i:i+batch_size]
+        crops_list = [elem["crop"] for elem in batch_elements]
+
+        # Use the same prompt for all elements in the batch
+        prompts_list = [prompt] * len(crops_list)
+
+        # Batch inference
+        batch_results = model.chat(prompts_list, crops_list)
+
+        # Add results
+        for j, result in enumerate(batch_results):
+            elem = batch_elements[j]
+            results.append({
+                "label": elem["label"],
+                "bbox": elem["bbox"],
+                "text": result.strip(),
+                "reading_order": elem["reading_order"],
+            })
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
+    parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        default=None,
+        help="Directory to save parsing results (default: same as input directory)",
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=16,
+        help="Maximum number of document elements to parse in a single batch (default: 16)",
+    )
+    args = parser.parse_args()
+
+    # Load Model
+    model = DOLPHIN("ByteDance/Dolphin")
+
+    # Collect Document Images
+    if os.path.isdir(args.input_path):
+        image_files = []
+        for ext in [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]:
+            image_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
+        image_files = sorted(image_files)
+    else:
+        if not os.path.exists(args.input_path):
+            raise FileNotFoundError(f"Input path {args.input_path} does not exist")
+        image_files = [args.input_path]
+
+    save_dir = args.save_dir or (
+        args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
+    )
+    setup_output_dirs(save_dir)
+
+    total_samples = len(image_files)
+    print(f"\nTotal samples to process: {total_samples}")
+
+    # Process All Document Images
+    for image_path in image_files:
+        print(f"\nProcessing {image_path}")
+        try:
+            json_path, recognition_results = process_page(
+                image_path=image_path,
+                model=model,
+                save_dir=save_dir,
+                max_batch_size=args.max_batch_size,
+            )
+
+            print(f"Processing completed. Results saved to {save_dir}")
+
+        except Exception as e:
+            print(f"Error processing {image_path}: {str(e)}")
+            continue
+
+
+if __name__ == "__main__":
+    main()
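Besides the CLI (python inference_hugg.py --input_path ./demo --max_batch_size 16, with flags as declared in main() above), inference_hugg.py works as a library: DOLPHIN bundles the processor, tokenizer, and half-precision VisionEncoderDecoderModel behind a single chat(prompt, image) call that accepts one PIL image or a batch. A minimal sketch of direct use (an assumption, not part of this commit; it presumes the ByteDance/Dolphin weights can be fetched from the Hub and that the example image has been pulled via LFS):

    from PIL import Image
    from inference_hugg import DOLPHIN

    model = DOLPHIN("ByteDance/Dolphin")
    page = Image.open("examples/page_5.jpg").convert("RGB")

    # Stage 1 prompt used throughout this commit: layout and reading order
    layout = model.chat("Parse the reading order of this document.", page)
    print(layout)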
pyproject.toml
ADDED
@@ -0,0 +1,16 @@
+[tool.black]
+line-length = 120
+include = '\.pyi?$'
+exclude = '''
+/(
+    \.git
+  | \.hg
+  | \.mypy_cache
+  | \.tox
+  | \.venv
+  | _build
+  | buck-out
+  | build
+  | dist
+)/
+'''
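black picks up the [tool.black] table from pyproject.toml automatically, so a plain black . from the repository root formats everything to the 120-character line length while skipping the excluded directories listed above.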
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+gradio==5.24.0
+gradio_pdf==0.0.22
+pymupdf==1.25.5
+loguru==0.7.3
+torch==2.1.0
+transformers==4.47.0
+opencv-python==4.11.0.86
+opencv-python-headless==4.5.5.64
+Pillow==9.3.0
+numpy==1.24.4
+spaces
+albumentations==1.4.0
+requests==2.32.3
+httpx==0.23.0
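These pins reproduce the Space's environment; spaces supplies the @spaces.GPU decorator used by app.py for ZeroGPU allocation. Note that opencv-python and opencv-python-headless are both pinned, at different versions; the two packages ship the same cv2 module, so installing both into one environment can shadow one with the other. A local setup is the usual pip install -r requirements.txt in a fresh virtual environment.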
static/styles.css
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+:root {
+    /* Primary colors */
+    --primary-color: #dceaf6;
+    --primary-light: #f8f9fa;
+    --primary-dark: #9ec9e3;
+
+    /* Accent colors */
+    --accent-color: #bfe2f8;
+    --accent-light: #dceaf6;
+
+    /* Background colors */
+    --bg-color: #e8eff5;
+    --card-bg: #ffffff;
+
+    /* Text colors */
+    --dark-text: #2b2d42;
+    --light-text: #f8f9fa;
+    --muted-text: rgba(43, 45, 66, 0.7);
+
+    /* Borders and shadows */
+    --border-color: rgba(168, 168, 168, 0.432);
+    --card-shadow: 0 4px 20px rgba(104, 104, 104, 0.1);
+
+    /* Interactive states */
+    --hover-bg: rgba(255, 255, 255, 0.5);
+    --active-color: #bfe2f8;
+}
+
+body {
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+    background-color: var(--bg-color);
+}
+
+/* Card styles */
+.gradio-container {
+    max-width: 95% !important;
+    width: 95% !important;
+    margin-left: auto !important;
+    margin-right: auto !important;
+}
+
+/* Panel styles */
+.panel {
+    border-radius: 12px !important;
+    border: 1px solid var(--border-color) !important;
+    box-shadow: var(--card-shadow) !important;
+    background-color: var(--card-bg) !important;
+    padding: 1.5rem !important;
+}
+
+/* Button styles */
+button.primary {
+    border-radius: 8px !important;
+}
+
+button {
+    border-radius: 8px !important;
+    border: 1px solid var(--border-color) !important;
+    background-color: var(--hover-bg) !important;
+    color: var(--dark-text) !important;
+    transition: all 0.3s ease !important;
+}
+
+button:hover {
+    transform: translateY(-2px) !important;
+    box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1) !important;
+    background-color: var(--hover-bg) !important;
+}
+
+/* File upload area */
+.file-preview {
+    border-radius: 8px !important;
+    border: 1px dashed var(--border-color) !important;
+}
+
+.file-preview:hover {
+    border-color: var(--primary-dark) !important;
+}
+
+/* Make sure all link buttons are styled correctly */
+.header-buttons a,
+.header-buttons a:hover,
+.header-buttons a:visited,
+.header-buttons a:active {
+    text-decoration: none !important;
+    color: var(--dark-text) !important;
+}
+
+/* Override any possible inline styles */
+.header-buttons a[style] {
+    text-decoration: none !important;
+    color: var(--dark-text) !important;
+}
+
+/* Ensure no element inside a link is underlined */
+.header-buttons a *,
+.header-buttons a:hover * {
+    text-decoration: none !important;
+}
+
+/* Hide the page footer */
+footer, .footer, .footer-links, .gradio-footer {
+    display: none !important;
+}
+
+/* Hide the bottom toolbar */
+.gradio-container > div:last-child {
+    display: none !important;
+}
+
+/* Hide the bottom API and settings buttons */
+.fixed-bottom {
+    display: none !important;
+}
+
+/* Hide Gradio branding */
+.gr-prose p:last-child {
+    display: none !important;
+}
+
+/* Hide any remaining bottom elements */
+[class*="footer"], [id*="footer"], [class*="bottom-bar"], [id*="bottom-bar"] {
+    display: none !important;
+}
+
+/* Sidebar styles */
+.sidebar {
+    background-color: var(--card-bg);
+    border-radius: 12px;
+    border: 1px solid var(--border-color);
+    box-shadow: var(--card-shadow);
+    padding: 1rem;
+    margin-right: 1rem;
+}
+
+/* Upload button styles */
+.upload-button {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    border: 2px dashed var(--border-color);
+    padding: 1rem;
+    margin-bottom: 1rem;
+    cursor: pointer;
+    transition: all 0.3s ease;
+}
+
+.upload-button:hover {
+    border-color: var(--primary-dark);
+    background-color: rgba(158, 201, 227, 0.1);
+}
+
+.upload-button i {
+    font-size: 1.5rem;
+    color: var(--primary-dark);
+    margin-right: 0.5rem;
+}
+
+/* Example file list styles */
+.example-list {
+    list-style-type: none;
+    padding: 0;
+    margin: 0;
+}
+
+.example-item {
+    display: flex;
+    align-items: center;
+    padding: 0.5rem;
+    border-radius: 8px;
+    margin-bottom: 0.5rem;
+    cursor: pointer;
+    transition: all 0.3s ease;
+}
+
+.example-item:hover {
+    background-color: rgba(158, 201, 227, 0.1);
+}
+
+.example-item i {
+    font-size: 1.2rem;
+    color: var(--primary-dark);
+    margin-right: 0.5rem;
+}
+
+.example-item-name {
+    white-space: nowrap;
+    overflow: hidden;
+    text-overflow: ellipsis;
+}
+
+/* Cancel and confirm button styles */
+.action-buttons {
+    display: flex;
+    justify-content: flex-end;
+}
+
+/* Clear button styles */
+button[value="清空/Clear"] {
+    color: #e74c3c !important;
+}
+
+/* Hide the original file upload component */
+.file-upload {
+    display: none !important;
+}
+
+/* Main content styles */
+.main-content {
+    display: flex;
+    flex: 1;
+}
+
+/* Preview panel styles */
+.preview-panel {
+    flex: 1;
+    background-color: var(--card-bg);
+    border-radius: 12px;
+    border: 1px solid var(--border-color);
+    box-shadow: var(--card-shadow);
+    padding: 1rem;
+    margin-right: 1rem;
+}
+
+/* Output panel styles */
+.output-panel {
+    flex: 1;
+    background-color: var(--card-bg);
+    border-radius: 12px;
+    border: 1px solid var(--border-color);
+    box-shadow: var(--card-shadow);
+    padding: 1rem;
+}
+
+/* Responsive layout */
+@media (max-width: 768px) {
+    .main-content {
+        flex-direction: column;
+    }
+
+    .sidebar, .preview-panel, .output-panel {
+        margin-right: 0;
+        margin-bottom: 1rem;
+        width: 100%;
+    }
+}
+
+/* Polish the file upload component */
+#file-upload {
+    margin-bottom: 1.5rem;
+}
+
+#file-upload .file-preview {
+    border: 2px dashed var(--border-color);
+    padding: 1.5rem;
+    transition: all 0.3s ease;
+    text-align: center;
+}
+
+#file-upload .file-preview:hover {
+    border-color: var(--primary-dark);
+    background-color: rgba(158, 201, 227, 0.1);
+}
+
+/* Hide the original label */
+#file-upload .label-wrap {
+    display: none;
+}
+
+/* Polish the example file list */
+#example-files .gr-samples-table {
+    border: none;
+    background: transparent;
+}
+
+#example-files .gr-samples-table td {
+    border: none;
+    padding: 0.5rem;
+    transition: all 0.3s ease;
+    border-radius: 8px;
+}
+
+#example-files .gr-samples-table tr:hover td {
+    background-color: rgba(158, 201, 227, 0.1);
+}
+
+#example-files .gr-samples-table td a {
+    display: flex;
+    align-items: center;
+    color: var(--dark-text);
+    text-decoration: none;
+}
+
+#example-files .gr-samples-table td a::before {
+    content: "\f1c1";
+    font-family: "Font Awesome 6 Free";
+    font-weight: 900;
+    margin-right: 0.5rem;
+    color: var(--primary-dark);
+    font-size: 1.2rem;
+}
+
+/* Hide the pagination controls */
+#example-files .gr-samples-pagination {
+    display: none;
+}
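
A note on how this stylesheet is likely consumed: Gradio apps typically pass custom CSS through `gr.Blocks(css=...)`. Below is a minimal sketch under that assumption; the actual wiring lives in app.py and may differ.

```python
# Minimal sketch: loading static/styles.css into a Gradio app.
# Assumption: the app passes the file contents via gr.Blocks(css=...).
import gradio as gr

with open("static/styles.css", "r", encoding="utf-8") as f:
    custom_css = f.read()

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("Demo placeholder")

if __name__ == "__main__":
    demo.launch()
```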
utils/markdown_utils.py
ADDED
@@ -0,0 +1,442 @@
+"""
+Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+SPDX-License-Identifier: MIT
+"""
+
+import re
+import base64
+from typing import List, Dict, Any, Optional
+
+
+"""
+Example input:
+[
+    {"label": "tab", "bbox": [0.176, 0.74, 0.824, 0.82], "text": "<table><tr><td></td><td>HellaSwag</td><td>Obqa</td><td>WinoGrande</td><td>ARC-c</td><td>ARC-e</td><td>boolq</td><td>piqa</td><td>Avg</td></tr><tr><td>OPT-1.3B</td><td>53.65</td><td>33.40</td><td>59.59</td><td>29.44</td><td>50.80</td><td>60.83</td><td>72.36</td><td>51.44</td></tr><tr><td>Pythia-1.0B</td><td>47.16</td><td>31.40</td><td>53.43</td><td>27.05</td><td>48.99</td><td>57.83</td><td>69.21</td><td>48.30</td></tr><tr><td>Pythia-1.4B</td><td>52.01</td><td>33.20</td><td>57.38</td><td>28.50</td><td>54.00</td><td>63.27</td><td>70.95</td><td>51.33</td></tr><tr><td>TinyLlama-1.1B</td><td>59.20</td><td>36.00</td><td>59.12</td><td>30.10</td><td>55.25</td><td>57.83</td><td>73.29</td><td>52.99</td></tr></table>", "reading_order": 6},
+    {"label": "cap", "bbox": [0.28, 0.729, 0.711, 0.74], "text": "Table 2: Zero-shot performance on commonsense reasoning tasks", "reading_order": 7},
+    {"label": "para", "bbox": [0.176, 0.848, 0.826, 0.873], "text": "We of performance during training We tracked the accuracy of TinyLlama on common-\nsense reasoning benchmarks during its pre-training, as shown in Fig. 2 . Generally, the performance of", "reading_order": 8},
+    {"label": "fnote", "bbox": [0.176, 0.88, 0.824, 0.912], "text": "${ }^{4}$ Due to a bug in the config file, the learning rate did not decrease immediately after warmup and remained at\nthe maximum value for several steps before we fixed this.", "reading_order": 9},
+    {"label": "foot", "bbox": [0.496, 0.939, 0.501, 0.95], "text": "14", "reading_order": 10}
+]
+"""
+
+
+def extract_table_from_html(html_string):
+    """Extract and clean table tags from HTML string"""
+    try:
+        table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
+        tables = table_pattern.findall(html_string)
+        tables = [re.sub(r'<table[^>]*>', '<table>', table) for table in tables]
+        return '\n'.join(tables)
+    except Exception as e:
+        print(f"extract_table_from_html error: {str(e)}")
+        return f"<table><tr><td>Error extracting table: {str(e)}</td></tr></table>"
+
+
+class MarkdownConverter:
+    """Convert structured recognition results to Markdown format"""
+
+    def __init__(self):
+        # Define heading levels for different section types
+        self.heading_levels = {
+            'title': '#',
+            'sec': '##',
+            'sub_sec': '###'
+        }
+
+        # Define which labels need special handling
+        self.special_labels = {
+            'tab', 'fig', 'title', 'sec', 'sub_sec',
+            'list', 'formula', 'reference', 'alg'
+        }
+
+    def try_remove_newline(self, text: str) -> str:
+        try:
+            # Preprocess text to handle line breaks
+            text = text.strip()
+            text = text.replace('-\n', '')
+
+            # Handle Chinese text line breaks
+            def is_chinese(char):
+                return '\u4e00' <= char <= '\u9fff'
+
+            lines = text.split('\n')
+            processed_lines = []
+
+            # Process all lines except the last one
+            for i in range(len(lines) - 1):
+                current_line = lines[i].strip()
+                next_line = lines[i + 1].strip()
+
+                # Always add the current line, but determine if we need a newline
+                if current_line:  # If current line is not empty
+                    if next_line:  # If next line is not empty
+                        # For Chinese text handling
+                        if is_chinese(current_line[-1]) and is_chinese(next_line[0]):
+                            processed_lines.append(current_line)
+                        else:
+                            processed_lines.append(current_line + ' ')
+                    else:
+                        # Next line is empty, add current line with newline
+                        processed_lines.append(current_line + '\n')
+                else:
+                    # Current line is empty, add an empty line
+                    processed_lines.append('\n')
+
+            # Add the last line
+            if lines and lines[-1].strip():
+                processed_lines.append(lines[-1].strip())
+
+            text = ''.join(processed_lines)
+
+            return text
+        except Exception as e:
+            print(f"try_remove_newline error: {str(e)}")
+            return text  # Return original text on error
+
+    def _handle_text(self, text: str) -> str:
+        """
+        Process regular text content, preserving paragraph structure
+        """
+        try:
+            if not text:
+                return ""
+
+            if text.strip().startswith("\\begin{array}") and text.strip().endswith("\\end{array}"):
+                text = "$$" + text + "$$"
+            elif ("_{" in text or "^{" in text or "\\" in text or "_ {" in text or "^ {" in text) and ("$" not in text) and ("\\begin" not in text):
+                text = "$" + text + "$"
+
+            # Process formulas in text before handling other text processing
+            text = self._process_formulas_in_text(text)
+
+            text = self.try_remove_newline(text)
+
+            # Return processed text
+            return text
+        except Exception as e:
+            print(f"_handle_text error: {str(e)}")
+            return text  # Return original text on error
+
+    def _process_formulas_in_text(self, text: str) -> str:
+        """
+        Process mathematical formulas in text by iteratively finding and replacing formulas.
+        - Identify inline and block formulas
+        - Replace newlines within formulas with \\
+        """
+        try:
+            # Define formula delimiters and their corresponding patterns
+            delimiters = [
+                ('$$', '$$'),    # Block formula with $$
+                ('\\[', '\\]'),  # Block formula with \[ \]
+                ('$', '$'),      # Inline formula with $
+                ('\\(', '\\)')   # Inline formula with \( \)
+            ]
+
+            # Process the text by iterating through each delimiter type
+            result = text
+
+            for start_delim, end_delim in delimiters:
+                # Create a pattern that matches from start to end delimiter
+                # Using a custom approach to avoid issues with nested delimiters
+                current_pos = 0
+                processed_parts = []
+
+                while current_pos < len(result):
+                    # Find the next start delimiter
+                    start_pos = result.find(start_delim, current_pos)
+                    if start_pos == -1:
+                        # No more formulas of this type
+                        processed_parts.append(result[current_pos:])
+                        break
+
+                    # Add text before the formula
+                    processed_parts.append(result[current_pos:start_pos])
+
+                    # Find the matching end delimiter
+                    end_pos = result.find(end_delim, start_pos + len(start_delim))
+                    if end_pos == -1:
+                        # No matching end delimiter, treat as regular text
+                        processed_parts.append(result[start_pos:])
+                        break
+
+                    # Extract the formula content (without delimiters)
+                    formula_content = result[start_pos + len(start_delim):end_pos]
+
+                    # Process the formula content - replace newlines with \\
+                    processed_formula = formula_content.replace('\n', ' \\\\ ')
+
+                    # Add the processed formula with its delimiters
+                    processed_parts.append(f"{start_delim}{processed_formula}{end_delim}")
+
+                    # Move past this formula
+                    current_pos = end_pos + len(end_delim)
+
+                # Update the result with processed text
+                result = ''.join(processed_parts)
+            return result
+        except Exception as e:
+            print(f"_process_formulas_in_text error: {str(e)}")
+            return text  # Return original text on error
+
+    def _remove_newline_in_heading(self, text: str) -> str:
+        """
+        Remove newline in heading
+        """
+        try:
+            # Handle Chinese text line breaks
+            def is_chinese(char):
+                return '\u4e00' <= char <= '\u9fff'
+
+            # Check if the text contains Chinese characters
+            if any(is_chinese(char) for char in text):
+                return text.replace('\n', '')
+            else:
+                return text.replace('\n', ' ')
+        except Exception as e:
+            print(f"_remove_newline_in_heading error: {str(e)}")
+            return text
+
+    def _handle_heading(self, text: str, label: str) -> str:
+        """
+        Convert section headings to appropriate markdown format
+        """
+        try:
+            level = self.heading_levels.get(label, '#')
+            text = text.strip()
+            text = self._remove_newline_in_heading(text)
+            text = self._handle_text(text)
+            return f"{level} {text}\n\n"
+        except Exception as e:
+            print(f"_handle_heading error: {str(e)}")
+            return f"# Error processing heading: {text}\n\n"
+
+    def _handle_list_item(self, text: str) -> str:
+        """
+        Convert list items to markdown list format
+        """
+        try:
+            return f"- {text.strip()}\n"
+        except Exception as e:
+            print(f"_handle_list_item error: {str(e)}")
+            return f"- Error processing list item: {text}\n"
+
+    def _handle_figure(self, text: str, section_count: int) -> str:
+        """
+        Convert base64 encoded image to markdown image syntax
+        """
+        try:
+            # Determine image format (assuming PNG if not specified)
+            img_format = "png"
+            if text.startswith("data:image/"):
+                # Extract format from data URI and emit markdown image syntax,
+                # using the item's reading-order index as alt text
+                img_format = text.split(";")[0].split("/")[1]
+                return f"![Figure {section_count}]({text})\n\n"
+            elif ";" in text and "," in text:
+                # Already in data URI format
+                return f"![Figure {section_count}]({text})\n\n"
+            else:
+                # Raw base64, convert to data URI
+                data_uri = f"data:image/{img_format};base64,{text}"
+                return f"![Figure {section_count}]({data_uri})\n\n"
+        except Exception as e:
+            print(f"_handle_figure error: {str(e)}")
+            return f"*[Error processing figure: {str(e)}]*\n\n"
+
+    def _handle_table(self, text: str) -> str:
+        """
+        Convert table content to markdown format
+        """
+        try:
+            markdown_content = []
+            if '<table' in text.lower() or '<tr' in text.lower():
+                markdown_table = extract_table_from_html(text)
+                markdown_content.append(markdown_table + "\n")
+            else:
+                table_lines = text.split('\n')
+                if table_lines:
+                    col_count = len(table_lines[0].split()) if table_lines[0] else 1
+                    header = '| ' + ' | '.join(table_lines[0].split()) + ' |'
+                    markdown_content.append(header)
+                    markdown_content.append('| ' + ' | '.join(['---'] * col_count) + ' |')
+                    for line in table_lines[1:]:
+                        cells = line.split()
+                        while len(cells) < col_count:
+                            cells.append('')
+                        markdown_content.append('| ' + ' | '.join(cells) + ' |')
+            return '\n'.join(markdown_content) + '\n\n'
+        except Exception as e:
+            print(f"_handle_table error: {str(e)}")
+            return f"*[Error processing table: {str(e)}]*\n\n"
+
+    def _handle_algorithm(self, text: str) -> str:
+        """
+        Process algorithm blocks with proper formatting
+        """
+        try:
+            # Remove algorithm environment tags if present
+            text = re.sub(r'\\begin\{algorithm\}(.*?)\\end\{algorithm\}', r'\1', text, flags=re.DOTALL)
+            text = text.replace('\\begin{algorithm}', '').replace('\\end{algorithm}', '')
+            text = text.replace('\\begin{algorithmic}', '').replace('\\end{algorithmic}', '')
+
+            # Process the algorithm text
+            lines = text.strip().split('\n')
+
+            # Check if there's a caption or label
+            caption = ""
+            algorithm_text = []
+
+            for line in lines:
+                if '\\caption' in line:
+                    # Extract caption text
+                    caption_match = re.search(r'\\caption\{(.*?)\}', line)
+                    if caption_match:
+                        caption = f"**{caption_match.group(1)}**\n\n"
+                    continue
+                elif '\\label' in line:
+                    continue  # Skip label lines
+                else:
+                    algorithm_text.append(line)
+
+            # Join the algorithm text and wrap in code block
+            formatted_text = '\n'.join(algorithm_text)
+
+            # Return the formatted algorithm with caption
+            return f"{caption}```\n{formatted_text}\n```\n\n"
+        except Exception as e:
+            print(f"_handle_algorithm error: {str(e)}")
+            return f"*[Error processing algorithm: {str(e)}]*\n\n{text}\n\n"
+
+    def _handle_formula(self, text: str) -> str:
+        """
+        Handle formula-specific content
+        """
+        try:
+            # Process the formula content
+            processed_text = self._process_formulas_in_text(text)
+
+            # For formula blocks, ensure they're properly formatted in markdown
+            if '$$' not in processed_text and '\\[' not in processed_text:
+                # If no block formula delimiters are present, wrap in $$ for block formula
+                processed_text = f'$${processed_text}$$'
+
+            return f"{processed_text}\n\n"
+        except Exception as e:
+            print(f"_handle_formula error: {str(e)}")
+            return f"*[Error processing formula: {str(e)}]*\n\n"
+
+    def convert(self, recognition_results: List[Dict[str, Any]]) -> str:
+        """
+        Convert recognition results to markdown format
+        """
+        try:
+            markdown_content = []
+
+            for section_count, result in enumerate(recognition_results):
+                try:
+                    label = result.get('label', '')
+                    text = result.get('text', '').strip()
+
+                    # Skip empty text
+                    if not text:
+                        continue
+
+                    # Handle different content types
+                    if label in {'title', 'sec', 'sub_sec'}:
+                        markdown_content.append(self._handle_heading(text, label))
+                    elif label == 'list':
+                        markdown_content.append(self._handle_list_item(text))
+                    elif label == 'fig':
+                        markdown_content.append(self._handle_figure(text, section_count))
+                    elif label == 'tab':
+                        markdown_content.append(self._handle_table(text))
+                    elif label == 'alg':
+                        markdown_content.append(self._handle_algorithm(text))
+                    elif label == 'formula':
+                        markdown_content.append(self._handle_formula(text))
+                    elif label not in self.special_labels:
+                        # Handle regular text (paragraphs, etc.)
+                        processed_text = self._handle_text(text)
+                        markdown_content.append(f"{processed_text}\n\n")
+                except Exception as e:
+                    print(f"Error processing item {section_count}: {str(e)}")
+                    # Add a placeholder for the failed item
+                    markdown_content.append("*[Error processing content]*\n\n")
+
+            # Join all content and apply post-processing
+            result = ''.join(markdown_content)
+            return self._post_process(result)
+        except Exception as e:
+            print(f"convert error: {str(e)}")
+            return f"Error generating markdown content: {str(e)}"
+
+    def _post_process(self, markdown_content: str) -> str:
+        """
+        Apply post-processing fixes to the generated markdown content
+        """
+        try:
+            # Handle author information
+            author_pattern = re.compile(r'\\author\{(.*?)\}', re.DOTALL)
+
+            def process_author_match(match):
+                # Extract author content
+                author_content = match.group(1)
+                # Process the author content
+                return self._handle_text(author_content)
+
+            # Replace \author{...} with processed content
+            markdown_content = author_pattern.sub(process_author_match, markdown_content)
+
+            # Handle special case where author is inside math environment
+            math_author_pattern = re.compile(r'\$(\\author\{.*?\})\$', re.DOTALL)
+            match = math_author_pattern.search(markdown_content)
+            if match:
+                # Extract the author command
+                author_cmd = match.group(1)
+                # Extract content from author command
+                author_content_match = re.search(r'\\author\{(.*?)\}', author_cmd, re.DOTALL)
+                if author_content_match:
+                    # Get author content and process it
+                    author_content = author_content_match.group(1)
+                    processed_content = self._handle_text(author_content)
+                    # Replace the entire $\author{...}$ block with processed content
+                    markdown_content = markdown_content.replace(match.group(0), processed_content)
+
+            # Replace LaTeX abstract environment with plain text
+            markdown_content = re.sub(r'\\begin\{abstract\}(.*?)\\end\{abstract\}',
+                                      r'**Abstract** \1',
+                                      markdown_content,
+                                      flags=re.DOTALL)
+
+            # Replace standalone \begin{abstract} (without matching end)
+            markdown_content = re.sub(r'\\begin\{abstract\}',
+                                      r'**Abstract**',
+                                      markdown_content)
+
+            # Replace LaTeX equation numbers with tag format, handling cases with extra backslashes
+            markdown_content = re.sub(r'\\eqno\{\((.*?)\)\}',
+                                      r'\\tag{\1}',
+                                      markdown_content)
+
+            # Find the starting tag of the formula
+            markdown_content = markdown_content.replace("\\[ \\\\", "$$ \\\\")
+
+            # Find the ending tag of the formula (ensure this is the only ending tag)
+            markdown_content = markdown_content.replace("\\\\ \\]", "\\\\ $$")
+
+            # Fix other common LaTeX issues
+            replacements = [
+                # Fix spacing issues in subscripts and superscripts
+                # (the caret is escaped so it matches literally, not as an anchor)
+                (r'_ {', r'_{'),
+                (r'\^ {', r'^{'),
+
+                # Fix potential issues with multiple consecutive newlines
+                (r'\n{3,}', r'\n\n')
+            ]
+
+            for old, new in replacements:
+                markdown_content = re.sub(old, new, markdown_content)
+
+            return markdown_content
+        except Exception as e:
+            print(f"_post_process error: {str(e)}")
+            return markdown_content  # Return original content if post-processing fails
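
For reference, here is a minimal usage sketch of MarkdownConverter on input shaped like the module's documented example; the two sample items are illustrative.

```python
from utils.markdown_utils import MarkdownConverter

results = [
    {"label": "sec", "bbox": [0.1, 0.1, 0.9, 0.15], "text": "4 Results", "reading_order": 0},
    {"label": "para", "bbox": [0.1, 0.2, 0.9, 0.4],
     "text": "We tracked the accuracy of TinyLlama on common-\nsense reasoning benchmarks.",
     "reading_order": 1},
]

converter = MarkdownConverter()
print(converter.convert(results))
# Expected output:
# ## 4 Results
#
# We tracked the accuracy of TinyLlama on commonsense reasoning benchmarks.
```

`convert` dispatches on each item's label ('sec' becomes a '##' heading, 'tab' goes through extract_table_from_html, and so on), joins the pieces, and runs `_post_process` over the result; note how the hyphenated line break in the paragraph is stitched back together.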
utils/utils.py
ADDED
@@ -0,0 +1,367 @@
+"""
+Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+SPDX-License-Identifier: MIT
+"""
+
+import copy
+import json
+import os
+import re
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import albumentations as alb
+import cv2
+import numpy as np
+from albumentations.pytorch import ToTensorV2
+from PIL import Image
+from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from torchvision.transforms.functional import resize
+
+from utils.markdown_utils import MarkdownConverter
+
+
+def alb_wrapper(transform):
+    def f(im):
+        return transform(image=np.asarray(im))["image"]
+
+    return f
+
+
+test_transform = alb_wrapper(
+    alb.Compose(
+        [
+            alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
+            ToTensorV2(),
+        ]
+    )
+)
+
+
+def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
+    # print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
+    if x2 <= x1 or y2 <= y1:
+        return False, f"[{x1}, {y1}, {x2}, {y2}]"
+    if x1 < 0 or y1 < 0:
+        return False, f"[{x1}, {y1}, {x2}, {y2}]"
+    if not abs_coord:
+        if x2 > 1 or y2 > 1:
+            return False, f"[{x1}, {y1}, {x2}, {y2}]"
+    elif image_size is not None:  # has image size
+        if x2 > image_size[0] or y2 > image_size[1]:
+            return False, f"[{x1}, {y1}, {x2}, {y2}]"
+    return True, None
+
+
+def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2):
+    """
+    Image: cv2.image object, or Path
+    Input: boxes: list of boxes [[x1, y1, x2, y2]]. Using absolute coordinates.
+    """
+    if isinstance(image, str):
+        image = cv2.imread(image)
+    img_h, img_w = image.shape[:2]
+    new_boxes = []
+    for box in boxes:
+        best_box = copy.deepcopy(box)
+
+        def check_edge(img, current_box, i, is_vertical):
+            edge = current_box[i]
+            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+            if is_vertical:
+                line = binary[current_box[1] : current_box[3] + 1, edge]
+            else:
+                line = binary[edge, current_box[0] : current_box[2] + 1]
+
+            transitions = np.abs(np.diff(line))
+            return np.sum(transitions) / len(transitions)
+
+        # Only widen the box
+        edges = [(0, -1, True), (2, 1, True), (1, -1, False), (3, 1, False)]
+
+        current_box = copy.deepcopy(box)
+        # Make sure the box is within the image
+        current_box[0] = min(max(current_box[0], 0), img_w - 1)
+        current_box[1] = min(max(current_box[1], 0), img_h - 1)
+        current_box[2] = min(max(current_box[2], 0), img_w - 1)
+        current_box[3] = min(max(current_box[3], 0), img_h - 1)
+
+        for i, direction, is_vertical in edges:
+            best_score = check_edge(image, current_box, i, is_vertical)
+            if best_score <= threshold:
+                continue
+            for step in range(max_pixels):
+                current_box[i] += direction
+                if i == 0 or i == 2:
+                    current_box[i] = min(max(current_box[i], 0), img_w - 1)
+                else:
+                    current_box[i] = min(max(current_box[i], 0), img_h - 1)
+                score = check_edge(image, current_box, i, is_vertical)
+
+                if score < best_score:
+                    best_score = score
+                    best_box = copy.deepcopy(current_box)
+
+                if score <= threshold:
+                    break
+        new_boxes.append(best_box)
+
+    return new_boxes
+
+
+def parse_layout_string(bbox_str):
+    """Parse layout string using regular expressions"""
+    pattern = r"\[(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+)\]\s*(\w+)"
+    matches = re.finditer(pattern, bbox_str)
+
+    parsed_results = []
+    for match in matches:
+        coords = [float(match.group(i)) for i in range(1, 5)]
+        label = match.group(5).strip()
+        parsed_results.append((coords, label))
+
+    return parsed_results
+
+
+@dataclass
+class ImageDimensions:
+    """Class to store image dimensions"""
+    original_w: int
+    original_h: int
+    padded_w: int
+    padded_h: int
+
+
+def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]:
+    """Map coordinates from padded image back to original image
+
+    Args:
+        x1, y1, x2, y2: Coordinates in padded image
+        dims: Image dimensions object
+
+    Returns:
+        tuple: (x1, y1, x2, y2) coordinates in original image
+    """
+    try:
+        # Calculate padding offsets
+        top = (dims.padded_h - dims.original_h) // 2
+        left = (dims.padded_w - dims.original_w) // 2
+
+        # Map back to original coordinates
+        orig_x1 = max(0, x1 - left)
+        orig_y1 = max(0, y1 - top)
+        orig_x2 = min(dims.original_w, x2 - left)
+        orig_y2 = min(dims.original_h, y2 - top)
+
+        # Ensure we have a valid box (width and height > 0)
+        if orig_x2 <= orig_x1:
+            orig_x2 = min(orig_x1 + 1, dims.original_w)
+        if orig_y2 <= orig_y1:
+            orig_y2 = min(orig_y1 + 1, dims.original_h)
+
+        return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
+    except Exception as e:
+        print(f"map_to_original_coordinates error: {str(e)}")
+        # Return safe coordinates
+        return 0, 0, min(100, dims.original_w), min(100, dims.original_h)
+
+
+def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
+    """
+    From absolute coordinates to relative coordinates
+    e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
+    """
+    try:
+        x1, y1, x2, y2 = abs_coords
+        return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3)
+    except Exception as e:
+        print(f"map_to_relevant_coordinates error: {str(e)}")
+        return 0.0, 0.0, 1.0, 1.0  # Return full image coordinates
+
+
+def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
+    """Process and adjust coordinates
+
+    Args:
+        coords: Normalized coordinates [x1, y1, x2, y2]
+        padded_image: Padded image
+        dims: Image dimensions object
+        previous_box: Previous box coordinates for overlap adjustment
+
+    Returns:
+        tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box)
+    """
+    try:
+        # Convert normalized coordinates to absolute coordinates
+        x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
+        x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
+
+        # Ensure coordinates are within image bounds before adjustment
+        x1 = max(0, min(x1, dims.padded_w - 1))
+        y1 = max(0, min(y1, dims.padded_h - 1))
+        x2 = max(0, min(x2, dims.padded_w))
+        y2 = max(0, min(y2, dims.padded_h))
+
+        # Ensure width and height are at least 1 pixel
+        if x2 <= x1:
+            x2 = min(x1 + 1, dims.padded_w)
+        if y2 <= y1:
+            y2 = min(y1 + 1, dims.padded_h)
+
+        # Extend box boundaries
+        new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
+        x1, y1, x2, y2 = new_boxes[0]
+
+        # Ensure coordinates are still within image bounds after adjustment
+        x1 = max(0, min(x1, dims.padded_w - 1))
+        y1 = max(0, min(y1, dims.padded_h - 1))
+        x2 = max(0, min(x2, dims.padded_w))
+        y2 = max(0, min(y2, dims.padded_h))
+
+        # Ensure width and height are at least 1 pixel after adjustment
+        if x2 <= x1:
+            x2 = min(x1 + 1, dims.padded_w)
+        if y2 <= y1:
+            y2 = min(y1 + 1, dims.padded_h)
+
+        # Check for overlap with previous box and adjust
+        if previous_box is not None:
+            prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
+            if (x1 < prev_x2 and x2 > prev_x1) and (y1 < prev_y2 and y2 > prev_y1):
+                y1 = prev_y2
+                # Ensure y1 is still valid
+                y1 = min(y1, dims.padded_h - 1)
+                # Make sure y2 is still greater than y1
+                if y2 <= y1:
+                    y2 = min(y1 + 1, dims.padded_h)
+
+        # Update previous box
+        new_previous_box = [x1, y1, x2, y2]
+
+        # Map to original coordinates
+        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
+            x1, y1, x2, y2, dims
+        )
+
+        return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
+    except Exception as e:
+        print(f"process_coordinates error: {str(e)}")
+        # Return safe values
+        orig_x1, orig_y1, orig_x2, orig_y2 = 0, 0, min(100, dims.original_w), min(100, dims.original_h)
+        return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100]
+
+
+def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
+    """Load and prepare image with padding while maintaining aspect ratio
+
+    Args:
+        image: PIL image
+
+    Returns:
+        tuple: (padded_image, image_dimensions)
+    """
+    try:
+        # Convert PIL image to OpenCV format
+        image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+        original_h, original_w = image.shape[:2]
+
+        # Calculate padding to make square image
+        max_size = max(original_h, original_w)
+        top = (max_size - original_h) // 2
+        bottom = max_size - original_h - top
+        left = (max_size - original_w) // 2
+        right = max_size - original_w - left
+
+        # Apply padding
+        padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
+                                          cv2.BORDER_CONSTANT, value=(0, 0, 0))
+
+        padded_h, padded_w = padded_image.shape[:2]
+
+        dimensions = ImageDimensions(
+            original_w=original_w,
+            original_h=original_h,
+            padded_w=padded_w,
+            padded_h=padded_h
+        )
+
+        return padded_image, dimensions
+    except Exception as e:
+        print(f"prepare_image error: {str(e)}")
+        # Create a minimal valid image and dimensions
+        h, w = image.height, image.width
+        dimensions = ImageDimensions(
+            original_w=w,
+            original_h=h,
+            padded_w=w,
+            padded_h=h
+        )
+        # Return a black image of the same size
+        return np.zeros((h, w, 3), dtype=np.uint8), dimensions
+
+
+def setup_output_dirs(save_dir):
+    """Create necessary output directories"""
+    os.makedirs(save_dir, exist_ok=True)
+    os.makedirs(os.path.join(save_dir, "markdown"), exist_ok=True)
+    os.makedirs(os.path.join(save_dir, "recognition_json"), exist_ok=True)
+
+
+def save_outputs(recognition_results, image_path, save_dir):
+    """Save JSON and markdown outputs"""
+    basename = os.path.splitext(os.path.basename(image_path))[0]
+
+    # Save JSON file
+    json_path = os.path.join(save_dir, "recognition_json", f"{basename}.json")
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(recognition_results, f, ensure_ascii=False, indent=2)
+
+    # Generate and save markdown file
+    markdown_converter = MarkdownConverter()
+    markdown_content = markdown_converter.convert(recognition_results)
+    markdown_path = os.path.join(save_dir, "markdown", f"{basename}.md")
+    with open(markdown_path, "w", encoding="utf-8") as f:
+        f.write(markdown_content)
+
+    return json_path
+
+
+def crop_margin(img: Image.Image) -> Image.Image:
+    """Crop margins from image"""
+    try:
+        width, height = img.size
+        if width == 0 or height == 0:
+            print("Warning: Image has zero width or height")
+            return img
+
+        data = np.array(img.convert("L"))
+        data = data.astype(np.uint8)
+        max_val = data.max()
+        min_val = data.min()
+        if max_val == min_val:
+            return img
+        data = (data - min_val) / (max_val - min_val) * 255
+        gray = 255 * (data < 200).astype(np.uint8)
+
+        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
+        if coords is None:
+            return img
+        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
+
+        # Ensure crop coordinates are within image bounds
+        a = max(0, a)
+        b = max(0, b)
+        w = min(w, width - a)
+        h = min(h, height - b)
+
+        # Only crop if we have a valid region
+        if w > 0 and h > 0:
+            return img.crop((a, b, a + w, b + h))
+        return img
+    except Exception as e:
+        print(f"crop_margin error: {str(e)}")
+        return img  # Return original image on error
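
To close, a minimal sketch tying the geometry helpers together: parsing a model layout string, padding a page, and mapping one region back to original-image coordinates. The layout string values are illustrative; examples/page_4.png is one of the sample files shipped with this Space.

```python
from PIL import Image

from utils.utils import parse_layout_string, prepare_image, process_coordinates

# Pad the page to a square so normalized layout coordinates apply uniformly
page = Image.open("examples/page_4.png").convert("RGB")
padded_image, dims = prepare_image(page)

# A layout string in the format parse_layout_string expects: "[x1, y1, x2, y2] label"
layout = "[0.176, 0.74, 0.824, 0.82] tab [0.28, 0.729, 0.711, 0.74] cap"

previous_box = None
for coords, label in parse_layout_string(layout):
    x1, y1, x2, y2, ox1, oy1, ox2, oy2, previous_box = process_coordinates(
        coords, padded_image, dims, previous_box
    )
    print(label, "padded:", (x1, y1, x2, y2), "original:", (ox1, oy1, ox2, oy2))
```

process_coordinates clamps each box, widens it via adjust_box_edges, and only then maps it back through the padding offsets, which is why both the padded and original coordinates are returned.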