automation committed on
Commit
f7a3bfb
·
1 Parent(s): f6c9fd4

Upd module type #2 for torch

Browse files
Files changed (2) hide show
  1. app.py +261 -100
  2. requirements.txt +0 -1
app.py CHANGED
@@ -13,28 +13,91 @@ import warnings
13
  import numpy as np
14
  import base64
15
  from io import StringIO, BytesIO
 
 
 
 
16
 
17
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
18
 
19
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
20
- model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
21
- model = model.eval().cuda()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  MODEL_CONFIGS = {
24
- "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
25
- "🚀 Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
26
- "📄 Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
27
- "📊 Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
28
- "🎯 Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False}
 
 
 
 
 
 
 
 
 
29
  }
 
30
 
31
  TASK_PROMPTS = {
32
- "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
33
- "📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
34
- "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
35
- "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
36
- "✏️ Custom": {"prompt": "", "has_grounding": False}
 
 
 
 
 
 
 
 
37
  }
 
38
 
39
  def extract_grounding_references(text):
40
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
@@ -106,28 +169,31 @@ def embed_images(markdown, crops):
106
  markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
107
  return markdown
108
 
109
- @spaces.GPU(duration=60)
110
- def process_image(image, mode, task, custom_prompt):
111
  if image is None:
112
  return " Error Upload image", "", "", None, []
113
- if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
114
  return "Enter prompt", "", "", None, []
115
 
116
  if image.mode in ('RGBA', 'LA', 'P'):
117
  image = image.convert('RGB')
118
  image = ImageOps.exif_transpose(image)
119
 
120
- config = MODEL_CONFIGS[mode]
 
 
 
121
 
122
- if task == "✏️ Custom":
123
  prompt = f"<image>\n{custom_prompt.strip()}"
124
  has_grounding = '<|grounding|>' in custom_prompt
125
- elif task == "📍 Locate":
126
  prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
127
  has_grounding = True
128
  else:
129
- prompt = TASK_PROMPTS[task]["prompt"]
130
- has_grounding = TASK_PROMPTS[task]["has_grounding"]
131
 
132
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
133
  image.save(tmp.name, 'JPEG', quality=95)
@@ -161,21 +227,81 @@ def process_image(image, mode, task, custom_prompt):
161
  if refs:
162
  img_out, crops = draw_bounding_boxes(image, refs, True)
163
 
164
- markdown = embed_images(markdown, crops)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
  return cleaned, markdown, result, img_out, crops
167
 
168
- @spaces.GPU(duration=300)
169
- def process_pdf(path, mode, task, custom_prompt):
170
  doc = fitz.open(path)
171
  texts, markdowns, raws, all_crops = [], [], [], []
172
 
173
- for i in range(len(doc)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  page = doc.load_page(i)
175
- pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
176
  img = Image.open(BytesIO(pix.tobytes("png")))
177
 
178
- text, md, raw, _, crops = process_image(img, mode, task, custom_prompt)
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  if text and text != "No text":
181
  texts.append(f"### Page {i + 1}\n\n{text}")
@@ -185,23 +311,24 @@ def process_pdf(path, mode, task, custom_prompt):
185
 
186
  doc.close()
187
 
188
- return ("\n\n---\n\n".join(texts) if texts else "No text in PDF",
189
- "\n\n---\n\n".join(markdowns) if markdowns else "No text in PDF",
 
190
  "\n\n".join(raws), None, all_crops)
191
 
192
- def process_file(path, mode, task, custom_prompt):
193
  if not path:
194
  return "Error Upload file", "", "", None, []
195
 
196
  if path.lower().endswith('.pdf'):
197
- return process_pdf(path, mode, task, custom_prompt)
198
  else:
199
- return process_image(Image.open(path), mode, task, custom_prompt)
200
 
201
- def toggle_prompt(task):
202
- if task == "✏️ Custom":
203
  return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
204
- elif task == "📍 Locate":
205
  return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
206
  return gr.update(visible=False)
207
 
@@ -218,74 +345,108 @@ def load_image(file_path):
218
  else:
219
  return Image.open(file_path)
220
 
221
- with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
222
- gr.Markdown("""
223
- # 🚀 DeepSeek-OCR Demo
224
- **Convert documents to markdown, extract raw text, and locate specific content with bounding boxes. Check the info at the bottom of the page for more information.**
225
-
226
- **Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
227
- """)
228
-
229
- with gr.Row():
230
- with gr.Column(scale=1):
231
- file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
232
- input_img = gr.Image(label="Input Image", type="pil", height=300)
233
- mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="⚡ Gundam", label="Mode")
234
- task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
235
- prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
236
- btn = gr.Button("Extract", variant="primary", size="lg")
237
-
238
- with gr.Column(scale=2):
239
- with gr.Tabs():
240
- with gr.Tab("📝 Text"):
241
- text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
242
- with gr.Tab("🎨 Markdown"):
243
- md_out = gr.Markdown("")
244
- with gr.Tab("🖼️ Boxes"):
245
- img_out = gr.Image(type="pil", height=500, show_label=False)
246
- with gr.Tab("🖼️ Cropped Images"):
247
- gallery = gr.Gallery(show_label=False, columns=3, height=400)
248
- with gr.Tab("🔍 Raw"):
249
- raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
250
-
251
- gr.Examples(
252
- examples=[
253
- ["examples/ocr.jpg", "⚡ Gundam", "📋 Markdown", ""],
254
- ["examples/reachy-mini.jpg", "⚡ Gundam", "📍 Locate", "Robot"]
255
- ],
256
- inputs=[input_img, mode, task, prompt],
257
- cache_examples=False
258
- )
259
-
260
- with gr.Accordion("ℹ️ Info", open=False):
261
  gr.Markdown("""
262
- ### Modes
263
- - **Gundam**: 1024 base + 640 tiles with cropping - Best balance
264
- - **Tiny**: 512×512, no crop - Fastest
265
- - **Small**: 640×640, no crop - Quick
266
- - **Base**: 1024×1024, no crop - Standard
267
- - **Large**: 1280×1280, no crop - Highest quality
268
-
269
- ### Tasks
270
- - **Markdown**: Convert document to structured markdown (grounding ✅)
271
- - **Free OCR**: Simple text extraction
272
- - **Locate**: Find specific text in image (grounding ✅)
273
- - **Describe**: General image description
274
- - **Custom**: Your own prompt (add `<|grounding|>` for boxes)
275
  """)
276
-
277
- file_in.change(load_image, [file_in], [input_img])
278
- task.change(toggle_prompt, [task], [prompt])
279
-
280
- def run(image, file_path, mode, task, custom_prompt):
281
- if image is not None:
282
- return process_image(image, mode, task, custom_prompt)
283
- if file_path:
284
- return process_file(file_path, mode, task, custom_prompt)
285
- return "Error uploading file or image", "", "", None, []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
- btn.click(run, [input_img, file_in, mode, task, prompt],
288
- [text_out, md_out, raw_out, img_out, gallery])
289
 
290
  if __name__ == "__main__":
291
- demo.queue(max_size=20).launch()
 
13
  import numpy as np
14
  import base64
15
  from io import StringIO, BytesIO
16
+ import subprocess
17
+ import importlib
18
+ import time
19
+ import zipfile
20
 
21
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
22
 
23
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
24
+
25
def ensure_flash_attn_if_cuda():
    """Make sure the ``flash_attn`` package is importable on a CUDA host.

    Nothing is attempted when ``torch.cuda.is_available()`` is false. Otherwise
    the module is imported; if that fails, ``flash-attn==2.7.3`` is installed
    with pip into the running interpreter and the import is retried.

    Returns:
        bool: True when ``flash_attn`` could be imported, False when CUDA is
        unavailable or the import/install did not succeed. The function is
        best-effort and does not raise; failures are reported via
        ``warnings.warn`` so they are visible in the logs.
    """
    import importlib
    import subprocess
    import sys
    import warnings

    # Only attempt the import/install when CUDA is available.
    if not torch.cuda.is_available():
        return False

    try:
        importlib.import_module('flash_attn')
        return True
    except Exception as exc:  # broad on purpose: any import failure triggers the install path
        warnings.warn(f"flash_attn import failed ({exc!r}); attempting pip install.")

    try:
        # Install without build isolation so the package's setup can import torch.
        subprocess.check_call([
            sys.executable, '-m', 'pip', 'install',
            '--no-build-isolation', '--no-cache-dir', 'flash-attn==2.7.3',
        ])
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        importlib.import_module('flash_attn')
        return True
    except Exception as exc:  # best-effort: caller falls back when this returns False
        warnings.warn(f"flash_attn install/import failed ({exc!r}).")
        return False
44
+ flash_ok = ensure_flash_attn_if_cuda()
45
+ try:
46
+ model = AutoModel.from_pretrained(
47
+ MODEL_NAME,
48
+ _attn_implementation='flash_attention_2' if flash_ok else None,
49
+ torch_dtype=torch.bfloat16,
50
+ trust_remote_code=True,
51
+ use_safetensors=True,
52
+ )
53
+ if torch.cuda.is_available():
54
+ model = model.eval().cuda()
55
+ else:
56
+ raise RuntimeError("CUDA not available; cannot use flash attention")
57
+ except Exception as e:
58
+ warnings.warn(f"Flash attention/CUDA unavailable ({e}); falling back to default attention.")
59
+ model = AutoModel.from_pretrained(
60
+ MODEL_NAME,
61
+ trust_remote_code=True,
62
+ use_safetensors=True,
63
+ )
64
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
65
+ model = model.to(device).eval()
66
 
67
  MODEL_CONFIGS = {
68
+ "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
69
+ "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
70
+ "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
71
+ "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
72
+ "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False}
73
+ }
74
+
75
+ # UI labels mapped to internal keys (use plain labels to match dropdown values)
76
+ MODE_LABEL_TO_KEY = {
77
+ "Gundam": "Gundam",
78
+ "Tiny": "Tiny",
79
+ "Small": "Small",
80
+ "Base": "Base",
81
+ "Large": "Large",
82
  }
83
+ KEY_TO_MODE_LABEL = {v: k for k, v in MODE_LABEL_TO_KEY.items()}
84
 
85
  TASK_PROMPTS = {
86
+ "Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to GitHub-flavored Markdown. Preserve headings, lists, links, code blocks, and tables.", "has_grounding": True},
87
+ "Tables": {"prompt": "<image>\n<|grounding|>Extract ALL tables only as GitHub Markdown tables. Preserve merged cells as best as possible. Do not include non-table content.", "has_grounding": True},
88
+ "Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
89
+ "Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
90
+ "Custom": {"prompt": "", "has_grounding": False}
91
+ }
92
+
93
+ TASK_LABEL_TO_KEY = {
94
+ "Markdown": "Markdown",
95
+ "Tables": "Tables",
96
+ "Locate": "Locate",
97
+ "Describe": "Describe",
98
+ "Custom": "Custom",
99
  }
100
+ KEY_TO_TASK_LABEL = {v: k for k, v in TASK_LABEL_TO_KEY.items()}
101
 
102
  def extract_grounding_references(text):
103
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
 
169
  markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
170
  return markdown
171
 
172
+ @spaces.GPU(duration=120)
173
+ def process_image(image, mode_label, task_label, custom_prompt, embed_figures=False, high_accuracy=False):
174
  if image is None:
175
  return " Error Upload image", "", "", None, []
176
+ if task_label in ["Custom", "Locate"] and not custom_prompt.strip():
177
  return "Enter prompt", "", "", None, []
178
 
179
  if image.mode in ('RGBA', 'LA', 'P'):
180
  image = image.convert('RGB')
181
  image = ImageOps.exif_transpose(image)
182
 
183
+ # Normalize labels to internal keys
184
+ mode_key = MODE_LABEL_TO_KEY.get(mode_label, mode_label)
185
+ task_key = TASK_LABEL_TO_KEY.get(task_label, task_label)
186
+ config = MODEL_CONFIGS[mode_key]
187
 
188
+ if task_label == "Custom":
189
  prompt = f"<image>\n{custom_prompt.strip()}"
190
  has_grounding = '<|grounding|>' in custom_prompt
191
+ elif task_label == "Locate":
192
  prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
193
  has_grounding = True
194
  else:
195
+ prompt = TASK_PROMPTS[task_key]["prompt"]
196
+ has_grounding = TASK_PROMPTS[task_key]["has_grounding"]
197
 
198
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
199
  image.save(tmp.name, 'JPEG', quality=95)
 
227
  if refs:
228
  img_out, crops = draw_bounding_boxes(image, refs, True)
229
 
230
+ if embed_figures:
231
+ markdown = embed_images(markdown, crops)
232
+
233
+ # Optional second pass for high accuracy (focus on tables refinement)
234
+ if high_accuracy and task_key in ["Markdown", "Tables"]:
235
+ refine_prompt = "<image>\nRefine the previous extraction with emphasis on accurate table structure and alignment. Output GitHub Markdown only."
236
+ tmp2 = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
237
+ image.save(tmp2.name, 'JPEG', quality=95)
238
+ tmp2.close()
239
+ out_dir2 = tempfile.mkdtemp()
240
+ stdout2 = sys.stdout
241
+ sys.stdout = StringIO()
242
+ model.infer(tokenizer=tokenizer, prompt=refine_prompt, image_file=tmp2.name, output_path=out_dir2,
243
+ base_size=config["base_size"], image_size=config["image_size"], crop_mode=config["crop_mode"])
244
+ refine_result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
245
+ if not any(s in l for s in ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size'])]).strip()
246
+ sys.stdout = stdout2
247
+ os.unlink(tmp2.name)
248
+ shutil.rmtree(out_dir2, ignore_errors=True)
249
+ if refine_result:
250
+ refined_md = clean_output(refine_result, embed_figures, True)
251
+ # Prefer refined markdown if longer (heuristic)
252
+ if len(refined_md) > len(markdown):
253
+ markdown = refined_md
254
 
255
  return cleaned, markdown, result, img_out, crops
256
 
257
+ @spaces.GPU(duration=120)
258
+ def process_pdf(path, mode_label, task_label, custom_prompt, dpi=300, page_range_text="", embed_figures=False, high_accuracy=False, insert_separators=True, max_retries=5, retry_backoff_seconds=5):
259
  doc = fitz.open(path)
260
  texts, markdowns, raws, all_crops = [], [], [], []
261
 
262
+ # Parse page range like "1-3,5"
263
def parse_ranges(s, total):
    """Turn a 1-based page specification such as ``"1-3,5"`` into page indices.

    Args:
        s: Comma-separated list of page numbers and ``a-b`` ranges (1-based,
            inclusive). Empty, whitespace-only or ``None`` selects every page.
        total: Number of pages available.

    Returns:
        Sorted list of unique 0-based indices, each within ``[0, total)``.
        Entries that are not integers are ignored; ranges are clamped to the
        valid page span; a range whose start exceeds its end selects nothing.
    """
    if s is None or not s.strip():
        return list(range(total))

    pages = set()
    for part in s.split(','):
        part = part.strip()
        if not part:
            continue
        # Split on the first '-' only, so "1-2-3" is rejected by int() below.
        first, dash, last = part.partition('-')
        try:
            start = int(first) - 1
            end = int(last) - 1 if dash else start
        except ValueError:
            # Malformed entry (e.g. "a-b", "3-", "x"): skip it.
            continue
        # Clamp to the document; out-of-range requests yield an empty range.
        pages.update(range(max(0, start), min(total - 1, end) + 1))
    return sorted(pages)
285
+
286
+ target_pages = parse_ranges(page_range_text, len(doc))
287
+
288
+ for i in target_pages:
289
  page = doc.load_page(i)
290
+ pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72), alpha=False)
291
  img = Image.open(BytesIO(pix.tobytes("png")))
292
 
293
+ # Retry loop to handle GPU timeouts/busy states gracefully
294
+ attempt = 0
295
+ while True:
296
+ try:
297
+ text, md, raw, _, crops = process_image(img, mode_label, task_label, custom_prompt, embed_figures=embed_figures, high_accuracy=high_accuracy)
298
+ break
299
+ except Exception:
300
+ attempt += 1
301
+ if attempt >= max_retries:
302
+ text, md, raw, crops = "", f"<!-- Failed to process page {i+1} after retries -->", "", []
303
+ break
304
+ time.sleep(retry_backoff_seconds * attempt)
305
 
306
  if text and text != "No text":
307
  texts.append(f"### Page {i + 1}\n\n{text}")
 
311
 
312
  doc.close()
313
 
314
+ sep = "\n\n---\n\n" if insert_separators else "\n\n"
315
+ return (sep.join(texts) if texts else "No text in PDF",
316
+ sep.join(markdowns) if markdowns else "No text in PDF",
317
  "\n\n".join(raws), None, all_crops)
318
 
319
def process_file(path, mode_label, task_label, custom_prompt, dpi=300, page_range_text="", embed_figures=False, high_accuracy=False, insert_separators=True):
    """Route an uploaded file to the PDF or the single-image pipeline.

    A path ending in ``.pdf`` (case-insensitive) is handed to ``process_pdf``
    together with the PDF-specific options; anything else is opened with
    ``Image.open`` and handed to ``process_image``. A falsy ``path`` yields
    the five-element error tuple without touching either pipeline.
    """
    if not path:
        return "Error Upload file", "", "", None, []

    # Options understood by both pipelines.
    shared = {"embed_figures": embed_figures, "high_accuracy": high_accuracy}

    is_pdf = path.lower().endswith('.pdf')
    if not is_pdf:
        return process_image(Image.open(path), mode_label, task_label, custom_prompt, **shared)

    return process_pdf(
        path,
        mode_label,
        task_label,
        custom_prompt,
        dpi=dpi,
        page_range_text=page_range_text,
        insert_separators=insert_separators,
        **shared,
    )
327
 
328
def toggle_prompt(task_label):
    """Return the Gradio update that shows or hides the prompt textbox.

    The "Custom" and "Locate" tasks need user text, so the textbox is made
    visible with a task-specific label and placeholder; every other task
    hides it.
    """
    # (task, textbox label, textbox placeholder)
    prompt_fields = (
        ("Custom", "Custom Prompt", "Add <|grounding|> for boxes"),
        ("Locate", "Text to Locate", "Enter text"),
    )
    for name, label, placeholder in prompt_fields:
        if task_label == name:
            return gr.update(visible=True, label=label, placeholder=placeholder)
    return gr.update(visible=False)
334
 
 
345
  else:
346
  return Image.open(file_path)
347
 
348
+ def build_blocks(theme):
349
+ with gr.Blocks(theme=theme, title="DeepSeek-OCR") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  gr.Markdown("""
351
+ # DeepSeek-OCR WebUI
352
+ **Convert documents to markdown, extract raw text, and locate specific content with bounding boxes.**
 
 
 
 
 
 
 
 
 
 
 
353
  """)
354
+
355
+ with gr.Row():
356
+ with gr.Column(scale=1):
357
+ file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
358
+ input_img = gr.Image(label="Input Image", type="pil", height=300)
359
+ mode = gr.Dropdown(list(MODE_LABEL_TO_KEY.keys()), value="Gundam", label="Mode")
360
+ task = gr.Dropdown(list(TASK_LABEL_TO_KEY.keys()), value="Markdown", label="Task")
361
+ prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
362
+ with gr.Row():
363
+ embed_fig = gr.Checkbox(value=True, label="Embed figures into Markdown")
364
+ high_acc = gr.Checkbox(value=False, label="High accuracy (slower)")
365
+ with gr.Row():
366
+ dpi = gr.Slider(150, 600, value=300, step=50, label="PDF DPI")
367
+ page_range = gr.Textbox(label="Page range (e.g. 1-3,5)", placeholder="All pages")
368
+ page_seps = gr.Checkbox(value=True, label="Insert page separators (---)")
369
+ btn = gr.Button("Extract", variant="primary", size="lg")
370
+
371
+ with gr.Column(scale=2):
372
+ with gr.Tabs():
373
+ with gr.Tab("📝 Text"):
374
+ text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
375
+ with gr.Tab("Markdown"):
376
+ md_out = gr.Markdown("")
377
+ with gr.Tab("Boxes"):
378
+ img_out = gr.Image(type="pil", height=500, show_label=False)
379
+ with gr.Tab("Cropped Images"):
380
+ gallery = gr.Gallery(show_label=False, columns=3, height=400)
381
+ with gr.Tab("Raw"):
382
+ raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
383
+ with gr.Row():
384
+ dl_md = gr.DownloadButton(label="Download Markdown", value=None)
385
+ dl_txt = gr.DownloadButton(label="Download Text", value=None)
386
+ dl_md_zip = gr.DownloadButton(label="Download Markdown (split pages)", value=None)
387
+
388
+ with gr.Accordion("ℹ️ Info", open=False):
389
+ gr.Markdown("""
390
+ ### Modes
391
+ - ⚡ Gundam: 1024 base + 640 tiles with cropping - Best balance
392
+ - 🧩 Tiny: 512×512, no crop - Fastest
393
+ - 📄 Small: 640×640, no crop - Quick
394
+ - 📚 Base: 1024×1024, no crop - Standard
395
+ - 🖼️ Large: 1280×1280, no crop - Highest quality
396
+
397
+ ### Tasks
398
+ - Markdown: Convert document to structured markdown (grounding ✅)
399
+ - Tables: Extract tables only as Markdown (grounding ✅)
400
+ - Locate: Find specific text in image (grounding ✅)
401
+ - Describe: General image description
402
+ - Custom: Your own prompt (add `<|grounding|>` for boxes)
403
+ """)
404
+
405
+ file_in.change(load_image, [file_in], [input_img])
406
+ task.change(toggle_prompt, [task], [prompt])
407
+
408
def run(image, file_path, mode_label, task_label, custom_prompt, dpi_val, page_range_text, embed, hiacc, sep_pages):
    """Click handler: run OCR and prepare the downloadable artifacts.

    An in-memory ``image`` takes precedence over ``file_path``. The result of
    ``process_image`` / ``process_file`` is returned together with three
    download paths.

    Returns:
        An 8-tuple ``(text, markdown, raw, boxed_image, crops, md_path,
        txt_path, zip_path)``. ``zip_path`` is ``None`` unless the markdown
        contains at least one standalone ``---`` separator, in which case it
        points to a ZIP with one ``page_NNN.md`` per section. When neither an
        image nor a file is supplied, an error tuple with ``None`` for all
        three paths is returned.
    """
    import re
    import tempfile
    import warnings
    import zipfile

    if image is not None:
        text, md, raw, img, crops = process_image(image, mode_label, task_label, custom_prompt, embed_figures=embed, high_accuracy=hiacc)
    elif file_path:
        text, md, raw, img, crops = process_file(file_path, mode_label, task_label, custom_prompt, dpi=int(dpi_val), page_range_text=page_range_text, embed_figures=embed, high_accuracy=hiacc, insert_separators=sep_pages)
    else:
        return "Error uploading file or image", "", "", None, [], None, None, None

    def _write_temp(content, suffix):
        # Write through the temp-file handle itself so it is closed on exit;
        # delete=False keeps the file on disk for the download button.
        with tempfile.NamedTemporaryFile('w', delete=False, suffix=suffix, encoding='utf-8') as fh:
            fh.write(content or "")
            return fh.name

    md_path = _write_temp(md, ".md")
    txt_path = _write_temp(text, ".txt")

    # Optional ZIP: one markdown file per section delimited by a standalone '---'.
    zip_path = None
    try:
        if md:
            parts = re.split(r"\n\s*---\s*\n", md)
            if len(parts) > 1:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as zip_tmp:
                    with zipfile.ZipFile(zip_tmp, 'w', zipfile.ZIP_DEFLATED) as zf:
                        for idx, part in enumerate(parts, start=1):
                            zf.writestr(f"page_{idx:03d}.md", part.strip() + "\n")
                zip_path = zip_tmp.name
    except Exception as exc:  # best-effort: the ZIP is optional, never fail the request
        warnings.warn(f"Could not build split-pages ZIP ({exc!r}).")
        zip_path = None

    return text, md, raw, img, crops, md_path, txt_path, zip_path
439
+
440
+ btn.click(run, [input_img, file_in, mode, task, prompt, dpi, page_range, embed_fig, high_acc, page_seps],
441
+ [text_out, md_out, raw_out, img_out, gallery, dl_md, dl_txt, dl_md_zip])
442
+
443
+ return demo
444
+
445
+ # Build two themed experiences as a light/dark separator without custom CSS/JS
446
+ light_demo = build_blocks(gr.themes.Soft())
447
+ dark_demo = build_blocks(gr.themes.Monochrome())
448
 
449
+ app = gr.TabbedInterface([light_demo, dark_demo], ["🌞 Light", "🌙 Dark"])
 
450
 
451
  if __name__ == "__main__":
452
+ app.queue(max_size=20).launch()
requirements.txt CHANGED
@@ -6,6 +6,5 @@ einops
6
  addict
7
  easydict
8
  torchvision
9
- flash-attn==2.7.3; platform_system == "Linux" and platform_machine == "x86_64"
10
  PyMuPDF
11
  hf_transfer
 
6
  addict
7
  easydict
8
  torchvision
 
9
  PyMuPDF
10
  hf_transfer