prithivMLmods committed on
Commit 3e612b3 · verified · 1 Parent(s): 4ad0254

Delete app.py

Files changed (1)
  1. app.py +0 -339
app.py DELETED
@@ -1,339 +0,0 @@
- import os
- from typing import Iterable, Optional, Tuple, Dict, Any, List
- import spaces
- import time
- import click
- import gradio as gr
- from io import BytesIO
- from PIL import Image
- from loguru import logger
- from pathlib import Path
- import torch
- from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoModel
- import fitz  # PyMuPDF
- import html2text
- import markdown
- import tempfile
-
- from gradio.themes import Soft
- from gradio.themes.utils import colors, fonts, sizes
-
- # --- Theme and CSS Definition ---
-
- colors.steel_blue = colors.Color(
-     name="steel_blue",
-     c50="#EBF3F8", c100="#D3E5F0", c200="#A8CCE1", c300="#7DB3D2",
-     c400="#529AC3", c500="#4682B4", c600="#3E72A0", c700="#36638C",
-     c800="#2E5378", c900="#264364", c950="#1E3450",
- )
-
- class SteelBlueTheme(Soft):
-     def __init__(
-         self,
-         *,
-         primary_hue: colors.Color | str = colors.gray,
-         secondary_hue: colors.Color | str = colors.steel_blue,
-         neutral_hue: colors.Color | str = colors.slate,
-         text_size: sizes.Size | str = sizes.text_lg,
-         font: fonts.Font | str | Iterable[fonts.Font | str] = (
-             fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
-         ),
-         font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
-             fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
-         ),
-     ):
-         super().__init__(
-             primary_hue=primary_hue, secondary_hue=secondary_hue, neutral_hue=neutral_hue,
-             text_size=text_size, font=font, font_mono=font_mono,
-         )
-         super().set(
-             background_fill_primary="*primary_50",
-             background_fill_primary_dark="*primary_900",
-             body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
-             body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
-             button_primary_text_color="white",
-             button_primary_text_color_hover="white",
-             button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
-             button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
-             button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
-             button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
-             slider_color="*secondary_500",
-             slider_color_dark="*secondary_600",
-             block_title_text_weight="600",
-             block_border_width="3px",
-             block_shadow="*shadow_drop_lg",
-             button_primary_shadow="*shadow_drop_lg",
-             button_large_padding="11px",
-             color_accent_soft="*primary_100",
-             block_label_background_fill="*primary_200",
-         )
-
- steel_blue_theme = SteelBlueTheme()
-
- # --- Model and App Logic ---
-
- pdf_suffixes = [".pdf"]
- image_suffixes = [".png", ".jpeg", ".jpg"]
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- logger.info(f"Using device: {device}")
-
- # Model 1: Logics-Parsing
- MODEL_ID_1 = "Logics-MLLM/Logics-Parsing"
- logger.info(f"Loading model 1: {MODEL_ID_1}")
- processor_1 = AutoProcessor.from_pretrained(MODEL_ID_1, trust_remote_code=True)
- model_1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_1,
-     trust_remote_code=True,
-     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
- ).to(device).eval()
- logger.info(f"Model '{MODEL_ID_1}' loaded successfully.")
-
- # Model 2: DeepSeek-OCR
- logger.info("Loading model and tokenizer for DeepSeek-OCR...")
- model_name_2 = "deepseek-ai/DeepSeek-OCR"
- tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2, trust_remote_code=True)
- model_2 = AutoModel.from_pretrained(
-     model_name_2,
-     _attn_implementation="flash_attention_2",
-     torch_dtype=torch.bfloat16,  # flash-attention kernels require fp16/bf16 weights
-     trust_remote_code=True,
- ).eval()
- logger.info("✅ DeepSeek-OCR model loaded successfully.")
-
-
- @spaces.GPU
- def parse_page(image: Image.Image, model_name: str) -> str:
-     if model_name == "Logics-Parsing":
-         current_processor, current_model = processor_1, model_1
-         messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
-         prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
-
-         with torch.no_grad():
-             generated_ids = current_model.generate(**inputs, max_new_tokens=2048, do_sample=False)
-
-         # Slice off the prompt tokens so only the newly generated text is decoded
-         generated_ids = generated_ids[:, inputs['input_ids'].shape[1]:]
-         output_text = current_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-         return output_text
-
-     elif model_name == "DeepSeek-OCR":
-         # Move model to the correct device for inference
-         model_2.to(device)
-
-         # The custom DeepSeek-OCR code (loaded via trust_remote_code) exposes an
-         # `infer` helper that takes an image *file path*, so the page is written
-         # to a temporary PNG first. The prompt follows the model card's usage
-         # example; `infer` is assumed here to return the decoded text.
-         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-             image.convert("RGB").save(tmp.name)
-             tmp_path = tmp.name
-
-         try:
-             with torch.no_grad(), tempfile.TemporaryDirectory() as out_dir:
-                 ocr_text = model_2.infer(
-                     tokenizer_2,
-                     prompt="<image>\n<|grounding|>Convert the document to markdown.",
-                     image_file=tmp_path,
-                     output_path=out_dir,
-                     base_size=1024,
-                     image_size=640,
-                     crop_mode=True,
-                     save_results=False,
-                 )
-         finally:
-             os.unlink(tmp_path)
-
-         # This model returns plain text, so we wrap it in basic HTML for consistency
-         html_output = "".join(f"<p>{line}</p>" for line in ocr_text.split('\n'))
-         return html_output
-
-     else:
-         raise ValueError(f"Unknown model choice: {model_name}")
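-
- # --- File handling and session state ---
- # Images are used as-is; PDFs are rasterized page-by-page with PyMuPDF.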
-
- def convert_file_to_images(file_path: str, dpi: int = 200) -> List[Image.Image]:
-     images = []
-     file_ext = Path(file_path).suffix.lower()
-
-     if file_ext in image_suffixes:
-         images.append(Image.open(file_path).convert("RGB"))
-         return images
-
-     if file_ext not in pdf_suffixes:
-         raise ValueError(f"Unsupported file type: {file_ext}")
-
-     try:
-         pdf_document = fitz.open(file_path)
-         zoom = dpi / 72.0  # PDF points are 72 per inch, so this renders at the requested DPI
-         mat = fitz.Matrix(zoom, zoom)
-         for page_num in range(len(pdf_document)):
-             page = pdf_document.load_page(page_num)
-             pix = page.get_pixmap(matrix=mat)
-             img_data = pix.tobytes("png")
-             images.append(Image.open(BytesIO(img_data)).convert("RGB"))
-         pdf_document.close()
-     except Exception as e:
-         logger.error(f"Failed to convert PDF using PyMuPDF: {e}")
-         raise
-     return images
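-
- # Session state keys: rendered page images, total page count, index of the page
- # currently in view, and the per-page parse results.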
-
- def get_initial_state() -> Dict[str, Any]:
-     return {"pages": [], "total_pages": 0, "current_page_index": 0, "page_results": []}
-
- def load_and_preview_file(file_path: Optional[str]) -> Tuple[Optional[Image.Image], str, Dict[str, Any]]:
-     state = get_initial_state()
-     if not file_path:
-         return None, '<div class="page-info">No file loaded</div>', state
-
-     try:
-         pages = convert_file_to_images(file_path)
-         if not pages:
-             return None, '<div class="page-info">Could not load file</div>', state
-
-         state["pages"] = pages
-         state["total_pages"] = len(pages)
-         page_info_html = f'<div class="page-info">Page 1 / {state["total_pages"]}</div>'
-         return pages[0], page_info_html, state
-     except Exception as e:
-         logger.error(f"Failed to load and preview file: {e}")
-         return None, '<div class="page-info">Failed to load preview</div>', state
-
- async def process_all_pages(state: Dict[str, Any], model_choice: str, progress=gr.Progress(track_tqdm=True)):
-     if not state or not state["pages"]:
-         error_msg = "<h3>Please upload a file first.</h3>"
-         return error_msg, "", "", None, "Error: No file to process", state
-
-     logger.info(f'Processing {state["total_pages"]} pages with model: {model_choice}')
-     start_time = time.time()
-
-     try:
-         page_results = []
-         for page_img in progress.tqdm(state["pages"], desc="Processing Pages"):
-             html_result = parse_page(page_img, model_choice)
-             page_results.append({'raw_html': html_result})
-
-         state["page_results"] = page_results
-
-         full_html_content = "\n\n".join([f'<!-- Page {i+1} -->\n{res["raw_html"]}' for i, res in enumerate(page_results)])
-         full_markdown = html2text.html2text(full_html_content)
-         with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
-             f.write(full_markdown)
-             md_path = f.name
-
-         parsing_time = time.time() - start_time
-         cost_time_str = f'Total processing time: {parsing_time:.2f}s'
-
-         current_page_results = get_page_outputs(state)
-
-         return *current_page_results, md_path, cost_time_str, state
-
-     except Exception as e:
-         logger.error(f"Parsing failed: {e}", exc_info=True)
-         error_html = f"<h3>An error occurred during processing:</h3><p>{str(e)}</p>"
-         return error_html, "", "", None, f"Error: {str(e)}", state
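-
- # Pagination: the page index is clamped to [0, total_pages - 1] and the three
- # output views are re-rendered for the page now in view.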
-
- def navigate_page(direction: str, state: Dict[str, Any]):
-     if not state or not state["pages"]:
-         return None, '<div class="page-info">No file loaded</div>', *get_page_outputs(state), state
-
-     current_index = state["current_page_index"]
-     total_pages = state["total_pages"]
-
-     if direction == "prev":
-         new_index = max(0, current_index - 1)
-     elif direction == "next":
-         new_index = min(total_pages - 1, current_index + 1)
-     else:
-         new_index = current_index
-
-     state["current_page_index"] = new_index
-
-     image_preview = state["pages"][new_index]
-     page_info_html = f'<div class="page-info">Page {new_index + 1} / {total_pages}</div>'
-
-     page_outputs = get_page_outputs(state)
-
-     return image_preview, page_info_html, *page_outputs, state
-
- def get_page_outputs(state: Dict[str, Any]) -> Tuple[str, str, str]:
-     if not state or not state.get("page_results"):
-         return "<h3>Process the document to see results.</h3>", "", ""
-
-     index = state["current_page_index"]
-     if index >= len(state["page_results"]):
-         return "<h3>Result not available for this page.</h3>", "", ""
-
-     result = state["page_results"][index]
-     raw_html = result['raw_html']
-
-     md_source = html2text.html2text(raw_html)
-     md_render = markdown.markdown(md_source, extensions=['fenced_code', 'tables'])
-
-     return md_render, md_source, raw_html
-
- def clear_all():
-     return None, None, "<h3>Results will be displayed here after processing.</h3>", "", "", None, "", '<div class="page-info">No file loaded</div>', get_initial_state()
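-
- # --- Gradio UI ---
- # Each event's `outputs` list below must line up positionally with its
- # handler's return values.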
-
- @click.command()
- def main():
-     css = """
-     .main-container { max-width: 1400px; margin: 0 auto; }
-     .header-text { text-align: center; margin-bottom: 20px; }
-     .page-info { text-align: center; padding: 8px 16px; font-weight: bold; margin: 10px 0; }
-     """
-     with gr.Blocks(theme=steel_blue_theme, css=css, title="Logics-Parsing Demo") as demo:
-         app_state = gr.State(value=get_initial_state())
-
-         gr.HTML("""
-         <div class="header-text">
-             <h1>📄 Multimodal: VLM Parsing & OCR</h1>
-             <p style="font-size: 1.1em;">Advanced vision-language models for parsing documents and images into clean Markdown and HTML</p>
-             <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
-                 <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
-                 <a href="https://github.com/PRITHIVSAKTHIUR/VLM-Parsing" target="_blank" style="text-decoration: none; font-weight: 500;">💻 GitHub</a>
-                 <a href="https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending" target="_blank" style="text-decoration: none; font-weight: 500;">📝 Multimodal VLMs</a>
-             </div>
-         </div>
-         """)
-
-         with gr.Row(elem_classes=["main-container"]):
-             with gr.Column(scale=1):
-                 model_choice = gr.Dropdown(choices=["Logics-Parsing", "DeepSeek-OCR"], label="Select Model", value="Logics-Parsing")
-                 file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
-
-                 process_btn = gr.Button("🚀 Process Document", variant="primary", size="lg")
-                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-
-                 image_preview = gr.Image(label="Preview", type="pil", interactive=False, height=320)
-
-                 with gr.Row():
-                     prev_page_btn = gr.Button("◀ Previous")
-                     page_info = gr.HTML('<div class="page-info">No file loaded</div>')
-                     next_page_btn = gr.Button("Next ▶")
-
-                 example_root = "examples"
-                 if os.path.exists(example_root) and os.path.isdir(example_root):
-                     example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
-                     if example_files:
-                         gr.Examples(examples=example_files, inputs=file_input, label="Examples")
-
-                 with gr.Accordion("Download & Details", open=False):
-                     output_file = gr.File(label='Download Markdown Result', interactive=False)
-                     cost_time = gr.Textbox(label='Time Cost', interactive=False)
-
-             with gr.Column(scale=2):
-                 with gr.Tabs():
-                     with gr.Tab("Markdown Source"):
-                         md_source_output = gr.Code(language="markdown", label="Markdown Source")
-                     with gr.Tab("Rendered Markdown"):
-                         md_render_output = gr.Markdown(label='Markdown Rendering')
-                     with gr.Tab("Generated HTML"):
-                         raw_html_output = gr.Code(language="html", label="Generated HTML")
-
-         file_input.change(fn=load_and_preview_file, inputs=file_input, outputs=[image_preview, page_info, app_state], show_progress="full")
-
-         process_btn.click(fn=process_all_pages, inputs=[app_state, model_choice], outputs=[md_render_output, md_source_output, raw_html_output, output_file, cost_time, app_state], show_progress="full")
-
-         prev_page_btn.click(fn=lambda s: navigate_page("prev", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
-
-         next_page_btn.click(fn=lambda s: navigate_page("next", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
-
-         clear_btn.click(fn=clear_all, outputs=[file_input, image_preview, md_render_output, md_source_output, raw_html_output, output_file, cost_time, page_info, app_state])
-
-     demo.queue().launch(debug=True, show_error=True)
-
- if __name__ == '__main__':
-     if not os.path.exists("examples"):
-         os.makedirs("examples")
-         logger.info("Created 'examples' directory. Please add some sample PDF/image files there.")
-     main()