Spaces:
Running
Running
| import os | |
| import re | |
| import sys | |
| import json | |
| import time | |
| import copy | |
| import base64 | |
| import asyncio | |
| import tempfile | |
| import subprocess | |
| from pathlib import Path | |
| from datetime import datetime | |
| import zipfile | |
| import httpx, aiofiles, os, asyncio | |
| import numpy as np | |
| import gradio as gr | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| from loguru import logger | |
| from openai import OpenAI, AsyncOpenAI | |
| from gradio_pdf import PDF | |
| import certifi | |
| import httpx | |
| import aiohttp | |
| import uuid | |
| import tqdm | |
| import base64, pathlib | |
| from io import BytesIO | |
| from pdf2image import convert_from_bytes, convert_from_path # pip install pdf2image | |
| import requests | |
def setup_poppler_linux():
    """Install the ``poppler-utils`` package with ``apt-get`` on first use.

    The directory ``/tmp/poppler`` serves as an "already installed" marker:
    when it exists, the function returns immediately.  The marker is created
    only after both ``apt-get`` commands have succeeded, so an installation
    that failed part-way is retried on the next call instead of being
    silently skipped.

    Raises:
        subprocess.CalledProcessError: If ``apt-get update`` or
            ``apt-get install`` exits with a non-zero status.
    """
    poppler_dir = "/tmp/poppler"
    if os.path.exists(poppler_dir):
        return
    subprocess.run(["apt-get", "update"], check=True)
    subprocess.run(["apt-get", "install", "-y", "poppler-utils"], check=True)
    # Write the marker last so it only ever records a successful install.
    os.makedirs(poppler_dir, exist_ok=True)
# Runs at import time, before any PDF conversion is attempted.
setup_poppler_linux()

# Prompts offered in the UI's "Prompt" dropdown; the first entry is the
# dropdown's initial value.
preset_prompts = [
    "Please convert the document into Markdown format.",
    "Generate a clean and structured Markdown version of the document.",
    "Transform this content into Markdown with proper headings and bullet points.",
    "Convert the text to Markdown, preserving structure and formatting.",
    "Reformat this document as Markdown with clear sections and lists.",
]
def send_pdf_to_parse(file_path, server_ip, port, route="/upload", api_key=None):
    """Upload a PDF with a blocking multipart POST and return the response.

    The target URL is the module-level ``openai_api_base`` followed by
    *route*.

    NOTE(review): ``server_ip`` and ``port`` are accepted but never read;
    confirm whether the URL was meant to be built from them.

    Args:
        file_path: Path of the PDF to upload.
        server_ip: Unused (see note above).
        port: Unused (see note above).
        route: Path appended to the base URL.
        api_key: Optional token sent as ``Authorization: Bearer <api_key>``.

    Returns:
        The ``requests`` response object for the POST.
    """
    endpoint = f"{openai_api_base}{route}"
    auth_headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    with open(file_path, "rb") as pdf_file:
        upload = {"file": (os.path.basename(file_path), pdf_file, "application/pdf")}
        return requests.post(endpoint, files=upload, headers=auth_headers)
async def send_pdf_async_aiohttp(file_path, server_ip, route="/upload", Authorization=None):
    """Send a PDF asynchronously using aiohttp.

    The file is posted as the multipart field ``file`` to
    ``f"{server_ip}{route}"``.  A message is printed for every completed
    request regardless of its HTTP status, and for every failure.

    Args:
        file_path: Path of the PDF to upload.
        server_ip: Base URL that *route* is appended to.
        route: Path appended to *server_ip*.
        Authorization: Optional token sent as ``Authorization: Bearer <token>``.

    Returns:
        The aiohttp response object, or ``None`` when any exception occurred.
        NOTE(review): the response is returned from inside its
        ``async with`` block, so it has already been released when the caller
        receives it — confirm that no caller needs to read the body.
    """
    target = f"{server_ip}{route}"
    request_headers = (
        {"Authorization": f"Bearer {Authorization}"} if Authorization else {}
    )
    try:
        async with aiohttp.ClientSession() as session:
            with open(file_path, "rb") as pdf_file:
                form = aiohttp.FormData()
                form.add_field(
                    'file',
                    pdf_file,
                    filename=os.path.basename(file_path),
                    content_type='application/pdf',
                )
                async with session.post(target, data=form, headers=request_headers) as response:
                    print(f"PDF发送成功: {file_path}, 状态码: {response.status}")
                    return response
    except Exception as e:
        # Best-effort upload: report the failure and signal it with None.
        print(f"PDF发送失败: {file_path}, 错误: {e}")
        return None
def extract_makrdown(text):
    """Return the contents of the first markdown-tagged code fence in *text*.

    The fence must be opened with three backticks followed by ``markdown``
    and closed with three backticks.  The captured contents are stripped of
    surrounding whitespace.  When no complete fence is present, *text* is
    returned unchanged.
    """
    fenced = re.search(r'```markdown\s*([\s\S]*?)```', text, re.MULTILINE)
    return fenced.group(1).strip() if fenced else text
# Placeholder key handed to the OpenAI client; request() sends the real
# credential per call through an explicit ``Authorization`` header.
openai_api_key = "EMPTY"
# Endpoint and credentials are read from environment variables.
openai_api_base = os.environ.get("openai_api_base")
# NOTE(review): IP and PORT are read here but not referenced elsewhere in
# this part of the file.
IP = os.environ.get("IP")
PORT = os.environ.get("PORT")
Authorization = os.environ.get("Authorization")
# NOTE(review): if ``openai_api_base`` is not set, ``os.environ.get`` returns
# None and the concatenation below raises TypeError at import time.
# NOTE(review): ``verify=False`` disables TLS certificate verification for
# the HTTP client used by this OpenAI client — confirm this is intended.
client = AsyncOpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base + "/v1",
    http_client=httpx.AsyncClient(verify=False)
)
async def request(messages):
    """Stream a chat completion and yield its text fragments as they arrive.

    The request goes through the module-level ``client`` to the model
    ``"Qwen2_5VL"`` with streaming enabled, ``temperature=0.0``,
    ``top_p=0.95`` and at most 4096 completion tokens.  The module-level
    ``Authorization`` value is sent as a bearer token.

    Args:
        messages: Chat messages in the format accepted by
            ``client.chat.completions.create``.

    Yields:
        Each non-empty ``delta.content`` string, in arrival order.
    """
    stream = await client.chat.completions.create(
        messages=messages,
        extra_headers={"Authorization": f"Bearer {Authorization}"},
        model="Qwen2_5VL",
        max_completion_tokens=4096,
        stream=True,
        temperature=0.0,
        top_p=0.95,
    )
    async for chunk in stream:
        # A chunk may carry no choices at all (e.g. a usage-only chunk);
        # indexing it would raise IndexError.
        if not chunk.choices:
            continue
        choice = chunk.choices[0]
        content = choice.delta.content
        if content:
            # Emit the text before looking at finish_reason so a fragment
            # delivered together with the finish marker is not lost.
            yield content
        if choice.finish_reason is not None:
            print(f"end reason = {choice.finish_reason}")
            break
def images_to_pdf(img_paths, pdf_path):
    """Combine one or more image files into a single PDF, one image per page.

    Images whose mode is ``"RGBA"`` or ``"P"`` are converted to ``"RGB"``
    before saving.  The PDF is written with ``resolution=300.0`` and the
    parent directory of *pdf_path* is created when missing.  Every image
    object opened here is closed before returning, including when an error
    interrupts the process.

    Args:
        img_paths: A single path (``str`` or ``Path``) or a sequence of paths.
        pdf_path: Destination of the PDF file.

    Returns:
        *pdf_path* as a ``Path``.

    Raises:
        ValueError: If *img_paths* is empty.
        FileNotFoundError: If any entry of *img_paths* is not an existing file.
    """
    if isinstance(img_paths, (str, Path)):
        img_paths = [img_paths]
    if not img_paths:
        raise ValueError("img_paths is empty")

    opened = []  # every image object created below; closed in ``finally``
    pages = []
    try:
        for p in img_paths:
            p = Path(p)
            if not p.is_file():
                raise FileNotFoundError(p)
            img = Image.open(p)
            opened.append(img)
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
                opened.append(img)
            pages.append(img)

        pdf_path = Path(pdf_path)
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        pages[0].save(pdf_path,
                      save_all=True,
                      append_images=pages[1:],
                      resolution=300.0)
    finally:
        for img in opened:
            img.close()
    return pdf_path
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode("utf-8")
def build_message(image_path, prompt):
    """Build the chat messages for one image plus a text prompt.

    The image is embedded as a base64 data URL.  Its MIME type is guessed
    from the file extension so PNG inputs are labelled ``image/png`` rather
    than always ``image/jpeg``.  When the extension is unknown or is not an
    image type, ``image/jpeg`` is used as the fallback.

    Args:
        image_path: Path of the image file to embed.
        prompt: Instruction text sent alongside the image.

    Returns:
        A two-element list: a fixed system message followed by a user message
        whose content holds the image part and the text part.
    """
    import mimetypes  # local import keeps this block self-contained

    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    content = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image(image_path)}"
            }
        },
        {"type": "text", 'text': prompt}
    ]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': content}
    ]
    return messages
def download_markdown_file(md_text):
    """Write *md_text* to a new file under ``downloads/`` and return its path.

    The file is named ``markdown_<8 hex chars>.md`` using a random UUID and
    is written as UTF-8.  The ``downloads`` directory is created when absent.

    Returns:
        The path of the written file, as a string.
    """
    target = Path("downloads") / f"markdown_{uuid.uuid4().hex[:8]}.md"
    target.parent.mkdir(exist_ok=True)
    with target.open("w", encoding="utf-8") as out:
        out.write(md_text)
    return str(target)
async def doc_parser(doc_path, prompt):
    """Parse a PDF or image with the model, streaming Markdown as it arrives.

    A PDF is rendered to one PNG per page at 300 dpi inside a temporary
    directory, and one request is built per page.  Any other file is sent as
    a single image.  Pages are processed one after another.

    Args:
        doc_path: Path of the uploaded PDF or image.
        prompt: Instruction text sent with every page.

    Yields:
        ``(markdown, raw)`` tuples.  While a page is streaming, both values
        cover only the page in progress: ``raw`` is the text received so far
        and ``markdown`` is ``extract_makrdown(raw)``.  The final tuple holds
        all pages, with the Markdown joined by ``"\\n---\\n"`` and the raw
        text joined by ``"\\n\\n"``.

    Raises:
        gr.Error: If *doc_path* is ``None`` (nothing has been uploaded).
        FileNotFoundError: If *doc_path* is not an existing file.
    """
    if doc_path is None:
        # Show a readable message instead of the TypeError Path(None) raises.
        raise gr.Error("Please upload a PDF or image before parsing.")
    doc_path = Path(doc_path)
    if not doc_path.is_file():
        raise FileNotFoundError(doc_path)

    queries = []
    if doc_path.suffix.lower() == ".pdf":
        # build_message() embeds the image bytes immediately, so the page
        # files are only needed while the requests are being built.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            rendered_pages = convert_from_path(doc_path, dpi=300)
            for idx, page in enumerate(rendered_pages, start=1):
                img_path = tmpdir / f"page_{idx}.png"
                page.save(img_path, "PNG")
                queries.append(build_message(img_path, prompt))
    else:
        queries.append(build_message(doc_path, prompt))

    all_pages = []
    all_pages_raw = []
    for query in queries:
        raw = ""
        async for chunk in request(query):
            raw += chunk
            yield extract_makrdown(raw), raw
        all_pages.append(extract_makrdown(raw))
        all_pages_raw.append(raw)
    print(all_pages)
    yield "\n---\n".join(all_pages), "\n\n".join(all_pages_raw)
def compress_directory_to_zip(directory_path, output_zip_path):
    """Zip every file under *directory_path* into *output_zip_path*.

    Files are stored with DEFLATE compression under names relative to
    *directory_path*.

    Returns:
        ``0`` on success, or ``-1`` when any exception occurred (the
        exception is logged).
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for folder, _subdirs, filenames in os.walk(directory_path):
                for filename in filenames:
                    full_path = os.path.join(folder, filename)
                    archive.write(full_path, os.path.relpath(full_path, directory_path))
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
# Delimiter pairs passed to the Markdown component as ``latex_delimiters``;
# ``display`` is True for the ``$$`` and ``\[ \]`` pairs and False for the
# ``$`` and ``\( \)`` pairs.
latex_delimiters = [
    {'left': opening, 'right': closing, 'display': is_display}
    for opening, closing, is_display in (
        ('$$', '$$', True),
        ('$', '$', False),
        ('\\(', '\\)', False),
        ('\\[', '\\]', True),
    )
]
def check_prompt(prompt):
    """Return *prompt* unchanged, rejecting empty or whitespace-only values.

    Raises:
        gr.Error: If *prompt* is falsy or contains only whitespace.
    """
    has_text = bool(prompt) and prompt.strip() != ""
    if has_text:
        return prompt
    raise gr.Error("Please select or enter a prompt before parsing.")
def to_file(image_path):
    """Map the "Academic_Papers.png" example thumbnail to its PDF file.

    Paths ending in ``Academic_Papers.png`` have that name replaced by
    ``Academic_Papers.pdf``; every other path is returned unchanged.
    """
    thumbnail_name = "Academic_Papers.png"
    if not image_path.endswith(thumbnail_name):
        return image_path
    return image_path.replace(thumbnail_name, "Academic_Papers.pdf")
def render_img(b64_list, idx, scale):
    """Render the HTML for the image at index *idx*, zoomed by *scale*.

    Args:
        b64_list: Image sources (data URLs) to choose from.
        idx: Index of the image to show; wrapped with modulo, so values
            outside the list (including negatives) cycle around.
        scale: Zoom factor; the image width is ``scale * 100`` percent of the
            container and the height follows automatically.

    Returns:
        An HTML string.  An empty *b64_list* yields a grey placeholder
        paragraph asking the user to upload an image first.  For
        ``scale <= 1`` the image is centred horizontally inside a scrollable
        800px-high box; for larger scales it is placed in the same kind of
        box without centring and with ``max-width:none``.
    """
    if not b64_list:
        return "<p style='color:gray'>请先上传图片</p>"

    image_src = b64_list[idx % len(b64_list)]
    # Width is set as a percentage; height is ``auto`` to keep the ratio.
    width_pct = scale * 100

    if scale > 1:
        # ---------- enlarged mode ----------
        return (
            f'<div style="overflow:auto;border:1px solid #ccc;'
            f'width:100%;height:800px;">'
            f' <img src="{image_src}" '
            f' style="width:{width_pct}%;max-width:none;'
            f' height:auto;display:block;" />'
            f'</div>'
        )

    # ---------- centred mode ----------
    return f"""
<div style="
width:100%;
height:800px;
overflow:auto;
border:1px solid #ccc;
">
<div style="
min-width:100%; /* 保证外层 div 至少跟容器一样宽 */
display:flex;
justify-content:center; /* 小图水平居中 */
">
<img src="{image_src}" style="
width:{width_pct}%;
height:auto;
display:block;
">
</div>
</div>
"""
def _pages_to_data_urls(pages):
    """Encode each rendered PDF page image as a PNG data URL."""
    urls = []
    for page in pages:
        buf = BytesIO()
        page.save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode()
        urls.append(f"data:image/png;base64,{b64}")
    return urls


def _image_bytes_to_data_url(raw_bytes, suffix):
    """Wrap raw image bytes in a data URL typed from the file suffix."""
    subtype = suffix[1:]
    if subtype == "jpg":
        # "image/jpg" is not a registered MIME type; the JPEG type is "jpeg".
        subtype = "jpeg"
    b64 = base64.b64encode(raw_bytes).decode()
    return f"data:image/{subtype};base64,{b64}"


def files_to_b64(file, pdf_dpi: int = 200):
    """Convert an uploaded PDF or image into a list of image data URLs.

    Args:
        file: Either an object exposing ``data`` (raw bytes) and ``name``
            attributes, or a filesystem path.
        pdf_dpi: Resolution used when rendering PDF pages.

    Returns:
        A list of ``data:image/...;base64,...`` strings: one PNG entry per
        page for a PDF, or a single entry for an image.  The image subtype is
        the lower-cased file extension, with ``jpg`` normalised to ``jpeg``.
    """
    if hasattr(file, "data"):
        suffix = pathlib.Path(file.name).suffix.lower()
        if suffix == ".pdf":
            return _pages_to_data_urls(convert_from_bytes(file.data, dpi=pdf_dpi))
        return [_image_bytes_to_data_url(file.data, suffix)]

    path = pathlib.Path(file)
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        return _pages_to_data_urls(convert_from_path(str(path), dpi=pdf_dpi))
    return [_image_bytes_to_data_url(path.read_bytes(), suffix)]
# Strong references to in-flight upload tasks.  The event loop keeps only
# weak references to tasks, so a fire-and-forget task with no other
# reference can be garbage-collected before it finishes.
_upload_tasks = set()


async def process_file(file_path):
    """Prepare an uploaded file as a PDF and start uploading it in the background.

    Non-PDF inputs are converted with ``images_to_pdf`` into a PDF written
    next to the original (same name, ``.pdf`` suffix).  The ``.pdf`` check is
    case-insensitive, matching ``upload_handler``.  The upload is scheduled
    as a background task and is not awaited.

    Args:
        file_path: Path of the uploaded file, or ``None``.

    Returns:
        The PDF path as a string, or ``None`` when *file_path* is ``None``.
    """
    if file_path is None:
        return None
    if str(file_path).lower().endswith(".pdf"):
        tmp_file_path = file_path
    else:
        tmp_file_path = Path(file_path).with_suffix(".pdf")
        images_to_pdf(file_path, tmp_file_path)
    task = asyncio.create_task(
        send_pdf_async_aiohttp(tmp_file_path, server_ip=openai_api_base, Authorization=Authorization)
    )
    _upload_tasks.add(task)
    # Drop the reference once the upload has finished.
    task.add_done_callback(_upload_tasks.discard)
    return str(tmp_file_path)
if __name__ == '__main__':
    # Build the Gradio UI: an upload/preview column and a results column,
    # followed by the event wiring, then launch the app.
    with gr.Blocks() as demo:
        with gr.Row():
            # First column: file upload, prompt selection, actions and the
            # page preview with its zoom and navigation controls.
            with gr.Column(variant='panel', scale=5):
                file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'], type="filepath")
                prompts = gr.Dropdown(
                    choices=preset_prompts,
                    label="Prompt",
                    info="Enter or select prompts...",
                    value=preset_prompts[0],
                    multiselect=False,
                    interactive=True,
                    allow_custom_value=True,
                )
                with gr.Row():
                    change_bu = gr.Button('Parse')
                    clear_bu = gr.ClearButton(value='Clear')
                zoom = gr.Slider(0.5, 3, value=1, step=0.1, label="Image Scale")
                with gr.Row():
                    prev_btn = gr.Button("⬅️ Pre")
                    next_btn = gr.Button("Next ➡️")
                viewer = gr.HTML()
                example_root = os.path.join(os.path.dirname(__file__), 'examples')
                # NOTE(review): ``images`` is built here but not used anywhere
                # in the visible code; ``os.listdir`` raises if the
                # ``examples`` directory is missing.
                images = [
                    os.path.join(example_root, f)
                    for f in os.listdir(example_root)
                    if f.lower().endswith(('png', 'jpg', 'jpeg'))
                ]
            # Second column: example shortcuts, download controls and the
            # parse output tabs.
            with gr.Column(variant='panel', scale=5):
                with gr.Accordion("Examples", open=True):
                    # NOTE(review): ``example_root`` is rebound here to a path
                    # relative to the working directory, unlike the
                    # ``__file__``-based value assigned above.
                    example_root = "examples"
                    file_path = [
                        os.path.join(example_root, f)
                        for f in ["Financial_Reports.png", "Books.png", "Magazines.png", "Academic_Papers.png"]
                    ]
                    with gr.Row():
                        for i, label in enumerate(["Financial Reports(IMG)", "Books(IMG)", "Magazines(IMG)", "Academic Papers(PDF)"]):
                            with gr.Column(scale=1, min_width=120):
                                gr.Image(
                                    value=file_path[i],
                                    width=120,
                                    height=90,
                                    show_label=False,
                                    show_download_button=False
                                )
                                # Clicking loads the example into the upload
                                # box; to_file() swaps the Academic_Papers
                                # thumbnail for its PDF.
                                gr.Button(label).click(fn=to_file, inputs=gr.State(file_path[i]), outputs=file)
                download_btn = gr.Button("⬇️ Generate download link", size="sm")
                # Hidden until a download file has been generated.
                output_file = gr.File(label='Parse result', interactive=False, elem_id="down-file-box",visible=False)
                gr.HTML("""
<style>
#down-file-box {
max-height: 300px;
}
</style>
""")
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
                                         latex_delimiters=latex_delimiters,
                                         line_breaks=True)
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)
        # Per-session preview state: the list of page data URLs and the index
        # of the page currently shown.
        img_list_state = gr.State([])
        idx_state = gr.State(0)

        async def upload_handler(files):
            """Prepare the preview for a newly selected file.

            Returns the page data URLs, a reset page index of 0 and the HTML
            for the first page; a cleared upload yields an empty preview.
            """
            if files is None:
                return [], 0, ""
            if files.lower().endswith(".pdf"):
                # Fire-and-forget upload of the PDF.
                # NOTE(review): the task returned by create_task is not
                # stored; the asyncio docs recommend keeping a reference so
                # the task cannot be garbage-collected mid-execution.
                asyncio.create_task(send_pdf_async_aiohttp(files, server_ip=openai_api_base, Authorization=Authorization))
            b64s = files_to_b64(files)
            return b64s, 0, render_img(b64s, 0, 1)
        file.change(
            upload_handler,
            inputs=file,
            outputs=[img_list_state, idx_state, viewer],
        ).then(
            lambda: gr.update(value=1),  # no inputs; simply reset zoom to 1
            None,  # inputs=None
            zoom  # outputs=[zoom]
        )

        def show_prev(b64s, idx, scale):
            """Step back one page; render_img() wraps the index with modulo."""
            idx -= 1
            return idx, render_img(b64s, idx, scale)
        prev_btn.click(
            show_prev,
            inputs=[img_list_state, idx_state, zoom],
            outputs=[idx_state, viewer],
        )

        def show_next(b64s, idx, scale):
            """Step forward one page; render_img() wraps the index with modulo."""
            idx += 1
            return idx, render_img(b64s, idx, scale)
        next_btn.click(
            show_next,
            inputs=[img_list_state, idx_state, zoom],
            outputs=[idx_state, viewer],
        )
        # Re-render the current page whenever the zoom slider changes.
        zoom.change(
            lambda b64s, idx, scale: render_img(b64s, idx, scale),
            inputs=[img_list_state, idx_state, zoom],
            outputs=viewer,
        )
        # Parse: validate the prompt, hide any previous download file, then
        # stream the parser output into the two Markdown views.
        change_bu.click(
            fn=check_prompt,
            inputs=prompts,
            outputs=prompts
        ).then(
            lambda f: gr.update(visible=False),
            inputs=output_file,
            outputs=output_file
        ).then(
            fn=doc_parser,
            inputs=[file, prompts],
            outputs=[md, md_text]
        )
        clear_bu.add([file, md, md_text])
        # Download: write the Markdown text to a file, then reveal the file
        # component.
        download_btn.click(
            fn=download_markdown_file,
            inputs=md_text,
            outputs=output_file
        ).then(
            lambda f: gr.update(visible=True),
            inputs=output_file,
            outputs=output_file
        )
    demo.launch(server_name='0.0.0.0',share=True)