Spaces:
Running
Running
| import os | |
| import re | |
| import sys | |
| import json | |
| import time | |
| import copy | |
| import base64 | |
| import asyncio | |
| import tempfile | |
| import subprocess | |
| from pathlib import Path | |
| from datetime import datetime | |
| import zipfile | |
| import httpx, aiofiles, os, asyncio | |
| import numpy as np | |
| import gradio as gr | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| from loguru import logger | |
| from openai import OpenAI, AsyncOpenAI | |
| from gradio_pdf import PDF | |
| import aiohttp | |
| import uuid | |
| import tqdm | |
| import requests | |
def setup_poppler_linux():
    """Ensure the poppler command-line tools used by pdf2image are installed.

    The install is skipped when ``pdftoppm`` and ``pdfinfo`` are already on
    PATH. Otherwise ``poppler-utils`` is installed through ``apt-get``.

    Raises:
        subprocess.CalledProcessError: if an ``apt-get`` command fails.
        FileNotFoundError: if ``apt-get`` itself is not available.
    """
    import shutil

    poppler_dir = "/tmp/poppler"
    # Kept for parity with the previous behaviour, which always created it.
    os.makedirs(poppler_dir, exist_ok=True)

    # Probe for the real binaries instead of trusting a marker directory:
    # a marker survives a failed install and is absent on images that
    # already ship poppler.
    if shutil.which("pdftoppm") and shutil.which("pdfinfo"):
        return

    subprocess.run(["apt-get", "update"], check=True)
    subprocess.run(["apt-get", "install", "-y", "poppler-utils"], check=True)
# Install the PDF rendering tools at import time, before any request is served.
setup_poppler_linux()

# Ready-made instructions offered in the prompt dropdown; the UI uses the
# first entry as the default selection.
preset_prompts = [
    "Please convert the document into Markdown format.",
    "Generate a clean and structured Markdown version of the document.",
    "Transform this content into Markdown with proper headings and bullet points.",
    "Convert the text to Markdown, preserving structure and formatting.",
    "Reformat this document as Markdown with clear sections and lists.",
]
def send_pdf_to_parse(file_path, server_ip, port, route="/upload", api_key=None):
    """Upload a PDF to the parsing server with a blocking multipart POST.

    Args:
        file_path: Path of the PDF file to upload.
        server_ip: Either a full base URL (contains ``://``), which is used
            verbatim, or a bare host/IP, which is combined with ``port``.
            When empty, the module-level ``openai_api_base`` is used, which
            is what this function always did before.
        port: Port appended to a bare host; ignored for full base URLs.
        route: Path appended to the base URL.
        api_key: Optional bearer token sent in the ``Authorization`` header.

    Returns:
        The ``requests.Response`` returned by the server.
    """
    # Previously the server_ip/port arguments were silently ignored in favour
    # of the global base URL; honour them, keeping the global as a fallback.
    if not server_ip:
        base_url = openai_api_base
    elif "://" in server_ip:
        base_url = server_ip
    elif port:
        base_url = f"http://{server_ip}:{port}"
    else:
        base_url = f"http://{server_ip}"
    url = f"{base_url}{route}"

    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    with open(file_path, "rb") as f:
        files = {"file": (os.path.basename(file_path), f, "application/pdf")}
        response = requests.post(url, files=files, headers=headers)
    return response
async def send_pdf_async_aiohttp(file_path, server_ip, route="/upload", Authorization=None):
    """Upload a PDF asynchronously with aiohttp (best effort).

    Args:
        file_path: Path of the PDF file to upload.
        server_ip: Base URL of the server; ``route`` is appended to it as is.
        route: Path appended to ``server_ip``.
        Authorization: Optional bearer token for the ``Authorization`` header.

    Returns:
        The aiohttp response object, or ``None`` when the upload raised.
        The response is returned after its context manager has exited, so
        callers should rely only on metadata such as ``status``.
    """
    url = f"{server_ip}{route}"
    headers = {}
    if Authorization:
        headers["Authorization"] = f"Bearer {Authorization}"
    try:
        # The file must stay open for the whole request because aiohttp
        # streams the multipart field from the handle.
        with open(file_path, "rb") as f:
            data = aiohttp.FormData()
            data.add_field(
                'file',
                f,
                filename=os.path.basename(file_path),
                content_type='application/pdf',
            )
            async with aiohttp.ClientSession() as session:
                async with session.post(url, data=data, headers=headers) as response:
                    # Only report success for non-error statuses; a 4xx/5xx
                    # answer used to be logged as a successful upload.
                    if response.status >= 400:
                        print(f"PDF发送失败: {file_path}, 状态码: {response.status}")
                    else:
                        print(f"PDF发送成功: {file_path}, 状态码: {response.status}")
                    return response
    except Exception as e:
        # Deliberately broad: this runs as a fire-and-forget background
        # upload and must never take the UI handler down.
        print(f"PDF发送失败: {file_path}, 错误: {e}")
        return None
def extract_makrdown(text):
    """Return the body of the first ```markdown fenced block in *text*.

    If the fence is opened but not closed yet (partial streaming output, or
    a reply truncated at the token limit), everything after the opening
    fence is returned so it renders as Markdown rather than as a code block.
    When *text* contains no ```markdown fence it is returned unchanged.
    """
    # The lazy body stops at the first closing fence, or at end of input
    # when the fence has not been closed.
    m = re.search(r'```markdown\s*([\s\S]*?)(?:```|\Z)', text)
    if m:
        return m.group(1).strip()
    return text
# The inference server does not validate the OpenAI key; authentication is
# done with the separate bearer token below.
openai_api_key = "EMPTY"

# Deployment settings, all read from environment variables.
openai_api_base = os.environ.get("openai_api_base")
IP = os.environ.get("IP")
PORT = os.environ.get("PORT")
Authorization = os.environ.get("Authorization")

# Fail with an actionable message instead of "NoneType + str" when the base
# URL has not been configured.
if not openai_api_base:
    raise RuntimeError(
        "Environment variable 'openai_api_base' must be set to the base URL "
        "of the inference server."
    )

# Shared async client for the OpenAI-compatible endpoint; a trailing slash on
# the configured base URL is dropped so the path never becomes '//v1'.
client = AsyncOpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base.rstrip("/") + "/v1",
)
async def request(messages):
    """Stream a chat completion and yield its text pieces as they arrive.

    Args:
        messages: Chat messages in the OpenAI chat-completions format.

    Yields:
        Each non-empty ``delta.content`` string, in order.
    """
    stream = await client.chat.completions.create(
        messages=messages,
        extra_headers={
            "Authorization": f"Bearer {Authorization}"
        },
        model="Qwen2_5VL",
        max_completion_tokens=4096,
        stream=True,
        temperature=0.0,
        top_p=0.95,
    )
    async for chunk in stream:
        # Some servers emit chunks without choices (e.g. usage-only chunks).
        if not chunk.choices:
            continue
        choice = chunk.choices[0]
        content = choice.delta.content
        # Emit the text first so a final chunk that carries both content and
        # a finish reason does not lose its content.
        if content:
            yield content
        if choice.finish_reason is not None:
            print(f"end reason = {choice.finish_reason}")
            break
def images_to_pdf(img_paths, pdf_path):
    """Combine one or more image files into a single PDF, one image per page.

    Args:
        img_paths: A single path or an iterable of paths to image files.
        pdf_path: Destination PDF path; missing parent folders are created.

    Returns:
        ``pdf_path`` as a ``Path``.

    Raises:
        ValueError: if ``img_paths`` is empty.
        FileNotFoundError: if one of the image paths is not an existing file.
    """
    if isinstance(img_paths, (str, Path)):
        img_paths = [img_paths]
    if not img_paths:
        raise ValueError("img_paths is empty")

    opened = []  # every image object created here, so all can be closed
    try:
        images = []
        for p in img_paths:
            p = Path(p)
            if not p.is_file():
                raise FileNotFoundError(p)
            img = Image.open(p)
            opened.append(img)
            # Palette and alpha images are flattened to RGB before saving.
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
                opened.append(img)
            images.append(img)

        pdf_path = Path(pdf_path)
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        images[0].save(
            pdf_path,
            save_all=True,
            append_images=images[1:],
            resolution=300.0,
        )
    finally:
        # Release file handles and pixel buffers whether or not saving worked.
        for img in opened:
            img.close()
    return pdf_path
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def build_message(image_path, prompt):
    """Build the chat messages for one image plus a text instruction.

    Args:
        image_path: Path of the image to embed as a base64 data URL.
        prompt: Instruction text sent together with the image.

    Returns:
        A list with a system message and a user message whose content holds
        the image part followed by the text part.
    """
    import mimetypes

    # Label the payload with the real image type (PDF pages are rendered to
    # PNG); fall back to the former hard-coded JPEG label when unknown.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    content = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image(image_path)}"
            },
        },
        {"type": "text", 'text': prompt},
    ]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': content},
    ]
    return messages
def download_markdown_file(md_text):
    """Write *md_text* to a new file under ``downloads/`` and return its path.

    The file name is ``markdown_<8 hex chars>.md``; the hex part comes from
    a random UUID.
    """
    target_dir = Path("downloads")
    short_id = uuid.uuid4().hex[:8]
    target = target_dir / f"markdown_{short_id}.md"
    target_dir.mkdir(exist_ok=True)
    with target.open("w", encoding="utf-8") as out:
        out.write(md_text)
    return str(target)
async def doc_parser(doc_path, prompt):
    """Parse a PDF or image into Markdown, streaming partial results.

    A PDF is rendered page by page to PNG and each page is sent to the model
    separately; any other file is sent as a single image.

    Args:
        doc_path: Path of the uploaded PDF or image.
        prompt: Instruction text sent with every page.

    Yields:
        ``(markdown, raw_text)`` tuples. While a page is streaming, both
        values contain the already finished pages followed by the partial
        current page. The last tuple holds the complete document, with pages
        joined by ``"\\n---\\n"`` (markdown) and ``"\\n\\n"`` (raw).

    Raises:
        gr.Error: if no file was provided.
        FileNotFoundError: if ``doc_path`` is not an existing file.
    """
    if doc_path is None:
        # Shown to the user as a readable message instead of a TypeError.
        raise gr.Error("Please upload a PDF or image before parsing.")
    doc_path = Path(doc_path)
    if not doc_path.is_file():
        raise FileNotFoundError(doc_path)

    queries = []
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        if doc_path.suffix.lower() == ".pdf":
            page_images = convert_from_path(doc_path, dpi=300)
            for idx, page_image in enumerate(page_images, start=1):
                img_path = tmpdir / f"page_{idx}.png"
                page_image.save(img_path, "PNG")
                queries.append(build_message(img_path, prompt))
        else:
            queries.append(build_message(doc_path, prompt))
    # build_message embeds the image bytes in the message, so the temporary
    # page files are no longer needed once the queries are built.

    all_pages = []
    all_pages_raw = []
    for query in queries:
        # Finished pages are joined once per page, not once per chunk.
        md_prefix = "\n---\n".join(all_pages) + "\n---\n" if all_pages else ""
        raw_prefix = "\n\n".join(all_pages_raw) + "\n\n" if all_pages_raw else ""
        current = ""
        async for chunk in request(query):
            current += chunk
            # Keep earlier pages visible while the current one streams.
            yield md_prefix + extract_makrdown(current), raw_prefix + current
        all_pages.append(extract_makrdown(current))
        all_pages_raw.append(current)
    print(all_pages)
    yield "\n---\n".join(all_pages), "\n\n".join(all_pages_raw)
def compress_directory_to_zip(directory_path, output_zip_path):
    """Zip every file under *directory_path* into *output_zip_path*.

    Entries are stored with paths relative to ``directory_path``. If the
    archive is created inside that directory, the archive itself is skipped.

    Returns:
        ``0`` on success, ``-1`` if any exception occurred (it is logged).
    """
    try:
        archive_abspath = os.path.abspath(output_zip_path)
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(directory_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Never add the archive to itself while it is being written.
                    if os.path.abspath(file_path) == archive_abspath:
                        continue
                    arcname = os.path.relpath(file_path, directory_path)
                    zipf.write(file_path, arcname)
        return 0
    except Exception as e:
        logger.exception(e)
        return -1
# Math delimiters handed to the Markdown component: (left, right, display).
latex_delimiters = [
    {'left': opening, 'right': closing, 'display': is_display}
    for opening, closing, is_display in (
        ('$$', '$$', True),
        ('$', '$', False),
        ('\\(', '\\)', False),
        ('\\[', '\\]', True),
    )
]
def check_prompt(prompt):
    """Return *prompt* unchanged, or raise ``gr.Error`` if it is empty or blank."""
    has_text = bool(prompt) and prompt.strip() != ""
    if has_text:
        return prompt
    raise gr.Error("Please select or enter a prompt before parsing.")
def to_file(image_path):
    """Map an example thumbnail path to the file that should be loaded.

    Paths ending in ``Academic_Papers.png`` have every occurrence of that
    name replaced by ``Academic_Papers.pdf``; other paths pass through.
    """
    thumbnail_name = "Academic_Papers.png"
    if not image_path.endswith(thumbnail_name):
        return image_path
    return image_path.replace(thumbnail_name, "Academic_Papers.pdf")
| # async def process_file(file_path): | |
| # if not file_path.endswith(".pdf"): | |
| # tmp_path = Path(file_path).with_suffix(".pdf") | |
| # images_to_pdf(file_path, tmp_path) | |
| # else: | |
| # tmp_path = Path(file_path) | |
| # async with httpx.AsyncClient() as client: | |
| # await send_pdf_to_parse_async(client, str(tmp_path), IP, PORT) | |
| # return str(tmp_path) | |
# Strong references to in-flight background uploads. The event loop keeps
# only weak references to tasks, so an unreferenced task can be
# garbage-collected before it finishes.
_background_uploads = set()


async def process_file(file_path):
    """Prepare an uploaded file for preview and start its background upload.

    Images are first converted to a PDF stored next to the original file.
    The PDF upload runs as a background task and is not awaited.

    Args:
        file_path: Path of the uploaded file, or ``None`` when it was cleared.

    Returns:
        The PDF path as a string, or ``None`` if ``file_path`` is ``None``.
    """
    if file_path is None:
        return None

    # Compare case-insensitively so "scan.PDF" is not treated as an image.
    if str(file_path).lower().endswith(".pdf"):
        tmp_file_path = file_path
    else:
        tmp_file_path = Path(file_path).with_suffix(".pdf")
        images_to_pdf(file_path, tmp_file_path)

    upload_task = asyncio.create_task(
        send_pdf_async_aiohttp(
            tmp_file_path,
            server_ip=openai_api_base,
            Authorization=Authorization,
        )
    )
    _background_uploads.add(upload_task)
    upload_task.add_done_callback(_background_uploads.discard)
    return str(tmp_file_path)
if __name__ == '__main__':
    # NOTE(review): the scraped source lost its indentation, so the nesting of
    # the layout containers below is reconstructed from syntax; confirm it
    # against the deployed UI.
    with gr.Blocks() as demo:
        with gr.Row():
            # Left panel: upload, prompt selection, actions and PDF preview.
            with gr.Column(variant='panel', scale=5):
                file = gr.File(
                    label='Please upload a PDF or image',
                    file_types=['.pdf', '.png', '.jpeg', '.jpg'],
                    type="filepath",
                )
                prompts = gr.Dropdown(
                    choices=preset_prompts,
                    label="Prompt",
                    info="Enter or select prompts...",
                    value=preset_prompts[0],
                    multiselect=False,
                    interactive=True,
                    allow_custom_value=True,
                )
                with gr.Row():
                    change_bu = gr.Button('Parse')
                    clear_bu = gr.ClearButton(value='Clear')
                pdf_show = PDF(label='Preview', interactive=False, visible=True, height=800)

            # Right panel: example shortcuts, download link and parse output.
            with gr.Column(variant='panel', scale=5):
                with gr.Accordion("Examples", open=True):
                    # The unused listing of the examples directory was
                    # removed; only these four fixed thumbnails are shown.
                    example_root = "examples"
                    file_path = [
                        os.path.join(example_root, f)
                        for f in ["Financial_Reports.png", "Books.png", "Magazines.png", "Academic_Papers.png"]
                    ]
                    with gr.Row():
                        for i, label in enumerate(["Financial Reports(IMG)", "Books(IMG)", "Magazines(IMG)", "Academic Papers(PDF)"]):
                            with gr.Column(scale=1, min_width=120):
                                gr.Image(
                                    value=file_path[i],
                                    width=120,
                                    height=90,
                                    show_label=False,
                                    show_download_button=False
                                )
                                # to_file swaps the Academic Papers thumbnail
                                # for its PDF before loading it as the upload.
                                gr.Button(label).click(fn=to_file, inputs=gr.State(file_path[i]), outputs=file)
                download_btn = gr.Button("⬇️ Generate download link", size="sm")
                output_file = gr.File(label='Parse result', interactive=False, elem_id="down-file-box", visible=False)
                gr.HTML("""
                <style>
                #down-file-box {
                max-height: 300px;
                }
                </style>
                """)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
                                         latex_delimiters=latex_delimiters,
                                         line_breaks=True)
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)

        # A new upload refreshes the preview (images are converted to PDF).
        file.change(fn=process_file, inputs=file, outputs=pdf_show)

        # Parse: validate the prompt, hide any stale download, stream the result.
        change_bu.click(
            fn=check_prompt,
            inputs=prompts,
            outputs=prompts
        ).then(
            lambda f: gr.update(visible=False),
            inputs=output_file,
            outputs=output_file
        ).then(
            fn=doc_parser,
            inputs=[file, prompts],
            outputs=[md, md_text]
        )

        clear_bu.add([file, md, pdf_show, md_text])

        # Download: write the Markdown text to a file, then reveal the link.
        download_btn.click(
            fn=download_markdown_file,
            inputs=md_text,
            outputs=output_file
        ).then(
            lambda f: gr.update(visible=True),
            inputs=output_file,
            outputs=output_file
        )

    demo.launch(server_name='0.0.0.0', share=True)