Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,7 +11,7 @@ import subprocess
|
|
| 11 |
from pathlib import Path
|
| 12 |
from datetime import datetime
|
| 13 |
import zipfile
|
| 14 |
-
|
| 15 |
import numpy as np
|
| 16 |
import gradio as gr
|
| 17 |
from PIL import Image
|
|
@@ -20,6 +20,7 @@ from loguru import logger
|
|
| 20 |
from openai import OpenAI, AsyncOpenAI
|
| 21 |
from gradio_pdf import PDF
|
| 22 |
|
|
|
|
| 23 |
import uuid
|
| 24 |
import tqdm
|
| 25 |
|
|
@@ -40,6 +41,7 @@ def setup_poppler_linux():
|
|
| 40 |
setup_poppler_linux()
|
| 41 |
|
| 42 |
|
|
|
|
| 43 |
preset_prompts = [
|
| 44 |
"Please convert the document into Markdown format.",
|
| 45 |
"Generate a clean and structured Markdown version of the document.",
|
|
@@ -61,6 +63,28 @@ def send_pdf_to_parse(file_path, server_ip, port, route="/upload", api_key=None)
|
|
| 61 |
return response
|
| 62 |
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
def extract_makrdown(text):
|
| 65 |
m = re.search(r'```markdown\s*([\s\S]*?)```', text, re.MULTILINE)
|
| 66 |
if m:
|
|
@@ -245,17 +269,31 @@ def to_file(image_path):
|
|
| 245 |
|
| 246 |
return image_path
|
| 247 |
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
if file_path is None:
|
| 250 |
return None
|
|
|
|
| 251 |
if not file_path.endswith(".pdf"):
|
| 252 |
-
|
| 253 |
-
tmp_file_path = Path(file_path)
|
| 254 |
tmp_file_path = tmp_file_path.with_suffix(".pdf")
|
| 255 |
images_to_pdf(file_path, tmp_file_path)
|
| 256 |
else:
|
| 257 |
-
send_pdf_to_parse(file_path, IP, PORT)
|
| 258 |
tmp_file_path = file_path
|
|
|
|
| 259 |
|
| 260 |
return str(tmp_file_path)
|
| 261 |
|
|
@@ -362,4 +400,4 @@ if __name__ == '__main__':
|
|
| 362 |
)
|
| 363 |
|
| 364 |
|
| 365 |
-
demo.launch(server_name='0.0.0.0',share=True)
|
|
|
|
| 11 |
from pathlib import Path
|
| 12 |
from datetime import datetime
|
| 13 |
import zipfile
|
| 14 |
+
import httpx, aiofiles, os, asyncio
|
| 15 |
import numpy as np
|
| 16 |
import gradio as gr
|
| 17 |
from PIL import Image
|
|
|
|
| 20 |
from openai import OpenAI, AsyncOpenAI
|
| 21 |
from gradio_pdf import PDF
|
| 22 |
|
| 23 |
+
import aiohttp
|
| 24 |
import uuid
|
| 25 |
import tqdm
|
| 26 |
|
|
|
|
| 41 |
setup_poppler_linux()
|
| 42 |
|
| 43 |
|
| 44 |
+
|
| 45 |
preset_prompts = [
|
| 46 |
"Please convert the document into Markdown format.",
|
| 47 |
"Generate a clean and structured Markdown version of the document.",
|
|
|
|
| 63 |
return response
|
| 64 |
|
| 65 |
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
async def send_pdf_async_aiohttp(file_path, server_ip, port, route="/upload", api_key=None):
    """Upload a PDF file to the parsing server asynchronously with aiohttp.

    Args:
        file_path: Path of the PDF file to upload.
        server_ip: Host name or IP address of the target server.
        port: Port the target server listens on.
        route: URL path of the upload endpoint.
        api_key: Optional token; when given it is sent as a Bearer
            ``Authorization`` header.

    Returns:
        The ``aiohttp.ClientResponse`` of the POST request, with its body
        already read, or ``None`` if any exception occurred while opening
        the file or talking to the server.
    """
    url = f"http://{server_ip}:{port}{route}"
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    try:
        async with aiohttp.ClientSession() as session:
            # The file handle must stay open until the request has been sent,
            # because aiohttp streams the multipart field from it.
            with open(file_path, "rb") as f:
                data = aiohttp.FormData()
                data.add_field('file', f, filename=os.path.basename(file_path), content_type='application/pdf')
                async with session.post(url, data=data, headers=headers) as response:
                    # Consume the body while the connection is still open.
                    # The response is returned after both context managers
                    # exit, at which point unread payload can no longer be
                    # fetched from the released connection.
                    await response.read()
                    print(f"PDF发送成功: {file_path}, 状态码: {response.status}")
                    return response
    except Exception as e:
        # Best-effort upload: report the failure and signal it with None
        # instead of propagating, since callers run this as a background task.
        print(f"PDF发送失败: {file_path}, 错误: {e}")
        return None
|
| 86 |
+
|
| 87 |
+
|
| 88 |
def extract_makrdown(text):
|
| 89 |
m = re.search(r'```markdown\s*([\s\S]*?)```', text, re.MULTILINE)
|
| 90 |
if m:
|
|
|
|
| 269 |
|
| 270 |
return image_path
|
| 271 |
|
| 272 |
+
|
| 273 |
+
# async def process_file(file_path):
|
| 274 |
+
# if not file_path.endswith(".pdf"):
|
| 275 |
+
# tmp_path = Path(file_path).with_suffix(".pdf")
|
| 276 |
+
# images_to_pdf(file_path, tmp_path)
|
| 277 |
+
# else:
|
| 278 |
+
# tmp_path = Path(file_path)
|
| 279 |
+
|
| 280 |
+
# async with httpx.AsyncClient() as client:
|
| 281 |
+
# await send_pdf_to_parse_async(client, str(tmp_path), IP, PORT)
|
| 282 |
+
# return str(tmp_path)
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
# Strong references to in-flight upload tasks. The event loop only keeps weak
# references to tasks, so a task whose result is discarded can be
# garbage-collected before it finishes.
_background_upload_tasks = set()


async def process_file(file_path):
    """Normalize an uploaded file to PDF and start a background upload.

    Non-PDF inputs are converted with ``images_to_pdf`` into a sibling file
    that has the same name and a ``.pdf`` suffix; PDF inputs are used as is.
    The resulting PDF is then sent to the server at ``IP``:``PORT`` by a
    background task that is not awaited here.

    Args:
        file_path: Path of the uploaded file, or ``None`` when nothing was
            uploaded.

    Returns:
        The path of the PDF file as a string, or ``None`` if ``file_path``
        is ``None``.
    """
    if file_path is None:
        return None

    # Compare case-insensitively so that e.g. "scan.PDF" is not mistaken for
    # an image and pushed through the image-to-PDF conversion.
    if not str(file_path).lower().endswith(".pdf"):
        tmp_file_path = Path(file_path)
        tmp_file_path = tmp_file_path.with_suffix(".pdf")
        images_to_pdf(file_path, tmp_file_path)
    else:
        tmp_file_path = file_path

    # Fire-and-forget upload: keep a reference until the task completes, then
    # drop it so the set does not grow without bound.
    upload_task = asyncio.create_task(send_pdf_async_aiohttp(str(tmp_file_path), IP, PORT))
    _background_upload_tasks.add(upload_task)
    upload_task.add_done_callback(_background_upload_tasks.discard)

    return str(tmp_file_path)
|
| 299 |
|
|
|
|
| 400 |
)
|
| 401 |
|
| 402 |
|
| 403 |
+
demo.launch(server_name='0.0.0.0',share=True)
|