Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,6 @@ print(f"Gradio version: {gr.__version__}")
|
|
| 7 |
|
| 8 |
from PyPDF2 import PdfReader
|
| 9 |
import fitz # pymupdf
|
| 10 |
-
from pdf2md.converter import PDF2Markdown
|
| 11 |
|
| 12 |
import logging
|
| 13 |
import webbrowser
|
|
@@ -202,17 +201,6 @@ class PDFProcessor:
|
|
| 202 |
logging.error(f"Error in txt conversion: {e}")
|
| 203 |
return f"Error: {str(e)}"
|
| 204 |
|
| 205 |
-
@staticmethod
|
| 206 |
-
def md_convert_with_pdf2md(pdf_path: str) -> str:
|
| 207 |
-
"""Convert PDF to Markdown using pdf2md"""
|
| 208 |
-
try:
|
| 209 |
-
converter = PDF2Markdown()
|
| 210 |
-
markdown_text = converter.convert(pdf_path)
|
| 211 |
-
return markdown_text
|
| 212 |
-
except Exception as e:
|
| 213 |
-
logging.error(f"Error in pdf2md conversion: {e}")
|
| 214 |
-
return f"Error: {str(e)}"
|
| 215 |
-
|
| 216 |
@staticmethod
|
| 217 |
def md_convert_with_pymupdf(pdf_path: str) -> str:
|
| 218 |
"""Convert PDF to Markdown using pymupdf"""
|
|
@@ -261,14 +249,13 @@ class PDFProcessor:
|
|
| 261 |
# Initialize model registry
|
| 262 |
model_registry = ModelRegistry()
|
| 263 |
|
| 264 |
-
def extract_text_from_pdf(pdf_path: str, format_type: str = "txt"
|
| 265 |
"""
|
| 266 |
Extract and format text from PDF using different processors based on format.
|
| 267 |
|
| 268 |
Args:
|
| 269 |
pdf_path: Path to PDF file
|
| 270 |
format_type: Either 'txt' or 'md'
|
| 271 |
-
md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'
|
| 272 |
|
| 273 |
Returns:
|
| 274 |
Formatted text content
|
|
@@ -279,12 +266,7 @@ def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: st
|
|
| 279 |
if format_type == "txt":
|
| 280 |
return processor.txt_convert(pdf_path)
|
| 281 |
elif format_type == "md":
|
| 282 |
-
if md_engine == "pdf2md":
|
| 283 |
-
return processor.md_convert_with_pdf2md(pdf_path)
|
| 284 |
-
elif md_engine == "pymupdf":
|
| 285 |
-
return processor.md_convert_with_pymupdf(pdf_path)
|
| 286 |
-
else:
|
| 287 |
-
return f"Error: Unsupported markdown engine: {md_engine}"
|
| 288 |
else:
|
| 289 |
return f"Error: Unsupported format type: {format_type}"
|
| 290 |
except Exception as e:
|
|
@@ -629,7 +611,7 @@ with gr.Blocks(css="""
|
|
| 629 |
)
|
| 630 |
|
| 631 |
format_type = gr.Radio(
|
| 632 |
-
choices=["txt", "md
|
| 633 |
value="txt",
|
| 634 |
label="π Output Format"
|
| 635 |
)
|
|
@@ -871,15 +853,16 @@ with gr.Blocks(css="""
|
|
| 871 |
]
|
| 872 |
|
| 873 |
# PDF Processing Handlers
|
| 874 |
-
def handle_pdf_process(pdf, fmt, md_eng, ctx_size):
|
| 875 |
if not pdf:
|
| 876 |
return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
|
| 877 |
|
| 878 |
try:
|
| 879 |
-
text = extract_text_from_pdf(pdf.name, format_type=fmt, md_engine=md_eng)
|
| 880 |
if text.startswith("Error"):
|
| 881 |
return text, "", "", [], gr.update(choices=[], value=None), None
|
| 882 |
|
|
|
|
| 883 |
snippets_list = split_into_snippets(text, ctx_size)
|
| 884 |
snippet_choices = update_snippet_choices(snippets_list)
|
| 885 |
|
|
|
|
| 7 |
|
| 8 |
from PyPDF2 import PdfReader
|
| 9 |
import fitz # pymupdf
|
|
|
|
| 10 |
|
| 11 |
import logging
|
| 12 |
import webbrowser
|
|
|
|
| 201 |
logging.error(f"Error in txt conversion: {e}")
|
| 202 |
return f"Error: {str(e)}"
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def md_convert_with_pymupdf(pdf_path: str) -> str:
|
| 206 |
"""Convert PDF to Markdown using pymupdf"""
|
|
|
|
| 249 |
# Initialize model registry
|
| 250 |
model_registry = ModelRegistry()
|
| 251 |
|
| 252 |
+
def extract_text_from_pdf(pdf_path: str, format_type: str = "txt") -> str:
|
| 253 |
"""
|
| 254 |
Extract and format text from PDF using different processors based on format.
|
| 255 |
|
| 256 |
Args:
|
| 257 |
pdf_path: Path to PDF file
|
| 258 |
format_type: Either 'txt' or 'md'
|
|
|
|
| 259 |
|
| 260 |
Returns:
|
| 261 |
Formatted text content
|
|
|
|
| 266 |
if format_type == "txt":
|
| 267 |
return processor.txt_convert(pdf_path)
|
| 268 |
elif format_type == "md":
|
| 269 |
+
return processor.md_convert_with_pymupdf(pdf_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
else:
|
| 271 |
return f"Error: Unsupported format type: {format_type}"
|
| 272 |
except Exception as e:
|
|
|
|
| 611 |
)
|
| 612 |
|
| 613 |
format_type = gr.Radio(
|
| 614 |
+
choices=["txt", "md"],
|
| 615 |
value="txt",
|
| 616 |
label="π Output Format"
|
| 617 |
)
|
|
|
|
| 853 |
]
|
| 854 |
|
| 855 |
# PDF Processing Handlers
|
| 856 |
+
def handle_pdf_process(pdf, fmt, ctx_size): # Remove md_eng parameter
|
| 857 |
if not pdf:
|
| 858 |
return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
|
| 859 |
|
| 860 |
try:
|
| 861 |
+
text = extract_text_from_pdf(pdf.name, format_type=fmt) # Just use format_type
|
| 862 |
if text.startswith("Error"):
|
| 863 |
return text, "", "", [], gr.update(choices=[], value=None), None
|
| 864 |
|
| 865 |
+
# The important part: still do snippets processing
|
| 866 |
snippets_list = split_into_snippets(text, ctx_size)
|
| 867 |
snippet_choices = update_snippet_choices(snippets_list)
|
| 868 |
|