Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,9 @@ import gradio as gr
|
|
| 6 |
print(f"Gradio version: {gr.__version__}")
|
| 7 |
|
| 8 |
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
|
| 9 |
import logging
|
| 10 |
import webbrowser
|
| 11 |
from huggingface_hub import InferenceClient
|
|
@@ -179,26 +182,114 @@ class ModelRegistry:
|
|
| 179 |
self.groq_models = self._fetch_groq_models()
|
| 180 |
return self.groq_models
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
# Initialize model registry
|
| 183 |
model_registry = ModelRegistry()
|
| 184 |
|
| 185 |
-
def extract_text_from_pdf(pdf_path: str) -> str:
|
| 186 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
| 194 |
else:
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
return "Error:
|
| 198 |
-
return text
|
| 199 |
except Exception as e:
|
| 200 |
-
logging.error(f"Error
|
| 201 |
-
return f"Error
|
| 202 |
|
| 203 |
def format_content(text: str, format_type: str) -> str:
|
| 204 |
"""Format extracted text according to specified format."""
|
|
@@ -538,7 +629,7 @@ with gr.Blocks(css="""
|
|
| 538 |
)
|
| 539 |
|
| 540 |
format_type = gr.Radio(
|
| 541 |
-
choices=["txt", "md", "
|
| 542 |
value="txt",
|
| 543 |
label="📝 Output Format"
|
| 544 |
)
|
|
@@ -780,46 +871,34 @@ with gr.Blocks(css="""
|
|
| 780 |
]
|
| 781 |
|
| 782 |
# PDF Processing Handlers
|
| 783 |
-
def handle_pdf_process(pdf, fmt, ctx_size):
|
| 784 |
-
"""Process PDF, format text, and return formatted text and snippets."""
|
| 785 |
if not pdf:
|
| 786 |
return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
|
| 787 |
|
| 788 |
try:
|
| 789 |
-
text = extract_text_from_pdf(pdf.name)
|
| 790 |
if text.startswith("Error"):
|
| 791 |
return text, "", "", [], gr.update(choices=[], value=None), None
|
| 792 |
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
snippets_list = split_into_snippets(formatted_text, ctx_size)
|
| 796 |
|
| 797 |
-
with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=f'.{fmt}') as f:
|
| 798 |
-
f.write(
|
| 799 |
download_file = f.name
|
| 800 |
|
| 801 |
-
snippet_choices = update_snippet_choices(snippets_list) # Pre-calculate choices
|
| 802 |
-
|
| 803 |
return (
|
| 804 |
f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
|
| 805 |
-
|
| 806 |
-
|
| 807 |
snippets_list,
|
| 808 |
gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
|
| 809 |
download_file
|
| 810 |
)
|
| 811 |
-
|
| 812 |
except Exception as e:
|
| 813 |
error_msg = f"Error processing PDF: {str(e)}"
|
| 814 |
logging.error(error_msg)
|
| 815 |
-
return (
|
| 816 |
-
error_msg,
|
| 817 |
-
"",
|
| 818 |
-
"",
|
| 819 |
-
[],
|
| 820 |
-
gr.update(choices=[], value=None),
|
| 821 |
-
None
|
| 822 |
-
)
|
| 823 |
|
| 824 |
def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
|
| 825 |
"""Handle snippet selection, update prompt, and provide snippet download."""
|
|
|
|
| 6 |
print(f"Gradio version: {gr.__version__}")
|
| 7 |
|
| 8 |
from PyPDF2 import PdfReader
|
| 9 |
+
import fitz # pymupdf
|
| 10 |
+
from pdf2md.converter import PDF2Markdown
|
| 11 |
+
|
| 12 |
import logging
|
| 13 |
import webbrowser
|
| 14 |
from huggingface_hub import InferenceClient
|
|
|
|
| 182 |
self.groq_models = self._fetch_groq_models()
|
| 183 |
return self.groq_models
|
| 184 |
|
| 185 |
+
class PDFProcessor:
    """Handles PDF conversion to text and markdown using different methods.

    Every converter is a static method that takes a filesystem path and
    returns the converted text. Failures are never raised to the caller:
    they are logged and reported in-band as a string starting with
    ``"Error: "``.
    """

    # Bit masks applied to the ``flags`` value of a pymupdf text span.
    _FLAG_ITALIC = 2**1
    _FLAG_BOLD = 2**4

    @staticmethod
    def txt_convert(pdf_path: str) -> str:
        """Basic text extraction using PyPDF2.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The text of every page that yielded text, each followed by a
            newline, or an ``"Error: ..."`` string on failure.
        """
        try:
            reader = PdfReader(pdf_path)
            pages = []
            for page_num, page in enumerate(reader.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text + "\n")
                else:
                    logging.warning(f"No text found on page {page_num}.")
            # Join once instead of growing a string page by page.
            return "".join(pages)
        except Exception as e:
            logging.error(f"Error in txt conversion: {e}")
            return f"Error: {str(e)}"

    @staticmethod
    def md_convert_with_pdf2md(pdf_path: str) -> str:
        """Convert PDF to Markdown using pdf2md.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Whatever ``PDF2Markdown().convert`` returns for the file, or an
            ``"Error: ..."`` string on failure.
        """
        try:
            converter = PDF2Markdown()
            return converter.convert(pdf_path)
        except Exception as e:
            logging.error(f"Error in pdf2md conversion: {e}")
            return f"Error: {str(e)}"

    @staticmethod
    def _span_to_markdown(content: str, font_size: float, font_flags: int) -> str:
        """Render one text span as a markdown fragment.

        Font size picks a heading level (>20, >16, >14); smaller text is
        wrapped in bold and/or italic markers according to ``font_flags``.
        """
        if not content.strip():
            # Wrapping blank text in "#" or "**" only produces stray markup.
            return content
        if font_size > 20:
            return f"# {content}\n"
        if font_size > 16:
            return f"## {content}\n"
        if font_size > 14:
            return f"### {content}\n"
        if font_flags & PDFProcessor._FLAG_BOLD:
            content = f"**{content}**"
        if font_flags & PDFProcessor._FLAG_ITALIC:
            content = f"*{content}*"
        return content

    @staticmethod
    def md_convert_with_pymupdf(pdf_path: str) -> str:
        """Convert PDF to Markdown using pymupdf.

        Walks pages -> blocks -> lines -> spans of the document's "dict"
        text layout and renders each span with a size/flag heuristic.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The assembled markdown, or an ``"Error: ..."`` string on failure.
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                markdown_text = []
                for page in doc:
                    for block in page.get_text("dict")["blocks"]:
                        if "lines" not in block:
                            # Blocks without a "lines" entry carry no text.
                            continue
                        for line in block["lines"]:
                            for span in line["spans"]:
                                markdown_text.append(
                                    PDFProcessor._span_to_markdown(
                                        span["text"], span["size"], span["flags"]
                                    )
                                )
                                markdown_text.append(" ")  # Space between spans
                            markdown_text.append("\n")  # Newline between lines
                        # Extra newline between blocks for paragraphs
                        markdown_text.append("\n")
            finally:
                # Release the document even when extraction fails part-way.
                doc.close()
            return "".join(markdown_text)
        except Exception as e:
            logging.error(f"Error in pymupdf conversion: {e}")
            return f"Error: {str(e)}"
|
| 260 |
+
|
| 261 |
# Initialize model registry
|
| 262 |
model_registry = ModelRegistry()
|
| 263 |
|
| 264 |
+
def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: str = "pdf2md") -> str:
    """
    Extract and format text from PDF using different processors based on format.

    Args:
        pdf_path: Path to PDF file
        format_type: Either 'txt' or 'md'. A combined label of the form
            'md (<engine>)', as offered by the output-format radio button,
            is also accepted; the engine named in the label then takes
            precedence over ``md_engine``.
        md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'

    Returns:
        Formatted text content, or a string starting with 'Error: ' when the
        format/engine is unsupported or the conversion fails.
    """
    fmt, engine = format_type, md_engine

    # Split a combined "md (<engine>)" label into its two parts.
    if isinstance(format_type, str):
        head, paren, tail = format_type.partition("(")
        tail = tail.rstrip()
        if paren and head.strip() == "md" and tail.endswith(")"):
            fmt = "md"
            engine = tail[:-1].strip()

    # Validate up front so bad arguments never reach a converter.
    if fmt not in ("txt", "md"):
        return f"Error: Unsupported format type: {format_type}"
    if fmt == "md" and engine not in ("pdf2md", "pymupdf"):
        return f"Error: Unsupported markdown engine: {engine}"

    try:
        # The converters are static methods, so no instance is needed.
        if fmt == "txt":
            return PDFProcessor.txt_convert(pdf_path)
        if engine == "pdf2md":
            return PDFProcessor.md_convert_with_pdf2md(pdf_path)
        return PDFProcessor.md_convert_with_pymupdf(pdf_path)
    except Exception as e:
        logging.error(f"Error in PDF conversion: {e}")
        return f"Error: {str(e)}"
|
| 293 |
|
| 294 |
def format_content(text: str, format_type: str) -> str:
|
| 295 |
"""Format extracted text according to specified format."""
|
|
|
|
| 629 |
)
|
| 630 |
|
| 631 |
format_type = gr.Radio(
|
| 632 |
+
choices=["txt", "md (pdf2md)", "md (pymupdf)"],
|
| 633 |
value="txt",
|
| 634 |
label="📝 Output Format"
|
| 635 |
)
|
|
|
|
| 871 |
]
|
| 872 |
|
| 873 |
# PDF Processing Handlers
|
| 874 |
+
def handle_pdf_process(pdf, fmt, md_eng, ctx_size):
    """Process an uploaded PDF and return the values for the six UI outputs.

    Args:
        pdf: Uploaded file object; only its ``.name`` attribute is read
            (presumably the path of the uploaded file - confirm against
            the upload component).
        fmt: Output format, 'txt' or 'md'. A combined label of the form
            'md (<engine>)' is split into the format 'md' and the engine.
        md_eng: Markdown engine, used unless ``fmt`` itself names one.
        ctx_size: Snippet size passed through to ``split_into_snippets``.

    Returns:
        A 6-tuple: status message, extracted text (twice), the list of
        snippets, an update for the snippet selector, and the path of a
        temporary download file (``None`` on failure).
    """
    def _failure(message):
        # Same shape as the success tuple, with every data output cleared.
        return message, "", "", [], gr.update(choices=[], value=None), None

    if not pdf:
        return _failure("Please upload a PDF file.")

    try:
        # The format selector can hand over labels such as "md (pdf2md)";
        # reduce them to a plain format so the file suffix stays clean.
        if isinstance(fmt, str):
            head, paren, tail = fmt.partition("(")
            if paren and head.strip() == "md":
                fmt = "md"
                label_engine = tail.strip().rstrip(")").strip()
                if label_engine:
                    md_eng = label_engine

        text = extract_text_from_pdf(pdf.name, format_type=fmt, md_engine=md_eng)
        # NOTE(review): errors are signalled in-band, so a document whose
        # text genuinely begins with "Error" is reported as a failure.
        if text.startswith("Error"):
            return _failure(text)

        snippets_list = split_into_snippets(text, ctx_size)
        snippet_choices = update_snippet_choices(snippets_list)

        # Explicit UTF-8: extracted text may contain characters that the
        # platform default encoding cannot represent.
        with tempfile.NamedTemporaryFile(
            delete=False, mode='w', suffix=f'.{fmt}', encoding='utf-8'
        ) as f:
            f.write(text)
            download_file = f.name

        return (
            f"PDF processed successfully! Generated {len(snippets_list)} snippets.",
            text,
            text,
            snippets_list,
            gr.update(choices=snippet_choices, value=snippet_choices[0] if snippet_choices else None),
            download_file
        )
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        logging.error(error_msg)
        return _failure(error_msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 902 |
|
| 903 |
def handle_snippet_selection(choice, snippets_list): # Add download_snippet output
|
| 904 |
"""Handle snippet selection, update prompt, and provide snippet download."""
|