Spaces:

cstr
/

PDF-Summarizer

Running

App Files Community

cstr committed on Dec 7, 2024

Commit

39a451a

verified ·

1 Parent(s): b4d1472

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -23

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ print(f"Gradio version: {gr.__version__}")
 from PyPDF2 import PdfReader
 import fitz  # pymupdf
-from pdf2md.converter import PDF2Markdown
 import logging
 import webbrowser
@@ -202,17 +201,6 @@ class PDFProcessor:
             logging.error(f"Error in txt conversion: {e}")
             return f"Error: {str(e)}"
-    @staticmethod
-    def md_convert_with_pdf2md(pdf_path: str) -> str:
-        """Convert PDF to Markdown using pdf2md"""
-        try:
-            converter = PDF2Markdown()
-            markdown_text = converter.convert(pdf_path)
-            return markdown_text
-        except Exception as e:
-            logging.error(f"Error in pdf2md conversion: {e}")
-            return f"Error: {str(e)}"
     @staticmethod
     def md_convert_with_pymupdf(pdf_path: str) -> str:
         """Convert PDF to Markdown using pymupdf"""
@@ -261,14 +249,13 @@ class PDFProcessor:
 # Initialize model registry
 model_registry = ModelRegistry()
-def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: str = "pdf2md") -> str:
     """
     Extract and format text from PDF using different processors based on format.
     Args:
         pdf_path: Path to PDF file
         format_type: Either 'txt' or 'md'
-        md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'
     Returns:
         Formatted text content
@@ -279,12 +266,7 @@ def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: st
         if format_type == "txt":
             return processor.txt_convert(pdf_path)
         elif format_type == "md":
-            if md_engine == "pdf2md":
-                return processor.md_convert_with_pdf2md(pdf_path)
-            elif md_engine == "pymupdf":
-                return processor.md_convert_with_pymupdf(pdf_path)
-            else:
-                return f"Error: Unsupported markdown engine: {md_engine}"
         else:
             return f"Error: Unsupported format type: {format_type}"
     except Exception as e:
@@ -629,7 +611,7 @@ with gr.Blocks(css="""
                     )
                     format_type = gr.Radio(
-                        choices=["txt", "md (pdf2md)", "md (pymupdf)"],
                         value="txt",
                         label="📝 Output Format"
                     )
@@ -871,15 +853,16 @@ with gr.Blocks(css="""
         ]
     # PDF Processing Handlers
-    def handle_pdf_process(pdf, fmt, md_eng, ctx_size):
         if not pdf:
             return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
         try:
-            text = extract_text_from_pdf(pdf.name, format_type=fmt, md_engine=md_eng)
             if text.startswith("Error"):
                 return text, "", "", [], gr.update(choices=[], value=None), None
             snippets_list = split_into_snippets(text, ctx_size)
             snippet_choices = update_snippet_choices(snippets_list)

 from PyPDF2 import PdfReader
 import fitz  # pymupdf
 import logging
 import webbrowser
             logging.error(f"Error in txt conversion: {e}")
             return f"Error: {str(e)}"
     @staticmethod
     def md_convert_with_pymupdf(pdf_path: str) -> str:
         """Convert PDF to Markdown using pymupdf"""
 # Initialize model registry
 model_registry = ModelRegistry()
+def extract_text_from_pdf(pdf_path: str, format_type: str = "txt") -> str:
     """
     Extract and format text from PDF using different processors based on format.
     Args:
         pdf_path: Path to PDF file
         format_type: Either 'txt' or 'md'
     Returns:
         Formatted text content
         if format_type == "txt":
             return processor.txt_convert(pdf_path)
         elif format_type == "md":
+            return processor.md_convert_with_pymupdf(pdf_path)
         else:
             return f"Error: Unsupported format type: {format_type}"
     except Exception as e:
                     )
                     format_type = gr.Radio(
+                        choices=["txt", "md"],
                         value="txt",
                         label="📝 Output Format"
                     )
         ]
     # PDF Processing Handlers
+    def handle_pdf_process(pdf, fmt, ctx_size):  # Remove md_eng parameter
         if not pdf:
             return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
         try:
+            text = extract_text_from_pdf(pdf.name, format_type=fmt)  # Just use format_type
             if text.startswith("Error"):
                 return text, "", "", [], gr.update(choices=[], value=None), None
+            # The important part: still do snippets processing
             snippets_list = split_into_snippets(text, ctx_size)
             snippet_choices = update_snippet_choices(snippets_list)