Spaces:

huggingchat
/

document-parser

Running on Zero

App Files Files Community

Liam Dyer committed on May 23, 2024

Commit

6c400a9

unverified ·

1 Parent(s): 8815210

feat: pdf and plain text support

Browse files

Files changed (3) hide show

app.py +64 -4
packages.txt +2 -0
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import gradio as gr
-import os
 import spaces
 import string
 import random
 def random_word(length):
@@ -10,11 +13,54 @@ def random_word(length):
     return "".join(random.choice(letters) for _ in range(length))
-@spaces.GPU
-def convert(input_file):
     # Convert the file to markdown with pandoc
     output_file = f"{random_word(16)}.md"
-    os.system(f"pandoc {input_file} -t markdown -o {output_file}")
     # Read the file and delete
     with open(output_file, "r") as f:
@@ -24,6 +70,20 @@ def convert(input_file):
     return markdown
 gr.Interface(
     convert,
     inputs=gr.File(label="Upload File", type="filepath"),

 import gradio as gr
 import spaces
+import subprocess
+import os
 import string
 import random
+from pypdf import PdfReader
+import ocrmypdf
 def random_word(length):
     # Build a string of `length` characters, each picked at random from `letters`.
     # NOTE(review): `letters` is assigned on a line this diff view does not show.
     return "".join(random.choice(letters) for _ in range(length))
def convert_pdf(input_file):
    """Extract text and metadata from a PDF, running OCR on likely scans.

    The text layer is read first. If it is shorter than 1000 characters and
    at least one page carries an image, the document is treated as a scan:
    it is OCR'd into a temporary sibling file and the text is re-extracted
    from that OCR output.

    Args:
        input_file: Path to the PDF file.

    Returns:
        A ``(text, metadata)`` tuple: the text produced by
        ``extract_text_from_pdf`` and the dict produced by
        ``extract_metadata_from_pdf``.
    """
    reader = PdfReader(input_file)
    metadata = extract_metadata_from_pdf(reader)
    text = extract_text_from_pdf(reader)

    # Only look for images when the text layer is sparse; the image scan is
    # irrelevant otherwise and `any` stops at the first page with an image.
    needs_ocr = len(text) < 1000 and any(
        len(page.images) > 0 for page in reader.pages
    )

    if needs_ocr:
        # splitext only touches the final extension, so a ".pdf" elsewhere in
        # the path (e.g. a directory name) is left alone and the output path
        # can never collapse onto the input path.
        out_pdf_file = os.path.splitext(input_file)[0] + "_ocr.pdf"
        try:
            ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)
            # Re-extract from the OCR output; the input file still has no
            # usable text layer.
            text = extract_text_from_pdf(PdfReader(out_pdf_file))
        finally:
            # Remove the temporary OCR file even if OCR or extraction failed.
            if os.path.exists(out_pdf_file):
                os.remove(out_pdf_file)

    return text, metadata
def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF.

    Args:
        reader: Object exposing ``pages``, an iterable of pages that each
            provide ``extract_text()`` (e.g. a ``pypdf.PdfReader``).

    Returns:
        The page texts joined together, each preceded by a
        ``---- Page N ----`` header (N is the zero-based page index) and
        separated by a blank line. Pages with no text are skipped. Leading
        and trailing whitespace of the whole result is stripped.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page and reuse the result.
        text = page.extract_text()
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")
    return "".join(sections).strip()
def extract_metadata_from_pdf(reader):
    """Collect the standard document-information fields of a PDF.

    Args:
        reader: Object exposing ``metadata`` (e.g. a ``pypdf.PdfReader``).
            ``metadata`` may be ``None`` when the PDF has no
            document-information dictionary.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject`` and ``title``. Every value is ``None`` when the reader
        has no metadata.
    """
    fields = ("author", "creator", "producer", "subject", "title")
    info = reader.metadata
    if info is None:
        # No info dictionary: keep the same shape so callers see stable keys.
        return dict.fromkeys(fields)
    return {name: getattr(info, name) for name in fields}
+def convert_pandoc(input_file):
     # Convert the file to markdown with pandoc
     output_file = f"{random_word(16)}.md"
     # NOTE(review): subprocess.call is given one command string without
     # shell=True; on POSIX the whole string is then looked up as the program
     # name. An argument list such as
     # ["pandoc", input_file, "-t", "markdown", "-o", output_file]
     # avoids that and needs no shell quoting of the uploaded path.
     # `result` is not checked in the lines shown here.
+    result = subprocess.call(f"pandoc {input_file} -t markdown -o {output_file}")
     # Read the file and delete
     # NOTE(review): the diff view elides the body of this `with` block (where
     # `markdown` is presumably assigned); confirm against the full file.
     with open(output_file, "r") as f:
     return markdown
@spaces.GPU
def convert(input_file):
    """Convert an uploaded file to text, dispatching on its extension.

    Args:
        input_file: Path to the uploaded file.

    Returns:
        The file content for plain-text types (``.txt``, ``.csv``, ``.tsv``,
        ``.md``), the result of ``convert_pdf`` for ``.pdf`` files, and the
        result of ``convert_pandoc`` for everything else.
    """
    plain_text_filetypes = (".txt", ".csv", ".tsv", ".md")
    # Compare extensions case-insensitively so "REPORT.PDF" or "notes.TXT"
    # take the same route as their lowercase spellings.
    lowered = input_file.lower()

    # Already a plain text file that wouldn't benefit from pandoc so return the content
    if lowered.endswith(plain_text_filetypes):
        # Explicit encoding so the read does not depend on the host locale.
        with open(input_file, "r", encoding="utf-8") as f:
            return f.read()

    if lowered.endswith(".pdf"):
        return convert_pdf(input_file)

    return convert_pandoc(input_file)
 gr.Interface(
     convert,
     inputs=gr.File(label="Upload File", type="filepath"),

packages.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	pandoc

 pandoc
+ocrmypdf
+tesseract-ocr-eng

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ocrmypdf==16.3.1
2	+ pypdf==4.2.0