Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import spaces | |
| import subprocess | |
| import os | |
| import shutil | |
| import string | |
| import random | |
| from pypdf import PdfReader | |
| import ocrmypdf | |
def random_word(length):
    """Return a random word made of ``length`` lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)
def convert_pdf(input_file):
    """Extract text and metadata from a PDF, running OCR when it looks scanned.

    Args:
        input_file: Path to the PDF file on disk.

    Returns:
        A ``(text, metadata)`` tuple: the text produced by
        ``extract_text_from_pdf`` and the dict produced by
        ``extract_metadata_from_pdf``.
    """
    reader = PdfReader(input_file)
    metadata = extract_metadata_from_pdf(reader)
    text = extract_text_from_pdf(reader)

    # Check if there are any images
    image_count = sum(len(page.images) for page in reader.pages)

    # If there are images and not much content, perform OCR on the document
    if image_count > 0 and len(text) < 1000:
        # Build the OCR output path from the extension only, so a ".pdf"
        # elsewhere in the path is not rewritten and the output can never be
        # the same path as the input.
        root, _ = os.path.splitext(input_file)
        out_pdf_file = f"{root}_ocr.pdf"
        try:
            ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)
            # Re-extract text from the OCR'd copy; the input file itself is
            # not modified by the OCR run.
            text = extract_text_from_pdf(PdfReader(out_pdf_file))
        finally:
            # Delete the OCR file even when OCR or extraction failed
            if os.path.exists(out_pdf_file):
                os.remove(out_pdf_file)

    return text, metadata
def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF.

    Args:
        reader: Object exposing ``pages``, an iterable of page objects that
            each provide ``extract_text()``.

    Returns:
        The page texts joined together, each preceded by a
        ``---- Page N ----`` header (``N`` is the zero-based page index),
        with surrounding whitespace stripped. Pages that yield no text are
        skipped; the result is ``""`` when no page has text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page: text extraction is the expensive step.
        text = page.extract_text()
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")
    return "".join(sections).strip()
def extract_metadata_from_pdf(reader):
    """Collect the standard document-information fields of a PDF.

    Args:
        reader: Object exposing a ``metadata`` attribute, which is either
            ``None`` or an object with ``author``, ``creator``, ``producer``,
            ``subject`` and ``title`` attributes.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject`` and ``title``. Every value is ``None`` when the document
        carries no metadata at all.
    """
    fields = ("author", "creator", "producer", "subject", "title")
    info = reader.metadata
    if info is None:
        # No document-information dictionary: report every field as missing
        # instead of failing on attribute access.
        return dict.fromkeys(fields)
    return {name: getattr(info, name) for name in fields}
def convert_pandoc(input_file, filename):
    """Convert a document to markdown by running the ``pandoc`` executable.

    Args:
        input_file: Path to the uploaded file on disk.
        filename: Original name of the upload. Only its extension is used,
            so that pandoc can infer the input format.

    Returns:
        The markdown text produced by pandoc.

    Raises:
        ValueError: If pandoc exits with a non-zero status.
    """
    # Local import so this function does not depend on the module's import block.
    import tempfile

    # ``filename`` is caller-supplied: never use it as a path. Keeping only
    # the extension prevents writing or deleting outside the scratch
    # directory and prevents a leading "-" being parsed as a pandoc option.
    extension = os.path.splitext(os.path.basename(filename))[1]

    # The scratch directory and everything in it is removed on exit,
    # including when the conversion fails.
    with tempfile.TemporaryDirectory() as workdir:
        source_path = os.path.join(workdir, f"input{extension}")
        output_path = os.path.join(workdir, "output.md")

        # Temporarily copy the file so it carries the right extension
        shutil.copyfile(input_file, source_path)

        # Convert the file to markdown with pandoc
        result = subprocess.call(
            ["pandoc", source_path, "-t", "markdown", "-o", output_path]
        )
        if result != 0:
            raise ValueError("Error converting file to markdown with pandoc")

        # pandoc writes its output as UTF-8
        with open(output_path, "r", encoding="utf-8") as f:
            return f.read()
def convert(input_file, filename):
    """Convert an uploaded file to text, choosing the pipeline by extension.

    Args:
        input_file: Path to the uploaded file on disk.
        filename: Original name of the upload; its extension (matched
            case-insensitively) selects the processing pipeline.

    Returns:
        A ``(text, metadata)`` tuple. ``metadata`` is an empty dict for
        everything except PDFs.
    """
    plain_text_filetypes = (
        ".txt",
        ".csv",
        ".tsv",
        ".md",
        ".yaml",
        ".toml",
        ".json",
        ".json5",
        ".jsonc",
    )
    # Compare in lowercase so "REPORT.PDF" or "notes.TXT" take the same path
    # as their lowercase spellings.
    lowered = filename.lower()

    # Already a plain text file that wouldn't benefit from pandoc so return the content
    if lowered.endswith(plain_text_filetypes):
        # Decode as UTF-8 explicitly instead of relying on the locale default.
        with open(input_file, "r", encoding="utf-8") as f:
            return f.read(), {}

    if lowered.endswith(".pdf"):
        return convert_pdf(input_file)

    return convert_pandoc(input_file, filename), {}
# The filename is requested as a separate text input because the gradio JS
# interface removes this information, and it's critical for choosing the
# correct processing pipeline.
demo = gr.Interface(
    fn=convert,
    inputs=[
        gr.File(label="Upload File", type="filepath"),
        gr.Text(label="Filename"),
    ],
    outputs=[
        gr.Text(label="Markdown"),
        gr.JSON(label="Metadata"),
    ],
)
demo.launch()