Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 2,388 Bytes

import os
import tempfile
from difflib import get_close_matches

import gradio as gr
from docx import Document
from docx.shared import RGBColor

from utils import extract_text_from_pdf, parse_pdf_to_dict

# Improved red color detection
def is_red_color(run, *, min_red=150, max_other=100, min_gap=100):
    """Return True when *run* has an explicit RGB font color that is a red tone.

    Instead of comparing against a fixed list of exact shades, the color is
    classified by its channels, so near-red values such as (254, 3, 2) are
    detected too.  A color counts as red when all of the following hold:

    * the red channel is at least ``min_red``;
    * the green and blue channels are each at most ``max_other``;
    * red exceeds the larger of green/blue by at least ``min_gap``.

    Args:
        run: An object exposing ``run.font.color.rgb`` (a python-docx run).
        min_red: Minimum value (0-255) of the red channel.
        max_other: Maximum value (0-255) allowed for green and blue.
        min_gap: Minimum lead of red over the stronger of green/blue.

    Returns:
        bool: True for a red tone, False otherwise (including when the run
        carries no explicit RGB color).
    """
    font = getattr(run, "font", None)
    color = getattr(font, "color", None)
    rgb = getattr(color, "rgb", None)

    # ``rgb`` is None when no explicit RGB color is set.  Anything that is
    # not a 3-item tuple (presumably e.g. an "auto" color marker -- verify
    # against python-docx) cannot be classified, so treat it as not red.
    if not isinstance(rgb, tuple) or len(rgb) != 3:
        return False

    red, green, blue = rgb
    return (
        red >= min_red
        and green <= max_other
        and blue <= max_other
        and red - max(green, blue) >= min_gap
    )

def _iter_template_paragraphs(container, _seen_cells=None):
    """Yield each paragraph of *container*, descending into its table cells.

    *container* is anything exposing ``paragraphs`` and ``tables`` (the
    document itself or a table cell).  A merged cell can be reported several
    times by ``row.cells``; each underlying cell is visited only once so its
    runs are never processed twice.
    """
    if _seen_cells is None:
        _seen_cells = {}
    yield from container.paragraphs
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                # Identify the cell by its underlying XML element when it is
                # available; keeping the element in the dict keeps its id()
                # stable for the duration of the walk.
                marker = getattr(cell, "_tc", cell)
                if id(marker) in _seen_cells:
                    continue
                _seen_cells[id(marker)] = marker
                yield from _iter_template_paragraphs(cell, _seen_cells)


def replace_red_text_with_data(doc_path, data_dict):
    """Fill the red placeholder runs of a Word template from *data_dict*.

    Every red run (see ``is_red_color``) in the document body and in its
    tables is fuzzy-matched, case-insensitively, against the keys of
    *data_dict*; on a match the run's text is replaced by the corresponding
    value.  Progress is reported with ``print``.

    Args:
        doc_path: Path (or file-like object) accepted by ``Document``.
        data_dict: Mapping of string keys to replacement values.

    Returns:
        The modified ``Document``; it is not saved here.
    """
    doc = Document(doc_path)

    # Build the case-insensitive lookup once instead of once per run.
    # ``setdefault`` keeps the first key when two keys differ only by case,
    # which is the key the original first-hit scan would have picked.
    keys_by_lower = {}
    for key in data_dict:
        keys_by_lower.setdefault(key.lower(), key)
    candidates = list(keys_by_lower)

    matched = 0
    for para in _iter_template_paragraphs(doc):
        for run in para.runs:
            if not is_red_color(run):
                continue
            red_text = run.text.strip()
            print(f"Found red text: {red_text}")
            if not red_text:
                continue
            # Use fuzzy matching
            key_match = get_close_matches(red_text.lower(), candidates, n=1, cutoff=0.6)
            if not key_match:
                continue
            value = data_dict[keys_by_lower[key_match[0]]]
            print(f"Replacing '{red_text}' with '{value}'")
            # Run text must be a string; values are presumably strings
            # already (verify against parse_pdf_to_dict), but coerce so a
            # number or None cannot break the assignment.
            run.text = "" if value is None else str(value)
            matched += 1
    print(f"Total replacements: {matched}")
    return doc

def process_files(pdf_file, template_docx):
    """Fill the Word template with data extracted from the PDF.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path or an object
            with a ``name`` attribute holding the path (Gradio passes one
            or the other depending on version/configuration).
        template_docx: The uploaded ``.docx`` template, same conventions.

    Returns:
        str: Path of the generated ``filled_output.docx``.  The file is
        written into a fresh temporary directory so that concurrent
        requests do not overwrite each other's result.

    Raises:
        ValueError: If either upload is missing.
    """
    if pdf_file is None or template_docx is None:
        raise ValueError(
            "Both a PDF report and a Word template (.docx) must be uploaded."
        )

    def _as_path(upload):
        # Plain paths pass through; temp-file style objects expose the
        # path as ``.name``.
        if isinstance(upload, (str, os.PathLike)):
            return upload
        return getattr(upload, "name", upload)

    pdf_path = _as_path(pdf_file)
    doc_path = _as_path(template_docx)

    raw_text = extract_text_from_pdf(pdf_path)
    data_dict = parse_pdf_to_dict(raw_text)

    final_doc = replace_red_text_with_data(doc_path, data_dict)

    # Create the output directory only after processing succeeded, so a
    # failed request leaves nothing behind.  The file name is unchanged;
    # only its directory is now unique per call.
    output_dir = tempfile.mkdtemp(prefix="audit_report_")
    output_path = os.path.join(output_dir, "filled_output.docx")
    final_doc.save(output_path)
    return output_path

# Upload widgets, in the order process_files receives them: PDF, then template.
_upload_widgets = [
    gr.File(label="Upload PDF Report", file_types=[".pdf"]),
    gr.File(label="Upload Word Template (.docx)", file_types=[".docx"]),
]

# Download widget that receives the path returned by process_files.
_download_widget = gr.File(label="Download Filled Report (.docx)")

# The Gradio interface wiring the two uploads to process_files.
demo = gr.Interface(
    fn=process_files,
    inputs=_upload_widgets,
    outputs=_download_widget,
    title="Audit Report Generator",
    description="Upload a PDF and a Word template. This tool will auto-fill red-colored outdated text with data from the PDF.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()