Spaces:
Running
Running
| # app.py | |
| import gradio as gr | |
| import tempfile | |
| from pdf_extractor import extract_label_value_pairs | |
| from word_extractor import extract_red_text_with_labels, is_red_font | |
| from docx import Document | |
| from docx.shared import RGBColor | |
| import difflib | |
def find_best_match_label(target_label, pdf_data):
    """Return the key of ``pdf_data`` that most closely matches ``target_label``.

    Matching is fuzzy (``difflib.get_close_matches`` with a 0.4 cutoff) and
    case-insensitive: both the target and the candidate keys are lowercased
    before comparison, and the winning key is returned in its original
    spelling so it can be used to index ``pdf_data`` directly.

    Args:
        target_label: Label text to look up.
        pdf_data: Mapping whose string keys are the candidate labels.

    Returns:
        The best-matching key of ``pdf_data``, or ``None`` when no key
        reaches the similarity cutoff (including when ``pdf_data`` is empty).
    """
    # Map each lowercased key back to its original spelling; when two keys
    # differ only by case, the first one encountered wins.
    originals = {}
    for key in pdf_data:
        originals.setdefault(key.lower(), key)

    match = difflib.get_close_matches(
        target_label.lower(), list(originals), n=1, cutoff=0.4
    )
    return originals[match[0]] if match else None
def replace_red_text_by_label(word_path, label_value_map):
    """Fill the red placeholder text of a Word document from a label map.

    Every table row with at least two cells is treated as "label | value":
    the first cell's text (colons removed, newlines turned into spaces) is
    fuzzy-matched against the keys of ``label_value_map`` and, on a match,
    the red-font runs of the second cell are replaced by the mapped value
    and recoloured black.

    Args:
        word_path: Path of the .docx file to read.
        label_value_map: Mapping of label -> replacement text.

    Returns:
        Path of the updated copy, saved as ``updated.docx`` inside a fresh
        temporary directory. The input file itself is not overwritten.
    """
    doc = Document(word_path)
    black = RGBColor(0, 0, 0)

    for table in doc.tables:
        for row in table.rows:
            cells = row.cells
            if len(cells) < 2:
                continue

            label = cells[0].text.strip().replace(":", "").replace("\n", " ")
            matched_label = find_best_match_label(label, label_value_map)
            if not matched_label:
                continue
            new_value = label_value_map[matched_label]

            # One stretch of red text can be split across several runs.
            # Write the value into the first red run only and blank the
            # others; assigning it to every run would repeat the value once
            # per run.
            value_written = False
            for para in cells[1].paragraphs:
                for run in para.runs:
                    if not is_red_font(run):
                        continue
                    run.text = "" if value_written else new_value
                    run.font.color.rgb = black  # make black
                    value_written = True

    temp_dir = tempfile.mkdtemp()
    updated_path = f"{temp_dir}/updated.docx"
    doc.save(updated_path)
    return updated_path
def process_files(pdf_file, word_file):
    """Gradio handler: fill the Word document's red text from the PDF.

    Args:
        pdf_file: Uploaded PDF, either an object exposing its path as
            ``.name`` or a plain path string.
        word_file: Uploaded Word document, in the same forms.

    Returns:
        Path of the updated .docx produced by ``replace_red_text_by_label``.

    Raises:
        ValueError: If either upload is missing (``None``).
    """
    # Fail with a readable message instead of an AttributeError on None.
    if pdf_file is None:
        raise ValueError("Please upload a PDF file.")
    if word_file is None:
        raise ValueError("Please upload a Word file.")

    # Accept both upload objects (path in ``.name``) and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    word_path = getattr(word_file, "name", word_file)

    pdf_data = extract_label_value_pairs(pdf_path)  # {label: value}
    return replace_red_text_by_label(word_path, pdf_data)
# Assemble the UI (two file uploads in, one downloadable file out) and start
# serving it as soon as this module runs.
upload_inputs = [
    gr.File(label="Upload PDF File", type="file"),
    gr.File(label="Upload Word File", type="file"),
]
download_output = gr.File(label="Download Updated Word File")

demo = gr.Interface(
    fn=process_files,
    inputs=upload_inputs,
    outputs=download_output,
    title="Red Text Replacer (Label-Aware)",
    description=(
        "Upload a PDF and Word document. Red-colored text in the Word doc "
        "will be replaced by matching values from the PDF using label-based "
        "matching."
    ),
)
demo.launch()