Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

PDF-Data_Extractor / app.py

Shami96

Update app.py

5fb575a verified 4 months ago

raw

history blame

2.15 kB

	# app.py
	import gradio as gr
	import tempfile
	from pdf_extractor import extract_label_value_pairs
	from word_extractor import extract_red_text_with_labels, is_red_font
	from docx import Document
	from docx.shared import RGBColor
	import difflib


	def find_best_match_label(target_label, pdf_data):
	    """Return the key of ``pdf_data`` that most closely resembles ``target_label``.

	    The comparison is case-insensitive on both sides: the label and every
	    candidate key are lower-cased before fuzzy matching, and the key is
	    returned in its original spelling so it can be used to index ``pdf_data``.

	    Args:
	        target_label: Label text to look up.
	        pdf_data: Mapping whose string keys are the candidate labels.

	    Returns:
	        The best-matching original key, or ``None`` when no key reaches the
	        0.4 similarity cutoff (or ``pdf_data`` is empty).
	    """
	    # Map each lower-cased spelling back to the first original key that
	    # produced it; without this, an upper-case key could never match the
	    # lower-cased target.
	    originals_by_lowered = {}
	    for key in pdf_data:
	        originals_by_lowered.setdefault(key.lower(), key)

	    match = difflib.get_close_matches(
	        target_label.lower(), list(originals_by_lowered), n=1, cutoff=0.4
	    )
	    return originals_by_lowered[match[0]] if match else None


	def replace_red_text_by_label(word_path, label_value_map):
	    """Replace red text in the tables of a Word document and save a copy.

	    For every table row with at least two cells, the text of the first cell
	    (stripped, colons removed, newlines turned into spaces) is used as a
	    label and fuzzy-matched against the keys of ``label_value_map`` via
	    ``find_best_match_label``. When a key matches, the runs of the second
	    cell that ``is_red_font`` reports as red are replaced by the mapped
	    value and recoloured black.

	    Args:
	        word_path: Path of the ``.docx`` file to open.
	        label_value_map: Mapping of label -> replacement value.

	    Returns:
	        Path of the saved copy, ``updated.docx`` inside a freshly created
	        temporary directory.
	    """
	    doc = Document(word_path)

	    for table in doc.tables:
	        for row in table.rows:
	            cells = row.cells
	            if len(cells) < 2:
	                continue

	            label = cells[0].text.strip().replace(":", "").replace("\n", " ")
	            matched_label = find_best_match_label(label, label_value_map)
	            if not matched_label:
	                continue

	            new_value = label_value_map[matched_label]

	            # A single red phrase is often split across several runs.
	            # Writing the value into each of them would repeat it, so only
	            # the first red run of the cell receives the value and the
	            # remaining red runs are emptied.
	            value_written = False
	            for para in cells[1].paragraphs:
	                for run in para.runs:
	                    if not is_red_font(run):
	                        continue
	                    run.text = "" if value_written else new_value
	                    run.font.color.rgb = RGBColor(0, 0, 0)  # make black
	                    value_written = True

	    temp_dir = tempfile.mkdtemp()
	    updated_path = f"{temp_dir}/updated.docx"
	    doc.save(updated_path)
	    return updated_path


	def process_files(pdf_file, word_file):
	    """Fill the red text of an uploaded Word file with values from an uploaded PDF.

	    Args:
	        pdf_file: Uploaded PDF, either an object exposing its path as
	            ``.name`` or a plain path string.
	        word_file: Uploaded Word document, in the same two accepted forms.

	    Returns:
	        Path of the updated Word document produced by
	        ``replace_red_text_by_label``.

	    Raises:
	        ValueError: If either upload is missing.
	    """
	    if pdf_file is None or word_file is None:
	        raise ValueError("Both a PDF file and a Word file must be uploaded.")

	    # Depending on the File component's ``type``, Gradio passes either a
	    # temp-file wrapper (path in ``.name``) or the path string itself.
	    pdf_path = getattr(pdf_file, "name", pdf_file)
	    word_path = getattr(word_file, "name", word_file)

	    pdf_data = extract_label_value_pairs(pdf_path)  # {label: value}

	    # NOTE(review): the returned {label: [red_texts]} mapping is not used
	    # here. The call is kept in case the extractor has effects this file
	    # does not show - confirm and remove if it is pure.
	    extract_red_text_with_labels(word_path)

	    return replace_red_text_by_label(word_path, pdf_data)


	# Build the two upload widgets and the download widget first, then wire
	# them to process_files and start the app.
	upload_inputs = [
	    gr.File(label="Upload PDF File", type="file"),
	    gr.File(label="Upload Word File", type="file"),
	]
	download_output = gr.File(label="Download Updated Word File")

	demo = gr.Interface(
	    fn=process_files,
	    inputs=upload_inputs,
	    outputs=download_output,
	    title="Red Text Replacer (Label-Aware)",
	    description="Upload a PDF and Word document. Red-colored text in the Word doc will be replaced by matching values from the PDF using label-based matching.",
	)
	demo.launch()