Spaces:

Shami96
/

PDF-Data_Extractor

Running

PDF-Data_Extractor / word_extractor.py

Rename word_updater.py to word_extractor.py

88a026a verified 4 months ago

1.99 kB

	# word_extractor.py
	from docx import Document
	from docx.shared import RGBColor
	from collections import defaultdict
	from typing import List, Dict


	def is_red_font(
	    run,
	    *,
	    min_red: int = 150,
	    max_green: int = 100,
	    max_blue: int = 100,
	) -> bool:
	    """Return True when the colour set on the run's font looks red.

	    The colour is read from ``run.font.color.rgb`` and is treated as red
	    when its red channel is strictly above ``min_red`` while the green and
	    blue channels are strictly below ``max_green`` and ``max_blue``.

	    Args:
	        run: Object exposing ``font.color.rgb`` as an indexable sequence of
	            three channel values (red, green, blue), or a falsy value when
	            no colour is available.
	        min_red: Exclusive lower bound for the red channel.
	        max_green: Exclusive upper bound for the green channel.
	        max_blue: Exclusive upper bound for the blue channel.

	    Returns:
	        True if the colour passes all three channel checks; False if it
	        fails any of them or if the run carries no colour information.
	    """
	    # NOTE(review): presumably a colour inherited from a style is not
	    # reported through run.font.color -- confirm against python-docx.
	    color = run.font.color
	    if not color:
	        return False

	    rgb = color.rgb
	    if not rgb:
	        return False

	    red, green, blue = rgb[0], rgb[1], rgb[2]
	    return red > min_red and green < max_green and blue < max_blue


	def get_full_text_if_red(para):
	    """Return the red text spans of a paragraph, one entry per span.

	    Consecutive runs accepted by ``is_red_font`` are concatenated into a
	    single span, so red text that is split across several adjacent runs
	    comes back as one string. A non-red run ends the current span.

	    Args:
	        para: Object exposing ``runs``, an iterable of runs that each have
	            a ``text`` string and can be passed to ``is_red_font``.

	    Returns:
	        List of stripped, non-empty strings in run order. Spans made only
	        of whitespace or of empty runs are omitted, so callers never
	        receive blank entries.
	    """
	    red_texts = []
	    pending = []  # text of the red runs that make up the current span

	    def flush():
	        # Close the current span; keep it only if something is left
	        # once the surrounding whitespace is removed.
	        text = "".join(pending).strip()
	        pending.clear()
	        if text:
	            red_texts.append(text)

	    for run in para.runs:
	        if is_red_font(run):
	            pending.append(run.text)
	        elif pending:
	            flush()

	    # A paragraph may end while a red span is still open.
	    flush()
	    return red_texts


	def _red_texts_in_cell(cell) -> List[str]:
	    """Collect the non-blank red text spans from every paragraph of *cell*."""
	    texts = []
	    for para in cell.paragraphs:
	        texts.extend(t for t in get_full_text_if_red(para) if t.strip())
	    return texts


	def extract_red_text_with_labels(doc_path: str) -> Dict[str, List[str]]:
	    """Extract red text from a Word document, grouped under labels.

	    Red spans found in ``document.paragraphs`` are stored under the key
	    ``"Unlabeled"``. For each table row with at least two cells, the text
	    of the first cell (colons removed, newlines turned into spaces,
	    surrounding whitespace stripped) is the label and the red spans of the
	    second cell are its values; an empty label falls back to
	    ``"Table_<n>_Row_<m>"`` (1-based). For rows with a single cell the red
	    spans are stored under ``"Single_Column_Table_<n>"``.

	    Blank spans are skipped everywhere, and a key is only created when it
	    receives at least one value.

	    Args:
	        doc_path: Path handed to ``Document`` to open the file.

	    Returns:
	        A ``defaultdict(list)`` mapping each label to its red text spans in
	        document order.

	    Raises:
	        Any exception raised by ``Document(doc_path)`` propagates unchanged;
	        nothing is caught here.
	    """
	    document = Document(doc_path)
	    results = defaultdict(list)

	    for para in document.paragraphs:
	        for text in get_full_text_if_red(para):
	            if text.strip():
	                results["Unlabeled"].append(text)

	    for table_idx, table in enumerate(document.tables, start=1):
	        for row_idx, row in enumerate(table.rows, start=1):
	            cells = row.cells
	            if len(cells) >= 2:
	                # NOTE: only the second cell is inspected for values; red
	                # text in any further cells of the row is not collected.
	                values = _red_texts_in_cell(cells[1])
	                if not values:
	                    continue
	                # Strip again after removing the colon so that a label such
	                # as "Name :" does not become the key "Name " with a
	                # trailing space.
	                label = (
	                    cells[0].text.strip().replace(":", "").replace("\n", " ").strip()
	                )
	                key = label if label else f"Table_{table_idx}_Row_{row_idx}"
	                results[key].extend(values)
	            elif len(cells) == 1:
	                values = _red_texts_in_cell(cells[0])
	                if values:
	                    results[f"Single_Column_Table_{table_idx}"].extend(values)
	    return results