Spaces:
Running
Running
| # word_extractor.py | |
| from docx import Document | |
| from docx.shared import RGBColor | |
| from collections import defaultdict | |
| from typing import List, Dict | |
def is_red_font(
    run,
    *,
    min_red: int = 150,
    max_green: int = 100,
    max_blue: int = 100,
) -> bool:
    """Return True when *run* has an explicit, predominantly red font colour.

    A colour counts as red when its red channel is strictly above
    ``min_red`` while its green and blue channels are strictly below
    ``max_green`` and ``max_blue`` respectively.

    Args:
        run: Object exposing ``font.color.rgb`` as an indexable
            ``(r, g, b)`` triple (presumably a python-docx ``Run`` --
            verify against callers).
        min_red: Exclusive lower bound for the red channel.
        max_green: Exclusive upper bound for the green channel.
        max_blue: Exclusive upper bound for the blue channel.

    Returns:
        ``False`` when the run has no colour object or no RGB value,
        otherwise the result of the threshold comparison.
    """
    color = run.font.color
    if not color:
        return False

    rgb = color.rgb
    if not rgb:
        # No explicit RGB value is set on this run, so there is nothing to test.
        return False

    red, green, blue = rgb[0], rgb[1], rgb[2]
    return red > min_red and green < max_green and blue < max_blue
def get_full_text_if_red(para):
    """Return the red text segments found in *para*.

    Consecutive runs for which ``is_red_font`` is true are concatenated
    into one segment; the first non-red run ends the segment. Each segment
    is stripped of surrounding whitespace, and segments that are empty
    after stripping (e.g. a red run holding only spaces) are omitted so
    callers never receive blank entries.

    Args:
        para: Object exposing ``runs``, an iterable of run objects that
            each provide ``text`` and are accepted by ``is_red_font``.

    Returns:
        A list of non-empty strings, in document order.
    """
    red_texts = []
    pieces = []  # texts of the red stretch currently being accumulated

    def flush():
        # Close the current stretch; keep it only if it has visible text.
        segment = "".join(pieces).strip()
        if segment:
            red_texts.append(segment)
        pieces.clear()

    for run in para.runs:
        if is_red_font(run):
            pieces.append(run.text)
        elif pieces:
            flush()

    # A red stretch that reaches the end of the paragraph is still open.
    flush()
    return red_texts
def extract_red_text_with_labels(doc_path: str) -> Dict[str, List[str]]:
    """Collect red text from a document, grouped under a label.

    Grouping rules:

    * Red text in ``document.paragraphs`` goes under ``"Unlabeled"``.
    * For table rows with two or more cells, the first cell's text (colons
      removed, newlines turned into spaces, surrounding whitespace
      stripped) is the label and red text is read from the second cell
      only. An empty label falls back to ``"Table_<t>_Row_<r>"``
      (1-based indices).
    * For table rows with exactly one cell, red text goes under
      ``"Single_Column_Table_<t>"``.

    Blank segments are discarded in every branch, so no label ever maps to
    an empty string.

    Args:
        doc_path: Path passed to ``Document`` to open the file.

    Returns:
        A ``defaultdict(list)`` mapping each label to its red text
        segments in document order. Labels with no red text are absent.
    """
    document = Document(doc_path)
    results = defaultdict(list)

    def red_texts_in(paragraphs):
        # Apply the same blank filter everywhere so table values are treated
        # exactly like body paragraphs.
        return [
            text
            for para in paragraphs
            for text in get_full_text_if_red(para)
            if text.strip()
        ]

    for text in red_texts_in(document.paragraphs):
        results["Unlabeled"].append(text)

    for table_idx, table in enumerate(document.tables, start=1):
        for row_idx, row in enumerate(table.rows, start=1):
            cells = row.cells
            if not cells:
                continue

            if len(cells) == 1:
                key = f"Single_Column_Table_{table_idx}"
                value_cell = cells[0]
            else:
                # Strip again after removing ':' so "Name :" yields "Name",
                # not "Name " with a trailing space.
                label = (
                    cells[0].text.strip().replace(":", "").replace("\n", " ").strip()
                )
                key = label or f"Table_{table_idx}_Row_{row_idx}"
                # Only the second cell is treated as the value column.
                value_cell = cells[1]

            # Append one at a time so a key is created only when there is
            # at least one value to store.
            for text in red_texts_in(value_cell.paragraphs):
                results[key].append(text)

    return results