Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Community

Shami96 committed on Aug 1

Commit

4271bc4

verified ·

1 Parent(s): 377eee0

Delete word_extractor.py

Browse files

Files changed (1) hide show

word_extractor.py +0 -61

word_extractor.py DELETED Viewed

@@ -1,61 +0,0 @@
-# word_extractor.py
-from docx import Document
-from docx.shared import RGBColor
-from collections import defaultdict
-from typing import List, Dict
def is_red_font(
    run,
    *,
    min_red: int = 150,
    max_green: int = 100,
    max_blue: int = 100,
) -> bool:
    """Return True when *run* has an explicit RGB font color that reads as red.

    Args:
        run: Object exposing ``font.color.rgb``, where ``rgb`` is either
            falsy (no explicit RGB color) or an indexable ``(r, g, b)``
            triple of channel values.
        min_red: The red channel must be strictly greater than this value.
        max_green: The green channel must be strictly less than this value.
        max_blue: The blue channel must be strictly less than this value.

    Returns:
        ``True`` if all three channel bounds hold; ``False`` otherwise,
        including when the run has no color object or no RGB value.
    """
    color = run.font.color
    if not color:
        return False
    rgb = color.rgb
    if not rgb:
        return False
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return red > min_red and green < max_green and blue < max_blue
def get_full_text_if_red(para):
    """Collect the text of each contiguous stretch of red runs in *para*.

    Consecutive runs for which ``is_red_font`` returns true are concatenated
    into a single string; the first non-red run after them ends the stretch.
    Each stretch is whitespace-stripped, and stretches that are empty after
    stripping (for example a red run holding only spaces) are dropped so
    callers never receive blank entries.

    Args:
        para: Object exposing ``runs``, an iterable of runs that each have a
            ``text`` attribute.

    Returns:
        A list of non-empty strings, in the order the stretches occur.
    """
    red_texts = []
    pieces = []  # run texts of the red stretch currently being accumulated

    def flush():
        # Close the current stretch; keep it only if something is left
        # after stripping surrounding whitespace.
        text = "".join(pieces).strip()
        pieces.clear()
        if text:
            red_texts.append(text)

    for run in para.runs:
        if is_red_font(run):
            pieces.append(run.text)
        elif pieces:
            flush()
    # A paragraph may end while still inside a red stretch.
    flush()
    return red_texts
def extract_red_text_with_labels(doc_path: str) -> Dict[str, List[str]]:
    """Extract red-colored text from a Word document, grouped under labels.

    Grouping rules, as implemented below:

    * Red text found in ``document.paragraphs`` is stored under
      ``"Unlabeled"``.
    * For table rows with two or more cells, red text in the second cell is
      stored under the first cell's text (stripped, ``":"`` removed,
      newlines turned into spaces). If that label is empty, the key
      ``"Table_<n>_Row_<m>"`` (1-based) is used instead. Cells beyond the
      second are not inspected.
    * For table rows with exactly one cell, red text is stored under
      ``"Single_Column_Table_<n>"`` (1-based).

    Blank strings are discarded in every branch, so a key is only created
    when it has at least one non-blank value.

    Args:
        doc_path: Path passed to ``Document`` to open the file.

    Returns:
        A ``defaultdict(list)`` mapping each label to its red text snippets.
        Because it is a ``defaultdict``, indexing a missing key on the
        returned object inserts an empty list.
    """
    document = Document(doc_path)
    results = defaultdict(list)

    def red_texts_in(cell):
        # All non-blank red snippets from every paragraph of one table cell.
        return [
            text
            for para in cell.paragraphs
            for text in get_full_text_if_red(para)
            if text.strip()
        ]

    for para in document.paragraphs:
        for text in get_full_text_if_red(para):
            if text.strip():
                results["Unlabeled"].append(text)

    for table_no, table in enumerate(document.tables, start=1):
        for row_no, row in enumerate(table.rows, start=1):
            cells = row.cells
            if len(cells) >= 2:
                values = red_texts_in(cells[1])
                if not values:
                    continue
                label = cells[0].text.strip().replace(":", "").replace("\n", " ")
                key = label if label else f"Table_{table_no}_Row_{row_no}"
                results[key].extend(values)
            elif len(cells) == 1:
                values = red_texts_in(cells[0])
                # Guard so an empty row does not create an empty key.
                if values:
                    results[f"Single_Column_Table_{table_no}"].extend(values)
    return results