File size: 1,992 Bytes
88a026a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# word_extractor.py
from docx import Document
from docx.shared import RGBColor
from collections import defaultdict
from typing import List, Dict


def is_red_font(run) -> bool:
    """Tell whether a run's explicit font colour counts as red.

    The run qualifies when its font has a colour with an RGB value whose
    red component is above 150 and whose green and blue components are
    both below 100.

    Args:
        run: Object exposing ``font.color.rgb``, where ``rgb`` is either
            falsy or an indexable sequence of at least three components.

    Returns:
        True if the colour passes the red thresholds, otherwise False
        (including when no colour or no RGB value is set).
    """
    color = run.font.color
    if not color:
        return False

    rgb = color.rgb
    if not rgb:
        return False

    red, green, blue = rgb[0], rgb[1], rgb[2]
    return red > 150 and green < 100 and blue < 100


def get_full_text_if_red(para):
    """Collect the red text segments of a paragraph.

    Consecutive runs for which ``is_red_font`` returns True are joined into
    a single segment; any other run ends the current segment. Each segment
    is stripped of surrounding whitespace, and segments that are empty after
    stripping are discarded so callers never receive blank entries.

    Args:
        para: Object exposing ``runs``, an iterable of runs that have a
            ``text`` attribute and are accepted by ``is_red_font`` (in this
            file it is called with python-docx paragraphs).

    Returns:
        List of non-empty red text segments in document order.
    """
    red_texts = []
    pieces = []  # texts of the red runs in the segment currently being built

    def flush():
        # Close the current segment; keep it only if something is left
        # after stripping (red runs may hold nothing but whitespace).
        segment = "".join(pieces).strip()
        pieces.clear()
        if segment:
            red_texts.append(segment)

    for run in para.runs:
        if is_red_font(run):
            pieces.append(run.text)
        else:
            flush()

    # A paragraph may end while a red segment is still open.
    flush()
    return red_texts


def _nonblank_red_texts(paragraphs) -> List[str]:
    """Return the red text segments of *paragraphs*, skipping blank ones."""
    return [
        text
        for para in paragraphs
        for text in get_full_text_if_red(para)
        if text.strip()
    ]


def extract_red_text_with_labels(doc_path: str) -> Dict[str, List[str]]:
    """Extract red text from a Word document, grouped under labels.

    Grouping rules:

    * Red text in the document's body paragraphs goes under ``"Unlabeled"``.
    * For a table row with two or more cells, red text in the second cell
      goes under the first cell's text (stripped, colons removed, newlines
      turned into spaces). If that label is empty, the key is
      ``"Table_<t>_Row_<r>"`` with 1-based indices. Cells after the second
      are not scanned.
    * For a table row with a single cell, red text goes under
      ``"Single_Column_Table_<t>"``.

    Blank segments are skipped on every path, so a key is only created when
    it has at least one non-empty value.

    Args:
        doc_path: Path of the document, passed to ``Document``.

    Returns:
        A ``defaultdict(list)`` mapping each label to its red text segments
        in document order.
    """
    document = Document(doc_path)
    results = defaultdict(list)

    # Guarded so that the "Unlabeled" key is not created when there is no
    # red body text (indexing a defaultdict inserts the key).
    body_texts = _nonblank_red_texts(document.paragraphs)
    if body_texts:
        results["Unlabeled"].extend(body_texts)

    for table_idx, table in enumerate(document.tables, start=1):
        for row_idx, row in enumerate(table.rows, start=1):
            cells = row.cells
            if not cells:
                continue

            if len(cells) == 1:
                key = f"Single_Column_Table_{table_idx}"
                values = _nonblank_red_texts(cells[0].paragraphs)
            else:
                # First cell is the label, second cell holds the values.
                label = cells[0].text.strip().replace(":", "").replace("\n", " ")
                key = label if label else f"Table_{table_idx}_Row_{row_idx}"
                values = _nonblank_red_texts(cells[1].paragraphs)

            if values:
                results[key].extend(values)

    return results