Shami96 committed on
Commit
4271bc4
·
verified ·
1 Parent(s): 377eee0

Delete word_extractor.py

Browse files
Files changed (1) hide show
  1. word_extractor.py +0 -61
word_extractor.py DELETED
@@ -1,61 +0,0 @@
1
- # word_extractor.py
2
- from docx import Document
3
- from docx.shared import RGBColor
4
- from collections import defaultdict
5
- from typing import List, Dict
6
-
7
-
8
def is_red_font(run, *, min_red: int = 150, max_green: int = 100,
                max_blue: int = 100) -> bool:
    """Return True when a run's explicit font colour counts as "red".

    A colour is treated as red when its red channel is strictly above
    ``min_red`` and its green and blue channels are strictly below
    ``max_green`` and ``max_blue`` respectively.

    Args:
        run: Object exposing ``font.color.rgb`` as an indexable sequence of
            three channel values (presumably a python-docx ``Run`` - confirm
            against the caller).
        min_red: Exclusive lower bound for the red channel.
        max_green: Exclusive upper bound for the green channel.
        max_blue: Exclusive upper bound for the blue channel.

    Returns:
        False when the run has no colour object or no RGB value, otherwise
        the result of the threshold comparison.
    """
    color = run.font.color
    if not color:
        return False
    rgb = color.rgb
    if not rgb:
        # No explicit RGB value is set, so there is nothing to classify.
        return False
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return red > min_red and green < max_green and blue < max_blue
14
-
15
-
16
def get_full_text_if_red(para):
    """Collect each contiguous stretch of red text in a paragraph.

    Consecutive runs accepted by ``is_red_font`` are concatenated into one
    segment; the first non-red run after them closes the segment. Each
    segment is stripped of surrounding whitespace, and segments that are
    empty after stripping are dropped so callers never receive ``""``.

    Args:
        para: Object exposing ``runs``, an iterable of runs that each have a
            ``text`` attribute (presumably a python-docx ``Paragraph`` -
            confirm against the caller).

    Returns:
        A list of non-empty red text segments in run order.
    """
    red_texts = []
    parts = []  # texts of the red runs in the segment currently being built

    def flush():
        # Close the current segment; keep it only if real text remains.
        segment = "".join(parts).strip()
        parts.clear()
        if segment:
            red_texts.append(segment)

    for run in para.runs:
        if is_red_font(run):
            parts.append(run.text)
        elif parts:
            flush()

    # A paragraph may end while a red segment is still open.
    flush()
    return red_texts
32
-
33
-
34
def extract_red_text_with_labels(doc_path: str) -> Dict[str, List[str]]:
    """Extract red text from a Word document, grouped under labels.

    Grouping rules:
      * Red text in the paragraphs of ``document.paragraphs`` is stored
        under ``"Unlabeled"``.
      * For a table row with two or more cells, the first cell's text (with
        colons removed and newlines replaced by spaces) is the label and
        only the second cell is scanned for red text. A blank label falls
        back to ``"Table_<t>_Row_<r>"`` (1-based).
      * For a table row with exactly one cell, red text is stored under
        ``"Single_Column_Table_<t>"`` (1-based).

    Blank segments are discarded in every branch, and a key is created only
    when there is at least one value to store under it.

    Args:
        doc_path: Path handed to ``Document``.

    Returns:
        A ``defaultdict(list)`` mapping each label to its red text segments.
        NOTE(review): because this is a defaultdict, looking up a missing
        key on the result inserts an empty list.
    """
    document = Document(doc_path)
    results = defaultdict(list)

    def red_segments(paragraphs):
        # One shared filter so paragraph and table branches treat blank
        # segments identically.
        return [
            text
            for para in paragraphs
            for text in get_full_text_if_red(para)
            if text.strip()
        ]

    unlabeled = red_segments(document.paragraphs)
    if unlabeled:
        results["Unlabeled"].extend(unlabeled)

    for table_idx, table in enumerate(document.tables, start=1):
        single_column_key = f"Single_Column_Table_{table_idx}"
        for row_idx, row in enumerate(table.rows, start=1):
            cells = row.cells
            if len(cells) >= 2:
                values = red_segments(cells[1].paragraphs)
                if not values:
                    continue
                # Strip last so that removing ":" or "\n" cannot leave stray
                # edge whitespace in the key (e.g. "Date :" -> "Date").
                label = cells[0].text.replace(":", "").replace("\n", " ").strip()
                key = label if label else f"Table_{table_idx}_Row_{row_idx}"
                results[key].extend(values)
            elif len(cells) == 1:
                values = red_segments(cells[0].paragraphs)
                if values:
                    results[single_column_key].extend(values)
    return results