Shami96 committed on
Commit
7ec9f58
·
verified ·
1 Parent(s): 5244c54

Delete hf_utils.py

Browse files
Files changed (1) hide show
  1. hf_utils.py +0 -215
hf_utils.py DELETED
@@ -1,215 +0,0 @@
1
- # hf_utils.py
2
- """
3
- Shared helpers for HF red-text extraction / matching.
4
-
5
- Usage:
6
- from hf_utils import (
7
- is_red_font, normalize_text, normalize_header_text,
8
- flatten_json, find_matching_json_key_and_value,
9
- get_clean_text, has_red_text, extract_red_text_segments,
10
- replace_red_text_in_cell, key_is_forbidden_for_position
11
- )
12
- """
13
-
14
- import re
15
- from typing import Any, Dict, Optional, Tuple
16
- from docx.shared import RGBColor
17
-
18
- # -------------------------
19
- # Red color detection
20
- # -------------------------
21
def _is_reddish(r, g, b) -> bool:
    """Return True when the (r, g, b) channels describe a red-dominant colour.

    Tolerant heuristic (not strict 255,0,0): red must be at least 160, green
    and blue at most 120, and red must exceed each of them by 30 or more.
    """
    return r >= 160 and g <= 120 and b <= 120 and (r - g) >= 30 and (r - b) >= 30


def _rgb_channels(rgb):
    """Return ``(r, g, b)`` read from a colour value, or None if unavailable.

    Index access (``rgb[0..2]``) is tried first; otherwise the attributes
    ``r``/``g``/``b`` and then ``red``/``green``/``blue`` are consulted.
    """
    try:
        return rgb[0], rgb[1], rgb[2]
    except Exception:
        # Not sequence-like (or too short): fall back to attribute access.
        pass
    channels = []
    for short_name, long_name in (("r", "red"), ("g", "green"), ("b", "blue")):
        value = getattr(rgb, short_name, None)
        # Explicit None test: a channel value of 0 is valid and must not be
        # mistaken for "missing" (which a plain `or` chain would do).
        if value is None:
            value = getattr(rgb, long_name, None)
        if value is None:
            return None
        channels.append(value)
    return tuple(channels)


def is_red_font(run) -> bool:
    """Robust red-color detection for docx.run objects.

    - checks run.font.color.rgb when available
    - checks run._element.rPr/w:color hex val
    - tolerant to slightly different reds (not strict 255,0,0).

    Both checks are best-effort: any error while inspecting one source is
    ignored and the next source is tried. Returns False when neither source
    reports a red-dominant colour.
    """
    # First source: the colour exposed through the font object.
    try:
        col = getattr(run.font, "color", None)
        rgb = getattr(col, "rgb", None) if col is not None else None
        if rgb is not None:
            channels = _rgb_channels(rgb)
            if channels is not None and _is_reddish(*channels):
                return True
    except Exception:
        # Deliberate best-effort: fall through to the raw XML check.
        pass

    # Second source: the raw w:color element's hex value, if present.
    try:
        rPr = run._element.rPr
        if rPr is not None:
            clr = rPr.find('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color')
            if clr is not None:
                val = clr.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
                # Only plain 6-digit hex values are parsed; anything else is skipped.
                if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
                    rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
                    if _is_reddish(rr, gg, bb):
                        return True
    except Exception:
        # Deliberate best-effort: an uninspectable run is treated as not red.
        pass

    return False
63
-
64
-
65
- # -------------------------
66
- # Text normalization
67
- # -------------------------
68
def normalize_text(s: Optional[str]) -> str:
    """Return *s* as a cleaned, single-spaced string.

    None yields "". Any other value is converted with ``str()``, en/em dashes
    become plain hyphens, every character other than word characters,
    whitespace and ``# % / - ( )`` is replaced by a space, and runs of
    whitespace are collapsed and trimmed.
    """
    if s is None:
        return ""
    text = str(s)
    for dash in ('\u2013', '\u2014'):
        text = text.replace(dash, '-')
    text = re.sub(r'[^\w\s\#\%\/\-\(\)]', ' ', text)
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
76
-
77
def normalize_header_text(s: Optional[str]) -> str:
    """Return a lower-cased, simplified form of a header/key for comparison.

    Falsy input yields "". Otherwise the value is converted with ``str()``
    (so non-string keys do not crash the regex calls), parenthesised parts
    are dropped, slashes and all characters other than word characters,
    whitespace, ``#`` and ``%`` become spaces, whitespace is collapsed, and
    the result is lower-cased. A few known spelling variants are then
    rewritten to one canonical wording.
    """
    if not s:
        return ""
    t = re.sub(r'\([^)]*\)', ' ', str(s))
    t = t.replace("/", " ").replace("\\", " ")
    # Hyphens (and all other punctuation except # and %) turn into spaces here.
    t = re.sub(r'[^\w\s\#\%]', ' ', t)
    t = re.sub(r'\s+', ' ', t).strip().lower()
    t = t.replace('registrationno', 'registration number')
    t = t.replace('registrationnumber', 'registration number')
    # No 'sub-contractor' rewrite is needed: the hyphen was already replaced
    # by a space above, so that spelling can never occur at this point.
    t = t.replace('sub contracted', 'sub contractor')
    return t.strip()
89
-
90
-
91
- # -------------------------
92
- # docx helpers
93
- # -------------------------
94
def get_clean_text(cell) -> str:
    """Return the text of *cell* built from its paragraphs' runs.

    The run texts of each paragraph are concatenated without a separator,
    the paragraphs are joined with a single space, and the result is
    stripped of leading/trailing whitespace.
    """
    paragraph_texts = [
        "".join(run.text for run in paragraph.runs)
        for paragraph in cell.paragraphs
    ]
    return " ".join(paragraph_texts).strip()
99
-
100
def has_red_text(cell) -> bool:
    """Return True if any run in *cell* is red and has non-blank text.

    Runs whose inspection raises an exception are skipped rather than
    aborting the scan.
    """
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            try:
                visible_red = is_red_font(run) and run.text.strip()
            except Exception:
                # Best-effort scan: ignore runs that cannot be inspected.
                continue
            if visible_red:
                return True
    return False
109
-
110
def extract_red_text_segments(cell):
    """Collect maximal streaks of consecutive red, non-blank runs in *cell*.

    Returns a list of dicts, one per streak, each with:
      - 'text': the concatenated text of the streak's runs
      - 'runs': a list of (paragraph_index, run_index, run) tuples
      - 'paragraph_idx': index of the paragraph containing the streak

    A streak ends at the first run that is not red or has only blank text,
    and never continues across a paragraph boundary.
    """
    found = []
    for para_no, para in enumerate(cell.paragraphs):
        streak_texts = []
        streak_runs = []

        def close_streak():
            # Emit the buffered streak (if any) as one segment.
            if streak_runs:
                found.append({
                    'text': "".join(streak_texts),
                    'runs': list(streak_runs),
                    'paragraph_idx': para_no,
                })
                streak_texts.clear()
                streak_runs.clear()

        for run_no, run in enumerate(para.runs):
            if is_red_font(run) and run.text.strip():
                streak_texts.append(run.text)
                streak_runs.append((para_no, run_no, run))
            else:
                close_streak()
        # A streak that reaches the end of the paragraph is closed here.
        close_streak()
    return found
127
-
128
def replace_red_text_in_cell(cell, replacement_text: str) -> int:
    """Replace all red text in *cell* with *replacement_text*.

    The first red run receives the replacement text and an attempt is made
    to set its font colour to black (failures are ignored). Every other red
    run found in the cell is emptied.

    Returns 0 when the cell has no red segments, otherwise 1.
    """
    segments = extract_red_text_segments(cell)
    if not segments:
        return 0

    # Flatten to the run objects, in segment order then run order.
    red_runs = [run for seg in segments for _, _, run in seg['runs']]
    keeper, leftovers = red_runs[0], red_runs[1:]

    keeper.text = replacement_text
    try:
        keeper.font.color.rgb = RGBColor(0, 0, 0)
    except Exception:
        # Recolouring is best-effort; the text replacement still stands.
        pass

    for run in leftovers:
        run.text = ''
    return 1
145
-
146
-
147
- # -------------------------
148
- # JSON helpers & matching
149
- # -------------------------
150
def flatten_json(y: Dict[str, Any], prefix: str = '') -> Dict[str, Any]:
    """Flatten a nested dict into a single-level dict.

    Each non-dict leaf is stored under its dotted path (``"a.b.c"``, with
    *prefix* prepended when given) and additionally under its bare key
    (``"c"``) as a convenience alias. Nested dicts are recursed into and do
    not appear as values themselves.

    A bare-key alias never replaces a value stored under a real full path:
    for ``{"name": "A", "child": {"name": "B"}}`` the key ``"name"`` maps to
    ``"A"`` regardless of key order. When several nested leaves share a bare
    key and no full path has that name, the last one encountered wins.
    """
    out: Dict[str, Any] = {}
    full_paths = set()  # keys of `out` that are genuine dotted paths

    def _walk(node: Dict[str, Any], path_prefix: str) -> None:
        for key, val in node.items():
            full_key = f"{path_prefix}.{key}" if path_prefix else key
            if isinstance(val, dict):
                _walk(val, full_key)
                continue
            # The full path is authoritative and may overwrite an earlier alias.
            out[full_key] = val
            full_paths.add(full_key)
            # Only write the alias when it would not clobber a real path.
            if key not in full_paths:
                out[key] = val

    _walk(y, prefix)
    return out
160
-
161
def find_matching_json_key_and_value(field_name: str, flat_json: Dict[str, Any]) -> Optional[Tuple[str, Any]]:
    """Find the entry of *flat_json* that best matches *field_name*.

    Matching is attempted in order, returning on the first success:
      1. exact key match (after stripping *field_name*)
      2. case-insensitive key match
      3. equality of the ``normalize_header_text`` forms
      4. fuzzy match: token overlap between field and key (tokens are words
         longer than two characters), or, when no tokens are shared, a
         substring relation between the normalized forms. The best-scoring
         key is accepted only if its score is at least 0.35.

    Returns ``(key, value)`` or None when nothing matches. An empty or
    whitespace-only *field_name* never matches.
    """
    if not field_name:
        return None
    fn = field_name.strip()
    if not fn:
        # Nothing left to match on once whitespace is removed.
        return None

    if fn in flat_json:
        return fn, flat_json[fn]

    fn_lower = fn.lower()
    for k in flat_json:
        if k.lower() == fn_lower:
            return k, flat_json[k]

    clean_field = normalize_header_text(fn)
    # An empty normalized form carries no information; comparing it would
    # match any key that also normalizes to "".
    if clean_field:
        for k in flat_json:
            if normalize_header_text(k) == clean_field:
                return k, flat_json[k]

    def _tokens(text: str) -> set:
        # Lower-cased words longer than two characters.
        return {w for w in re.findall(r'\b\w+\b', text.lower()) if len(w) > 2}

    field_tokens = _tokens(fn)
    if not field_tokens:
        return None

    best = None
    best_score = 0.0
    for k, v in flat_json.items():
        key_tokens = _tokens(k)
        if not key_tokens:
            continue
        common = field_tokens & key_tokens
        if common:
            # Blend of Jaccard similarity and coverage of the field's tokens.
            sim = len(common) / len(field_tokens | key_tokens)
            cov = len(common) / len(field_tokens)
            score = (0.6 * sim) + (0.4 * cov)
        else:
            nk = normalize_header_text(k)
            if clean_field and nk and (clean_field in nk or nk in clean_field):
                # Substring relation scored by the length ratio of the two forms.
                substring_score = min(len(clean_field), len(nk)) / max(len(clean_field), len(nk))
                score = 0.4 * substring_score
            else:
                score = 0.0
        # Strict '>' keeps the first key seen among equal scores.
        if score > best_score:
            best_score = score
            best = (k, v)

    if best is not None and best_score >= 0.35:
        return best[0], best[1]
    return None
202
-
203
- # -------------------------
204
- # Small safety helpers
205
- # -------------------------
206
- _POSITION_KEY_BLACKLIST = ["attendance", "attendance list", "attendees", "attendance list (names and position titles)"]
207
-
208
- def key_is_forbidden_for_position(key: Optional[str]) -> bool:
209
- if not key:
210
- return False
211
- lk = key.lower()
212
- for b in _POSITION_KEY_BLACKLIST:
213
- if b in lk:
214
- return True
215
- return False