Shami96 committed on
Commit
97cac57
·
verified ·
1 Parent(s): d77de54

Create hf_utils.py

Browse files
Files changed (1) hide show
  1. hf_utils.py +215 -0
hf_utils.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # hf_utils.py
2
+ """
3
+ Shared helpers for HF red-text extraction / matching.
4
+
5
+ Usage:
6
+ from hf_utils import (
7
+ is_red_font, normalize_text, normalize_header_text,
8
+ flatten_json, find_matching_json_key_and_value,
9
+ get_clean_text, has_red_text, extract_red_text_segments,
10
+ replace_red_text_in_cell, key_is_forbidden_for_position
11
+ )
12
+ """
13
+
14
+ import re
15
+ from typing import Any, Dict, Optional, Tuple
16
+ from docx.shared import RGBColor
17
+
18
+ # -------------------------
19
+ # Red color detection
20
+ # -------------------------
21
def _rgb_looks_red(r, g, b) -> bool:
    """Return True when the (r, g, b) triple is clearly red-dominant.

    The heuristic is intentionally tolerant: red must be at least 160, green
    and blue at most 120, and red must exceed each of them by at least 30.
    """
    return r >= 160 and g <= 120 and b <= 120 and (r - g) >= 30 and (r - b) >= 30


def _first_present_attr(obj, *names):
    """Return the first attribute in *names* whose value is not None.

    Unlike an ``a or b`` chain, a legitimate value of 0 is kept rather than
    being treated as missing.
    """
    for name in names:
        value = getattr(obj, name, None)
        if value is not None:
            return value
    return None


def is_red_font(run) -> bool:
    """Robust red-color detection for docx run objects.

    Two sources are consulted, in order:

    1. ``run.font.color.rgb`` when it is available and truthy. The value is
       read by index first, then through ``r``/``red``, ``g``/``green`` and
       ``b``/``blue`` attributes.
    2. The ``w:color`` element under ``run._element.rPr``, when its ``w:val``
       attribute is a six-digit hex code.

    Both sources use the same tolerant heuristic, so reds other than a strict
    (255, 0, 0) are accepted.

    Args:
        run: A run-like object exposing ``font`` and/or ``_element``.

    Returns:
        True if either source reports a red-dominant colour, else False.
        Any error while inspecting the run is treated as "not red".
    """
    try:
        col = getattr(run.font, "color", None)
        if col is not None and getattr(col, "rgb", None):
            rgb = col.rgb
            try:
                # rgb may be sequence-like
                r, g, b = rgb[0], rgb[1], rgb[2]
            except Exception:
                # Fallback attribute access. A component equal to 0 must be
                # kept, so test against None instead of using `or`.
                r = _first_present_attr(rgb, "r", "red")
                g = _first_present_attr(rgb, "g", "green")
                b = _first_present_attr(rgb, "b", "blue")
            if r is None:
                return False
            # With an incomplete triple, fall through to the XML check.
            if g is not None and b is not None and _rgb_looks_red(r, g, b):
                return True
    except Exception:
        # Best-effort detection: an unreadable font colour is not fatal.
        pass

    # Fallback to the raw XML colour code if present.
    try:
        w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        rPr = run._element.rPr
        if rPr is not None:
            clr = rPr.find(w_ns + 'color')
            if clr is not None:
                val = clr.get(w_ns + 'val')
                # Only six-digit hex codes are parsed; anything else is ignored.
                if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
                    rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
                    if _rgb_looks_red(rr, gg, bb):
                        return True
    except Exception:
        # Best-effort detection: missing or odd XML means "not red".
        pass

    return False
63
+
64
+
65
+ # -------------------------
66
+ # Text normalization
67
+ # -------------------------
68
def normalize_text(s: Optional[str]) -> str:
    """Return *s* as a cleaned, single-spaced string.

    ``None`` becomes the empty string and any other value is passed through
    ``str()``. En and em dashes are turned into ``-``. Every character that
    is not a word character, whitespace, or one of ``# % / - ( )`` is
    replaced by a space. Runs of whitespace are then collapsed and the ends
    trimmed.
    """
    if s is None:
        return ""
    text = str(s)
    for dash in ('\u2013', '\u2014'):
        text = text.replace(dash, '-')
    text = re.sub(r'[^\w\s\#\%\/\-\(\)]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()
76
+
77
def normalize_header_text(s: Optional[str]) -> str:
    """Return a lower-cased, punctuation-free form of a header label.

    Steps, in order: drop parenthesised parts, turn ``/`` and ``\\`` into
    spaces, replace every character that is not a word character, whitespace,
    ``#`` or ``%`` with a space, collapse whitespace, lower-case, then apply
    a fixed list of spelling rewrites. A falsy input gives the empty string.
    """
    if not s:
        return ""
    label = re.sub(r'\([^)]*\)', ' ', s)
    for separator in ("/", "\\"):
        label = label.replace(separator, " ")
    label = re.sub(r'[^\w\s\#\%]', ' ', label)
    label = re.sub(r'\s+', ' ', label).strip().lower()
    # Applied in this exact order; each rewrite sees the previous one's output.
    rewrites = (
        ('registrationno', 'registration number'),
        ('registrationnumber', 'registration number'),
        ('sub-contractor', 'sub contractor'),
        ('sub contracted', 'sub contractor'),
    )
    for old, new in rewrites:
        label = label.replace(old, new)
    return label.strip()
89
+
90
+
91
+ # -------------------------
92
+ # docx helpers
93
+ # -------------------------
94
+ def get_clean_text(cell) -> str:
95
+ out = []
96
+ for paragraph in cell.paragraphs:
97
+ out.append("".join(run.text for run in paragraph.runs))
98
+ return " ".join(out).strip()
99
+
100
def has_red_text(cell) -> bool:
    """Return True if any run in *cell* is red and has non-blank text.

    A run whose inspection raises is skipped rather than aborting the scan.
    """
    def _is_visible_red(run) -> bool:
        # Errors on a single run count as "not red" so the scan continues.
        try:
            return bool(is_red_font(run) and run.text.strip())
        except Exception:
            return False

    return any(
        _is_visible_red(run)
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    )
109
+
110
def extract_red_text_segments(cell):
    """Collect runs of consecutive red, non-blank runs in each paragraph.

    A segment ends at the first run that is not red or whose text is blank,
    and never spans two paragraphs.

    Returns:
        A list of dicts with keys ``'text'`` (the concatenated run text),
        ``'runs'`` (a list of ``(paragraph_idx, run_idx, run)`` tuples) and
        ``'paragraph_idx'``.
    """
    def _as_segment(streak, para_no):
        # Build the record for one unbroken streak of red runs.
        return {
            'text': "".join(run.text for _, _, run in streak),
            'runs': list(streak),
            'paragraph_idx': para_no,
        }

    found = []
    for para_no, para in enumerate(cell.paragraphs):
        streak = []
        for run_no, run in enumerate(para.runs):
            if is_red_font(run) and run.text.strip():
                streak.append((para_no, run_no, run))
                continue
            # A non-red or blank run closes any open streak.
            if streak:
                found.append(_as_segment(streak, para_no))
                streak = []
        # Flush a streak that reaches the end of the paragraph.
        if streak:
            found.append(_as_segment(streak, para_no))
    return found
127
+
128
def replace_red_text_in_cell(cell, replacement_text: str) -> int:
    """Replace all red text in *cell* with *replacement_text*.

    The first red run receives the replacement and is recoloured to
    RGB (0, 0, 0). Every other red run found in the cell is emptied.

    Returns:
        1 if the cell contained red text and was updated, otherwise 0.
    """
    red_segments = extract_red_text_segments(cell)
    if not red_segments:
        return 0

    # Flatten to run objects in document order; the first one is the target.
    red_runs = [run for segment in red_segments for _, _, run in segment['runs']]
    target = red_runs[0]
    target.text = replacement_text
    try:
        target.font.color.rgb = RGBColor(0, 0, 0)
    except Exception:
        # Recolouring is best-effort; the text replacement still stands.
        pass

    for leftover in red_runs[1:]:
        leftover.text = ''
    return 1
145
+
146
+
147
+ # -------------------------
148
+ # JSON helpers & matching
149
+ # -------------------------
150
def flatten_json(y: Dict[str, Any], prefix: str = '') -> Dict[str, Any]:
    """Flatten nested dicts into a single-level dict.

    Each non-dict value is stored twice: under its dotted path (for example
    ``"a.b.c"``) and under its bare leaf key (``"c"``). When two leaves share
    a bare key, the one visited later overwrites the earlier one.

    Args:
        y: The mapping to flatten.
        prefix: Dotted path of the enclosing keys; empty at the top level.
    """
    flat = {}
    for name, value in y.items():
        path = name if not prefix else f"{prefix}.{name}"
        if not isinstance(value, dict):
            flat[path] = value
            flat[name] = value
            continue
        flat.update(flatten_json(value, path))
    return flat
160
+
161
def _significant_tokens(text: str) -> set:
    """Return the lower-cased word tokens of *text* longer than two characters."""
    return {w for w in re.findall(r'\b\w+\b', text.lower()) if len(w) > 2}


def find_matching_json_key_and_value(field_name: str, flat_json: Dict[str, Any]) -> Optional[Tuple[str, Any]]:
    """Find the key in *flat_json* that best matches *field_name*.

    Matching is attempted in this order, returning on the first hit:

    1. exact match on the stripped field name;
    2. case-insensitive match;
    3. match after ``normalize_header_text`` on both sides;
    4. fuzzy match: token overlap scored as ``0.6 * jaccard + 0.4 * coverage``
       or, when no tokens are shared, a normalised substring score scaled by
       0.4. The best-scoring key is accepted only if it scores at least 0.35.

    Args:
        field_name: The label to look up.
        flat_json: A flat mapping of keys to values.

    Returns:
        A ``(key, value)`` tuple for the chosen key, or None if the field name
        is empty or nothing matches well enough.
    """
    if not field_name:
        return None
    fn = field_name.strip()

    # 1. Exact match.
    if fn in flat_json:
        return fn, flat_json[fn]

    # 2. Case-insensitive match (lower-case the field once, not per key).
    fn_lower = fn.lower()
    for k in flat_json:
        if k.lower() == fn_lower:
            return k, flat_json[k]

    # 3. Normalised match. Keep each key's normalised form for the fuzzy
    #    pass so it is not computed a second time.
    clean_field = normalize_header_text(fn)
    normalized_keys = {}
    for k in flat_json:
        nk = normalize_header_text(k)
        if nk == clean_field:
            return k, flat_json[k]
        normalized_keys[k] = nk

    # 4. Fuzzy match on tokens, with a substring fallback.
    field_tokens = _significant_tokens(fn)
    if not field_tokens:
        return None
    best = None
    best_score = 0.0
    for k, v in flat_json.items():
        key_tokens = _significant_tokens(k)
        if not key_tokens:
            continue
        common = field_tokens & key_tokens
        if common:
            sim = len(common) / len(field_tokens | key_tokens)
            cov = len(common) / len(field_tokens)
            score = (0.6 * sim) + (0.4 * cov)
        else:
            nk = normalized_keys[k]
            if clean_field and nk and (clean_field in nk or nk in clean_field):
                substring_score = min(len(clean_field), len(nk)) / max(len(clean_field), len(nk))
                score = 0.4 * substring_score
            else:
                score = 0.0
        # Strict '>' keeps the first key seen among equal scores.
        if score > best_score:
            best_score = score
            best = (k, v)
    if best is not None and best_score >= 0.35:
        return best[0], best[1]
    return None
202
+
203
+ # -------------------------
204
+ # Small safety helpers
205
+ # -------------------------
206
# Substrings that mark a key as forbidden; compared against the lower-cased key.
_POSITION_KEY_BLACKLIST = ["attendance", "attendance list", "attendees", "attendance list (names and position titles)"]


def key_is_forbidden_for_position(key: Optional[str]) -> bool:
    """Return True if *key* contains any blacklisted term, ignoring case.

    A falsy key (``None`` or empty string) is never forbidden.
    """
    if not key:
        return False
    lowered = key.lower()
    return any(term in lowered for term in _POSITION_KEY_BLACKLIST)