Spaces:
Running
Running
File size: 7,345 Bytes
97cac57 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
# hf_utils.py
"""
Shared helpers for HF red-text extraction / matching.
Usage:
from hf_utils import (
is_red_font, normalize_text, normalize_header_text,
flatten_json, find_matching_json_key_and_value,
get_clean_text, has_red_text, extract_red_text_segments,
replace_red_text_in_cell, key_is_forbidden_for_position
)
"""
import re
from typing import Any, Dict, Optional, Tuple
from docx.shared import RGBColor
# -------------------------
# Red color detection
# -------------------------
def _looks_red(r, g, b) -> bool:
    """Return True when the (r, g, b) triple is red-dominant.

    Tolerant heuristic rather than a strict (255, 0, 0) match: red must be
    at least 160, green and blue at most 120, and red must exceed each of
    them by at least 30.
    """
    return r >= 160 and g <= 120 and b <= 120 and (r - g) >= 30 and (r - b) >= 30


def _first_attr(obj, *names):
    """Return the first attribute of *obj* among *names* that is not None."""
    for name in names:
        value = getattr(obj, name, None)
        # Compare against None explicitly: a channel value of 0 is valid.
        if value is not None:
            return value
    return None


def is_red_font(run) -> bool:
    """Robust red-color detection for docx run objects.

    Two sources are consulted, in order:

    1. ``run.font.color.rgb`` when available (indexable, or exposing
       ``r``/``g``/``b`` or ``red``/``green``/``blue`` attributes).
    2. The raw ``w:color`` element under ``run._element.rPr``, whose
       ``w:val`` attribute is read as a 6-digit hex colour.

    Both sources use the same tolerant threshold (see ``_looks_red``), so
    slightly different reds are accepted.

    Args:
        run: A run-like object. Missing attributes are tolerated.

    Returns:
        True if either source reports a red-dominant colour, else False.
        Never raises: detection is deliberately best-effort.
    """
    try:
        col = getattr(run.font, "color", None)
        rgb = getattr(col, "rgb", None) if col is not None else None
        if rgb:
            try:
                # rgb may be sequence-like
                r, g, b = rgb[0], rgb[1], rgb[2]
            except Exception:
                # fallback attribute access
                r = _first_attr(rgb, "r", "red")
                g = _first_attr(rgb, "g", "green")
                b = _first_attr(rgb, "b", "blue")
            # If any channel is unknown, do not give up here: the raw XML
            # below may still carry a usable colour code.
            if r is not None and g is not None and b is not None and _looks_red(r, g, b):
                return True
    except Exception:
        # Best-effort: an unexpected object shape must not abort detection.
        pass

    # fallback to raw XML color code if present
    ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    try:
        rPr = run._element.rPr
        if rPr is not None:
            clr = rPr.find('{' + ns + '}color')
            if clr is not None:
                val = clr.get('{' + ns + '}val')
                # Values that are not six hex digits (e.g. "auto") are ignored.
                if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
                    rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
                    if _looks_red(rr, gg, bb):
                        return True
    except Exception:
        # Best-effort: treat any failure to read the XML as "not red".
        pass
    return False
# -------------------------
# Text normalization
# -------------------------
def normalize_text(s: Optional[str]) -> str:
    """Normalise free text for comparison.

    ``None`` becomes the empty string and any other value is passed
    through ``str()``. En and em dashes are turned into plain hyphens,
    every character that is not a word character, whitespace or one of
    ``# % / - ( )`` is replaced by a space, and whitespace runs are
    collapsed to a single space with the ends trimmed.
    """
    if s is None:
        return ""
    text = str(s)
    for dash in ('\u2013', '\u2014'):
        text = text.replace(dash, '-')
    without_punctuation = re.sub(r'[^\w\s\#\%\/\-\(\)]', ' ', text)
    collapsed = re.sub(r'\s+', ' ', without_punctuation)
    return collapsed.strip()
def normalize_header_text(s: Optional[str]) -> str:
    """Normalise a table/field header for loose equality comparison.

    Steps, in order:

    1. Falsy input (``None``, ``""``) yields ``""``.
    2. Parenthesised fragments such as ``"(if any)"`` are dropped.
    3. Every character that is not a word character, whitespace, ``#`` or
       ``%`` becomes a space. This covers ``/``, ``\\`` and ``-`` too.
    4. Whitespace is collapsed and the text is lower-cased.
    5. A few known spelling variants are rewritten to a canonical form.

    Args:
        s: Header text. Non-string values are coerced with ``str()``, in
           line with ``normalize_text``, instead of raising ``TypeError``.

    Returns:
        The normalised, lower-case header string.
    """
    if not s:
        return ""
    t = re.sub(r'\([^)]*\)', ' ', str(s))
    # Slashes, backslashes and hyphens are not in the allowed class, so this
    # single substitution replaces them with spaces; no separate step needed.
    t = re.sub(r'[^\w\s\#\%]', ' ', t)
    t = re.sub(r'\s+', ' ', t).strip().lower()
    t = t.replace('registrationno', 'registration number')
    t = t.replace('registrationnumber', 'registration number')
    # Hyphens are already spaces here, so "sub-contractor" arrives as
    # "sub contractor"; only the "contracted" variant needs rewriting.
    t = t.replace('sub contracted', 'sub contractor')
    return t.strip()
# -------------------------
# docx helpers
# -------------------------
def get_clean_text(cell) -> str:
    """Return the cell's text: runs concatenated per paragraph, paragraphs
    joined by a single space, and the result stripped at both ends."""
    paragraph_texts = [
        "".join(piece.text for piece in para.runs)
        for para in cell.paragraphs
    ]
    return " ".join(paragraph_texts).strip()
def has_red_text(cell) -> bool:
    """Return True if any run in the cell is red and has non-blank text.

    A run whose check raises is treated as not red and scanning continues
    with the next run.
    """
    def _is_visible_red(candidate) -> bool:
        try:
            return bool(is_red_font(candidate) and candidate.text.strip())
        except Exception:
            return False

    return any(
        _is_visible_red(candidate)
        for para in cell.paragraphs
        for candidate in para.runs
    )
def extract_red_text_segments(cell):
    """Group consecutive red, non-blank runs of each paragraph into segments.

    Returns a list of dicts, in document order, each holding:
      'text'          - concatenated text of the runs in the segment
      'runs'          - list of (paragraph_index, run_index, run) tuples
      'paragraph_idx' - index of the paragraph the segment belongs to

    A run that is not red, or whose text is blank, ends the current
    segment. Segments never span paragraphs.
    """
    segments = []

    def _emit(pieces, members, para_index):
        segments.append({'text': "".join(pieces), 'runs': members, 'paragraph_idx': para_index})

    for p_idx, paragraph in enumerate(cell.paragraphs):
        pieces, members = [], []
        for r_idx, run in enumerate(paragraph.runs):
            if is_red_font(run) and run.text.strip():
                pieces.append(run.text)
                members.append((p_idx, r_idx, run))
                continue
            # A non-red or blank run closes whatever segment is open.
            if members:
                _emit(pieces, members, p_idx)
                pieces, members = [], []
        # Flush a segment that runs to the end of the paragraph.
        if members:
            _emit(pieces, members, p_idx)
    return segments
def replace_red_text_in_cell(cell, replacement_text: str) -> int:
    """Replace all red text in the cell with ``replacement_text``.

    The first red run receives the replacement text and is recoloured
    black (a failure to recolour is ignored). Every other red run, in the
    same segment or in later ones, is emptied.

    Returns 1 when a replacement was made, 0 when the cell has no red text.
    """
    segments = extract_red_text_segments(cell)
    if not segments:
        return 0

    # Flatten the segments into one ordered list of red runs.
    red_runs = [run for seg in segments for _, _, run in seg['runs']]
    head, *tail = red_runs

    head.text = replacement_text
    try:
        head.font.color.rgb = RGBColor(0, 0, 0)
    except Exception:
        pass

    for leftover in tail:
        leftover.text = ''
    return 1
# -------------------------
# JSON helpers & matching
# -------------------------
def _flatten_into(node: Dict[str, Any], prefix: str, out: Dict[str, Any], exact_paths: set) -> None:
    """Walk *node* depth-first, writing leaves into *out*; *exact_paths* tracks real paths."""
    for key, val in node.items():
        full_key = f"{prefix}.{key}" if prefix else key
        if isinstance(val, dict):
            _flatten_into(val, full_key, out, exact_paths)
            continue
        # The full path always wins, even over an alias written earlier.
        out[full_key] = val
        exact_paths.add(full_key)
        # The bare leaf name is only a convenience alias: it must never
        # replace a value stored under that exact path.
        if key not in exact_paths:
            out[key] = val


def flatten_json(y: Dict[str, Any], prefix: str = '') -> Dict[str, Any]:
    """Flatten a nested dict into a single-level dict.

    Every non-dict leaf is stored twice: under its dotted path
    (``"a.b.c"``) and under its bare leaf name (``"c"``) as a convenience
    alias. When several leaves share a bare name, the last one visited
    provides the alias, except that an alias never overrides an entry
    whose real path is that same key. For example, in
    ``{"name": "A", "x": {"name": "B"}}`` the key ``"name"`` stays ``"A"``.

    Nested dicts are recursed into; empty dicts contribute nothing, and
    lists and other values are stored as-is.

    Args:
        y: The (possibly nested) mapping to flatten.
        prefix: Dotted path prepended to every key; empty for the root.

    Returns:
        A new flat dict; ``y`` is not modified.
    """
    out: Dict[str, Any] = {}
    _flatten_into(y, prefix, out, set())
    return out
def _name_tokens(name: str) -> set:
    """Return the lower-cased word tokens of *name* longer than two characters."""
    return {w for w in re.findall(r'\b\w+\b', name.lower()) if len(w) > 2}


def find_matching_json_key_and_value(field_name: str, flat_json: Dict[str, Any]) -> Optional[Tuple[str, Any]]:
    """Find the key in ``flat_json`` that best matches ``field_name``.

    Strategies are tried in order and the first hit wins:

    1. Exact key match on the stripped field name.
    2. Case-insensitive key match.
    3. Equality after ``normalize_header_text`` on both sides.
    4. Fuzzy scoring over every key. With shared tokens the score is
       ``0.6 * Jaccard similarity + 0.4 * coverage of the field tokens``.
       Without shared tokens, if one normalised string contains the
       other, the score is ``0.4 * (shorter length / longer length)``.
       The first key with the strictly highest score is kept and
       accepted only if that score is at least 0.35.

    Args:
        field_name: Label to look up; surrounding whitespace is ignored.
        flat_json: Flat mapping of keys to values.

    Returns:
        ``(key, value)`` for the best match, or ``None`` when the field
        name is empty, has no tokens longer than two characters (and no
        earlier strategy matched), or nothing scores high enough.
    """
    if not field_name:
        return None
    fn = field_name.strip()

    # 1. exact match
    if fn in flat_json:
        return fn, flat_json[fn]

    # 2. case-insensitive match (lower-case the field once, not per key)
    fn_lower = fn.lower()
    for k, v in flat_json.items():
        if k.lower() == fn_lower:
            return k, v

    # 3. match on normalised header text
    clean_field = normalize_header_text(fn)
    for k, v in flat_json.items():
        if normalize_header_text(k) == clean_field:
            return k, v

    # 4. fuzzy token / substring scoring
    field_tokens = _name_tokens(fn)
    if not field_tokens:
        return None

    best = None
    best_score = 0.0
    for k, v in flat_json.items():
        key_tokens = _name_tokens(k)
        if not key_tokens:
            continue
        common = field_tokens & key_tokens
        if common:
            sim = len(common) / len(field_tokens | key_tokens)
            cov = len(common) / len(field_tokens)
            score = (0.6 * sim) + (0.4 * cov)
        else:
            # The normalised field is loop-invariant, so clean_field is
            # reused here instead of being recomputed for every key.
            nk = normalize_header_text(k)
            if clean_field and nk and (clean_field in nk or nk in clean_field):
                substring_score = min(len(clean_field), len(nk)) / max(len(clean_field), len(nk))
                score = 0.4 * substring_score
            else:
                score = 0.0
        # Strict '>' keeps the earliest key among equal scores.
        if score > best_score:
            best_score = score
            best = (k, v)

    if best and best_score >= 0.35:
        return best
    return None
# -------------------------
# Small safety helpers
# -------------------------
# Substrings that mark a JSON key as unusable for a "position" field.
_POSITION_KEY_BLACKLIST = [
    "attendance",
    "attendance list",
    "attendees",
    "attendance list (names and position titles)",
]


def key_is_forbidden_for_position(key: Optional[str]) -> bool:
    """Return True if ``key`` contains any blacklisted substring.

    The comparison is case-insensitive. Falsy keys (``None``, ``""``)
    are never forbidden.
    """
    if not key:
        return False
    lowered = key.lower()
    return any(term in lowered for term in _POSITION_KEY_BLACKLIST)