Shami96 commited on
Commit
c38c9d4
Β·
verified Β·
1 Parent(s): ddb37e5

Update updated_word.py

Browse files
Files changed (1) hide show
  1. updated_word.py +793 -194
updated_word.py CHANGED
@@ -3,60 +3,50 @@ from docx import Document
3
  from docx.shared import RGBColor
4
  import re
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def load_json(filepath):
7
  with open(filepath, 'r') as file:
8
  return json.load(file)
9
 
10
- def flatten_json_new_system(json_data):
11
- """Flatten your new JSON structure to work with replacement logic"""
12
- flat_json = {}
13
-
14
- for schema_name, schema_data in json_data.items():
15
- if isinstance(schema_data, dict):
16
- for field_name, values in schema_data.items():
17
- # Handle list values (your system returns lists)
18
- if isinstance(values, list) and values:
19
- value = values[0] if len(values) == 1 else values
20
- else:
21
- value = values
22
-
23
- # Add multiple key variations for better matching
24
- flat_json[field_name] = value
25
- flat_json[field_name.lower()] = value
26
- flat_json[field_name.lower().strip()] = value
27
-
28
- # Add schema-prefixed keys
29
- flat_json[f"{schema_name}.{field_name}"] = value
30
- flat_json[f"{schema_name.lower()}.{field_name.lower()}"] = value
31
-
32
- # Special mappings for common cases
33
- if "print name" in field_name.lower():
34
- flat_json["print name"] = value
35
- flat_json["operator name"] = value
36
- flat_json["name"] = value
37
-
38
- if "position title" in field_name.lower():
39
- flat_json["position title"] = value
40
- flat_json["position"] = value
41
- flat_json["title"] = value
42
-
43
- if "accreditation number" in field_name.lower():
44
- flat_json["accreditation number"] = value
45
- flat_json["nhvas accreditation no"] = value
46
-
47
- if "expiry date" in field_name.lower():
48
- flat_json["expiry date"] = value
49
- flat_json["expiry"] = value
50
-
51
- return flat_json
52
 
53
  def is_red(run):
54
- """Detect red colored text"""
55
  color = run.font.color
56
  return color and (color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
57
 
58
  def get_value_as_string(value, field_name=""):
59
- """Convert value to string, handling lists appropriately"""
60
  if isinstance(value, list):
61
  if len(value) == 0:
62
  return ""
@@ -64,56 +54,60 @@ def get_value_as_string(value, field_name=""):
64
  return str(value[0])
65
  else:
66
  if "australian company number" in field_name.lower() or "company number" in field_name.lower():
67
- return value # Return as list for ACN processing
68
  else:
69
  return " ".join(str(v) for v in value)
70
  else:
71
  return str(value)
72
 
73
  def find_matching_json_value(field_name, flat_json):
74
- """Enhanced matching for your new JSON structure"""
75
  field_name = field_name.strip()
76
 
77
- # Direct match (exact)
78
  if field_name in flat_json:
79
  print(f" βœ… Direct match found for key '{field_name}'")
80
  return flat_json[field_name]
81
 
82
- # Case-insensitive exact match
83
  for key, value in flat_json.items():
84
  if key.lower() == field_name.lower():
85
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
86
  return value
87
 
88
- # Partial matching for common field names
89
- field_lower = field_name.lower().strip()
90
-
91
- # Handle common variations
92
- if "print name" in field_lower:
93
- for key in ["Print Name", "print name", "operator name", "name"]:
94
- if key in flat_json:
95
- print(f" βœ… Print name match: '{field_name}' -> '{key}'")
96
- return flat_json[key]
97
-
98
- if "position title" in field_lower:
99
- for key in ["Position Title", "position title", "position", "title"]:
100
- if key in flat_json:
101
- print(f" βœ… Position title match: '{field_name}' -> '{key}'")
102
- return flat_json[key]
103
-
104
- if "accreditation number" in field_lower:
105
- for key in flat_json.keys():
106
- if "accreditation" in key.lower() and "number" in key.lower():
107
- print(f" βœ… Accreditation number match: '{field_name}' -> '{key}'")
108
- return flat_json[key]
109
-
110
- if "expiry date" in field_lower:
111
- for key in flat_json.keys():
112
- if "expiry" in key.lower():
113
- print(f" βœ… Expiry date match: '{field_name}' -> '{key}'")
114
- return flat_json[key]
115
-
116
- # Fuzzy matching
 
 
 
 
117
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
118
  if not field_words:
119
  return None
@@ -127,9 +121,13 @@ def find_matching_json_value(field_name, flat_json):
127
  if not key_words:
128
  continue
129
 
 
130
  common_words = field_words.intersection(key_words)
131
  if common_words:
 
132
  similarity = len(common_words) / len(field_words.union(key_words))
 
 
133
  coverage = len(common_words) / len(field_words)
134
  final_score = (similarity * 0.6) + (coverage * 0.4)
135
 
@@ -146,7 +144,6 @@ def find_matching_json_value(field_name, flat_json):
146
  return None
147
 
148
  def get_clean_text(cell):
149
- """Extract clean text from cell"""
150
  text = ""
151
  for paragraph in cell.paragraphs:
152
  for run in paragraph.runs:
@@ -154,130 +151,560 @@ def get_clean_text(cell):
154
  return text.strip()
155
 
156
  def has_red_text(cell):
157
- """Check if cell has red text"""
158
  for paragraph in cell.paragraphs:
159
  for run in paragraph.runs:
160
  if is_red(run) and run.text.strip():
161
  return True
162
  return False
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  def replace_red_text_in_cell(cell, replacement_text):
165
- """Replace red text in cell with new text"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  replacements_made = 0
167
 
168
- for paragraph in cell.paragraphs:
169
- for run in paragraph.runs:
170
- if is_red(run) and run.text.strip():
171
- run.text = replacement_text
172
- run.font.color.rgb = RGBColor(0, 0, 0) # Change to black
173
- replacements_made += 1
174
- break # Only replace first red text found
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  return replacements_made
177
 
178
- def handle_australian_company_number(row, company_numbers):
179
- """Handle ACN digit placement"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  replacements_made = 0
181
- for i, digit in enumerate(company_numbers):
182
- cell_idx = i + 1
183
- if cell_idx < len(row.cells):
184
- cell = row.cells[cell_idx]
185
- if has_red_text(cell):
186
- cell_replacements = replace_red_text_in_cell(cell, str(digit))
187
- replacements_made += cell_replacements
188
- print(f" -> Placed digit '{digit}' in cell {cell_idx + 1}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  return replacements_made
190
 
191
- def handle_nature_business_section(cell, flat_json):
192
- """Handle Nature of Business section with sub-fields"""
 
193
  if not has_red_text(cell):
194
  return 0
195
 
 
196
  cell_text = get_clean_text(cell).lower()
197
  if "nature of the operators business" not in cell_text and "nature of the operator business" not in cell_text:
198
  return 0
199
 
200
- print(f" 🎯 Found Nature of Business section")
201
 
202
- # Check for business description
203
- for key in flat_json.keys():
204
- if "nature of the operators business" in key.lower():
205
- business_value = flat_json[key]
206
- replacement_text = get_value_as_string(business_value)
207
- cell_replacements = replace_red_text_in_cell(cell, replacement_text)
208
- if cell_replacements > 0:
209
- print(f" βœ… Updated business description")
210
- return cell_replacements
211
 
212
- return 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
- def handle_operator_declaration_table(table, flat_json):
215
- """Handle Operator Declaration table specifically"""
 
216
  replacements_made = 0
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  for row_idx, row in enumerate(table.rows):
219
  if len(row.cells) >= 2:
220
  cell1_text = get_clean_text(row.cells[0]).strip()
221
  cell2_text = get_clean_text(row.cells[1]).strip()
222
 
223
- # Check if this is the Print Name / Position Title header row
224
- if ("print name" in cell1_text.lower() and "position title" in cell2_text.lower()):
225
- print(f" 🎯 Found Operator Declaration table")
 
 
 
226
 
227
- # Look for data row
228
  if row_idx + 1 < len(table.rows):
229
  data_row = table.rows[row_idx + 1]
230
  if len(data_row.cells) >= 2:
231
  name_cell = data_row.cells[0]
232
  position_cell = data_row.cells[1]
233
 
234
- # Update Print Name
235
  if has_red_text(name_cell):
236
  name_value = None
237
- for key in ["Print Name", "print name", "Operator Declaration.Print Name"]:
238
- if key in flat_json:
239
- name_value = flat_json[key]
240
- break
241
 
242
- if name_value:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  name_text = get_value_as_string(name_value)
244
  cell_replacements = replace_red_text_in_cell(name_cell, name_text)
245
  replacements_made += cell_replacements
246
- print(f" βœ… Updated Print Name: '{name_text}'")
247
 
248
- # Update Position Title
249
  if has_red_text(position_cell):
250
  position_value = None
251
- for key in ["Position Title", "position title", "Operator Declaration.Position Title"]:
252
- if key in flat_json:
253
- position_value = flat_json[key]
254
- break
255
 
256
- if position_value:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  position_text = get_value_as_string(position_value)
258
  cell_replacements = replace_red_text_in_cell(position_cell, position_text)
259
  replacements_made += cell_replacements
260
- print(f" βœ… Updated Position Title: '{position_text}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  break
263
 
264
  return replacements_made
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  def process_tables(document, flat_json):
267
- """Process all tables in document"""
268
  replacements_made = 0
269
 
270
  for table_idx, table in enumerate(document.tables):
271
  print(f"\nπŸ” Processing table {table_idx + 1}:")
272
 
273
- # Check for Operator Declaration table first (priority fix)
274
- if len(table.rows) <= 4: # Small tables
275
- declaration_replacements = handle_operator_declaration_table(table, flat_json)
276
- if declaration_replacements > 0:
277
- replacements_made += declaration_replacements
278
- continue
279
 
280
- # Process all rows
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  for row_idx, row in enumerate(table.rows):
282
  if len(row.cells) < 1:
283
  continue
@@ -290,90 +717,261 @@ def process_tables(document, flat_json):
290
 
291
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
292
 
293
- # Handle Nature of Business section
294
- if "nature of the operators business" in key_text.lower():
295
- nature_replacements = handle_nature_business_section(key_cell, flat_json)
296
- replacements_made += nature_replacements
297
- continue
298
-
299
- # Regular field matching
300
  json_value = find_matching_json_value(key_text, flat_json)
301
 
302
  if json_value is not None:
303
  replacement_text = get_value_as_string(json_value, key_text)
304
 
305
- # Handle Australian Company Number specially
306
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
307
  cell_replacements = handle_australian_company_number(row, json_value)
308
  replacements_made += cell_replacements
309
- else:
310
- # Handle regular fields
311
- for cell_idx in range(len(row.cells)):
312
- cell = row.cells[cell_idx]
 
 
 
313
  if has_red_text(cell):
 
 
 
314
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
315
  replacements_made += cell_replacements
316
  if cell_replacements > 0:
317
- print(f" βœ… Updated cell {cell_idx + 1}: '{replacement_text}'")
 
 
 
 
 
 
 
 
 
 
 
 
318
  else:
319
- # Process any red text in row cells
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  for cell_idx in range(len(row.cells)):
321
  cell = row.cells[cell_idx]
322
  if has_red_text(cell):
323
- # Try to extract red text and match it
324
- red_text = ""
325
- for paragraph in cell.paragraphs:
326
- for run in paragraph.runs:
327
- if is_red(run):
328
- red_text += run.text
329
 
330
- if red_text.strip():
331
- json_value = find_matching_json_value(red_text.strip(), flat_json)
332
- if json_value is not None:
333
- replacement_text = get_value_as_string(json_value)
334
- cell_replacements = replace_red_text_in_cell(cell, replacement_text)
335
- replacements_made += cell_replacements
336
- if cell_replacements > 0:
337
- print(f" βœ… Replaced red text: '{red_text.strip()}' -> '{replacement_text}'")
 
 
 
338
 
339
  return replacements_made
340
 
341
  def process_paragraphs(document, flat_json):
342
- """Process paragraphs for red text"""
343
  replacements_made = 0
344
  print(f"\nπŸ” Processing paragraphs:")
345
 
346
  for para_idx, paragraph in enumerate(document.paragraphs):
347
- red_text = ""
348
- red_runs = []
349
-
350
- for run in paragraph.runs:
351
- if is_red(run) and run.text.strip():
352
- red_text += run.text
353
- red_runs.append(run)
354
-
355
- if red_text.strip():
356
- print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text.strip()}'")
357
 
358
- json_value = find_matching_json_value(red_text.strip(), flat_json)
 
359
 
 
 
 
 
 
 
 
360
  if json_value is not None:
361
  replacement_text = get_value_as_string(json_value)
362
- print(f" βœ… Replacing with: '{replacement_text}'")
 
 
 
 
 
363
 
364
- # Replace in first red run only
365
- if red_runs:
366
- red_runs[0].text = replacement_text
367
- red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
368
- # Clear other red runs
369
- for run in red_runs[1:]:
370
- run.text = ''
371
- replacements_made += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
  return replacements_made
374
 
375
  def process_hf(json_file, docx_file, output_file):
376
- """Main processing function compatible with your new system"""
377
  try:
378
  # Load JSON
379
  if hasattr(json_file, "read"):
@@ -382,8 +980,7 @@ def process_hf(json_file, docx_file, output_file):
382
  with open(json_file, 'r', encoding='utf-8') as f:
383
  json_data = json.load(f)
384
 
385
- # Flatten your new JSON structure
386
- flat_json = flatten_json_new_system(json_data)
387
  print("πŸ“„ Available JSON keys (sample):")
388
  for i, (key, value) in enumerate(sorted(flat_json.items())):
389
  if i < 10:
@@ -396,13 +993,14 @@ def process_hf(json_file, docx_file, output_file):
396
  else:
397
  doc = Document(docx_file)
398
 
399
- # Process document
400
- print("πŸš€ Starting processing compatible with your new system...")
401
 
402
  table_replacements = process_tables(doc, flat_json)
403
  paragraph_replacements = process_paragraphs(doc, flat_json)
 
404
 
405
- total_replacements = table_replacements + paragraph_replacements
406
 
407
  # Save output
408
  if hasattr(output_file, "write"):
@@ -414,6 +1012,7 @@ def process_hf(json_file, docx_file, output_file):
414
  print(f"βœ… Total replacements: {total_replacements}")
415
  print(f" πŸ“Š Tables: {table_replacements}")
416
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
 
417
  print(f"πŸŽ‰ Processing complete!")
418
 
419
  except FileNotFoundError as e:
@@ -426,7 +1025,7 @@ def process_hf(json_file, docx_file, output_file):
426
  if __name__ == "__main__":
427
  import sys
428
  if len(sys.argv) != 4:
429
- print("Usage: python compatible_pipeline.py <input_docx> <updated_json> <output_docx>")
430
  exit(1)
431
  docx_path = sys.argv[1]
432
  json_path = sys.argv[2]
 
3
  from docx.shared import RGBColor
4
  import re
5
 
6
+ # Your original heading patterns (unchanged)
7
+ HEADING_PATTERNS = {
8
+ "main": [
9
+ r"NHVAS\s+Audit\s+Summary\s+Report",
10
+ r"NATIONAL\s+HEAVY\s+VEHICLE\s+ACCREDITATION\s+AUDIT\s+SUMMARY\s+REPORT",
11
+ r"NHVAS\s+AUDIT\s+SUMMARY\s+REPORT"
12
+ ],
13
+ "sub": [
14
+ r"AUDIT\s+OBSERVATIONS\s+AND\s+COMMENTS",
15
+ r"MAINTENANCE\s+MANAGEMENT",
16
+ r"MASS\s+MANAGEMENT",
17
+ r"FATIGUE\s+MANAGEMENT",
18
+ r"Fatigue\s+Management\s+Summary\s+of\s+Audit\s+findings",
19
+ r"MAINTENANCE\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
20
+ r"MASS\s+MANAGEMENT\s+SUMMARY\s+OF\s+AUDIT\s+FINDINGS",
21
+ r"Vehicle\s+Registration\s+Numbers\s+of\s+Records\s+Examined",
22
+ r"CORRECTIVE\s+ACTION\s+REQUEST\s+\(CAR\)",
23
+ r"NHVAS\s+APPROVED\s+AUDITOR\s+DECLARATION",
24
+ r"Operator\s+Declaration",
25
+ r"Operator\s+Information",
26
+ r"Driver\s*/\s*Scheduler\s+Records\s+Examined"
27
+ ]
28
+ }
29
+
30
  def load_json(filepath):
31
  with open(filepath, 'r') as file:
32
  return json.load(file)
33
 
34
+ def flatten_json(y, prefix=''):
35
+ out = {}
36
+ for key, val in y.items():
37
+ new_key = f"{prefix}.{key}" if prefix else key
38
+ if isinstance(val, dict):
39
+ out.update(flatten_json(val, new_key))
40
+ else:
41
+ out[new_key] = val
42
+ out[key] = val
43
+ return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def is_red(run):
 
46
  color = run.font.color
47
  return color and (color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)
48
 
49
  def get_value_as_string(value, field_name=""):
 
50
  if isinstance(value, list):
51
  if len(value) == 0:
52
  return ""
 
54
  return str(value[0])
55
  else:
56
  if "australian company number" in field_name.lower() or "company number" in field_name.lower():
57
+ return value
58
  else:
59
  return " ".join(str(v) for v in value)
60
  else:
61
  return str(value)
62
 
63
  def find_matching_json_value(field_name, flat_json):
64
+ """Your original matching function with minimal improvements"""
65
  field_name = field_name.strip()
66
 
67
+ # Try exact match first
68
  if field_name in flat_json:
69
  print(f" βœ… Direct match found for key '{field_name}'")
70
  return flat_json[field_name]
71
 
72
+ # Try case-insensitive exact match
73
  for key, value in flat_json.items():
74
  if key.lower() == field_name.lower():
75
  print(f" βœ… Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
76
  return value
77
 
78
+ # 🎯 MINIMAL IMPROVEMENT: Better Print Name detection for operator vs auditor
79
+ if field_name.lower().strip() == "print name":
80
+ # Look in the flat_json keys to see what context we're in
81
+ operator_keys = [k for k in flat_json.keys() if "operator" in k.lower() and "print name" in k.lower()]
82
+ auditor_keys = [k for k in flat_json.keys() if "auditor" in k.lower() and ("print name" in k.lower() or "name" in k.lower())]
83
+
84
+ # If we have operator-specific keys, prefer those in operator context
85
+ if operator_keys:
86
+ print(f" βœ… Operator Print Name match: '{field_name}' -> '{operator_keys[0]}'")
87
+ return flat_json[operator_keys[0]]
88
+ elif auditor_keys:
89
+ print(f" βœ… Auditor Name match: '{field_name}' -> '{auditor_keys[0]}'")
90
+ return flat_json[auditor_keys[0]]
91
+
92
+ # Try suffix matching (for nested keys like "section.field")
93
+ for key, value in flat_json.items():
94
+ if '.' in key and key.split('.')[-1].lower() == field_name.lower():
95
+ print(f" βœ… Suffix match found for key '{field_name}' with JSON key '{key}'")
96
+ return value
97
+
98
+ # Try partial matching - remove parentheses and special chars
99
+ clean_field = re.sub(r'[^\w\s]', ' ', field_name.lower()).strip()
100
+ clean_field = re.sub(r'\s+', ' ', clean_field)
101
+
102
+ for key, value in flat_json.items():
103
+ clean_key = re.sub(r'[^\w\s]', ' ', key.lower()).strip()
104
+ clean_key = re.sub(r'\s+', ' ', clean_key)
105
+
106
+ if clean_field == clean_key:
107
+ print(f" βœ… Clean match found for key '{field_name}' with JSON key '{key}'")
108
+ return value
109
+
110
+ # Enhanced fuzzy matching with better scoring
111
  field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
112
  if not field_words:
113
  return None
 
121
  if not key_words:
122
  continue
123
 
124
+ # Calculate similarity score
125
  common_words = field_words.intersection(key_words)
126
  if common_words:
127
+ # Use Jaccard similarity: intersection / union
128
  similarity = len(common_words) / len(field_words.union(key_words))
129
+
130
+ # Bonus for high word coverage in field_name
131
  coverage = len(common_words) / len(field_words)
132
  final_score = (similarity * 0.6) + (coverage * 0.4)
133
 
 
144
  return None
145
 
146
  def get_clean_text(cell):
 
147
  text = ""
148
  for paragraph in cell.paragraphs:
149
  for run in paragraph.runs:
 
151
  return text.strip()
152
 
153
  def has_red_text(cell):
 
154
  for paragraph in cell.paragraphs:
155
  for run in paragraph.runs:
156
  if is_red(run) and run.text.strip():
157
  return True
158
  return False
159
 
160
+ def extract_red_text_segments(cell):
161
+ """Your original red text extraction (unchanged)"""
162
+ red_segments = []
163
+
164
+ for para_idx, paragraph in enumerate(cell.paragraphs):
165
+ current_segment = ""
166
+ segment_runs = []
167
+
168
+ for run_idx, run in enumerate(paragraph.runs):
169
+ if is_red(run):
170
+ if run.text:
171
+ current_segment += run.text
172
+ segment_runs.append((para_idx, run_idx, run))
173
+ else:
174
+ # End of current red segment
175
+ if segment_runs:
176
+ red_segments.append({
177
+ 'text': current_segment,
178
+ 'runs': segment_runs.copy(),
179
+ 'paragraph_idx': para_idx
180
+ })
181
+ current_segment = ""
182
+ segment_runs = []
183
+
184
+ # Handle segment at end of paragraph
185
+ if segment_runs:
186
+ red_segments.append({
187
+ 'text': current_segment,
188
+ 'runs': segment_runs.copy(),
189
+ 'paragraph_idx': para_idx
190
+ })
191
+
192
+ return red_segments
193
+
194
  def replace_red_text_in_cell(cell, replacement_text):
195
+ """Your original replacement function (unchanged)"""
196
+ red_segments = extract_red_text_segments(cell)
197
+
198
+ if not red_segments:
199
+ return 0
200
+
201
+ if len(red_segments) > 1:
202
+ replacements_made = 0
203
+ for segment in red_segments:
204
+ segment_text = segment['text'].strip()
205
+ if segment_text:
206
+ pass
207
+
208
+ if replacements_made == 0:
209
+ return replace_all_red_segments(red_segments, replacement_text)
210
+
211
+ return replace_all_red_segments(red_segments, replacement_text)
212
+
213
+ def replace_all_red_segments(red_segments, replacement_text):
214
+ """Your original function (unchanged)"""
215
+ if not red_segments:
216
+ return 0
217
+
218
+ if '\n' in replacement_text:
219
+ replacement_lines = replacement_text.split('\n')
220
+ else:
221
+ replacement_lines = [replacement_text]
222
+
223
  replacements_made = 0
224
 
225
+ if red_segments and replacement_lines:
226
+ first_segment = red_segments[0]
227
+ if first_segment['runs']:
228
+ first_run = first_segment['runs'][0][2]
229
+ first_run.text = replacement_lines[0]
230
+ first_run.font.color.rgb = RGBColor(0, 0, 0)
231
+ replacements_made = 1
232
+
233
+ for _, _, run in first_segment['runs'][1:]:
234
+ run.text = ''
235
+
236
+ for segment in red_segments[1:]:
237
+ for _, _, run in segment['runs']:
238
+ run.text = ''
239
+
240
+ if len(replacement_lines) > 1 and red_segments:
241
+ try:
242
+ first_run = red_segments[0]['runs'][0][2]
243
+ paragraph = first_run.element.getparent()
244
+
245
+ for line in replacement_lines[1:]:
246
+ if line.strip():
247
+ from docx.oxml import OxmlElement, ns
248
+ br = OxmlElement('w:br')
249
+ first_run.element.append(br)
250
+
251
+ new_run = paragraph.add_run(line.strip())
252
+ new_run.font.color.rgb = RGBColor(0, 0, 0)
253
+ except:
254
+ if red_segments and red_segments[0]['runs']:
255
+ first_run = red_segments[0]['runs'][0][2]
256
+ first_run.text = ' '.join(replacement_lines)
257
+ first_run.font.color.rgb = RGBColor(0, 0, 0)
258
 
259
  return replacements_made
260
 
261
+ def replace_single_segment(segment, replacement_text):
262
+ """Your original function (unchanged)"""
263
+ if not segment['runs']:
264
+ return False
265
+
266
+ first_run = segment['runs'][0][2]
267
+ first_run.text = replacement_text
268
+ first_run.font.color.rgb = RGBColor(0, 0, 0)
269
+
270
+ for _, _, run in segment['runs'][1:]:
271
+ run.text = ''
272
+
273
+ return True
274
+
275
+ def handle_multiple_red_segments_in_cell(cell, flat_json):
276
+ """Your original function (unchanged)"""
277
+ red_segments = extract_red_text_segments(cell)
278
+
279
+ if not red_segments:
280
+ return 0
281
+
282
+ print(f" πŸ” Found {len(red_segments)} red text segments in cell")
283
  replacements_made = 0
284
+ unmatched_segments = []
285
+
286
+ for i, segment in enumerate(red_segments):
287
+ segment_text = segment['text'].strip()
288
+ if not segment_text:
289
+ continue
290
+
291
+ print(f" Segment {i+1}: '{segment_text[:50]}...'")
292
+
293
+ json_value = find_matching_json_value(segment_text, flat_json)
294
+
295
+ if json_value is not None:
296
+ replacement_text = get_value_as_string(json_value, segment_text)
297
+
298
+ if isinstance(json_value, list) and len(json_value) > 1:
299
+ replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
300
+
301
+ success = replace_single_segment(segment, replacement_text)
302
+ if success:
303
+ replacements_made += 1
304
+ print(f" βœ… Replaced segment '{segment_text[:30]}...' with '{replacement_text[:30]}...'")
305
+ else:
306
+ unmatched_segments.append(segment)
307
+ print(f" ⏳ No individual match for segment '{segment_text[:30]}...'")
308
+
309
+ if unmatched_segments and replacements_made == 0:
310
+ combined_text = " ".join(seg['text'] for seg in red_segments).strip()
311
+ print(f" πŸ”„ Trying combined text match: '{combined_text[:50]}...'")
312
+
313
+ json_value = find_matching_json_value(combined_text, flat_json)
314
+ if json_value is not None:
315
+ replacement_text = get_value_as_string(json_value, combined_text)
316
+ if isinstance(json_value, list) and len(json_value) > 1:
317
+ replacement_text = "\n".join(str(item) for item in json_value if str(item).strip())
318
+
319
+ replacements_made = replace_all_red_segments(red_segments, replacement_text)
320
+ print(f" βœ… Replaced combined text with '{replacement_text[:50]}...'")
321
+
322
  return replacements_made
323
 
324
+ # 🎯 SURGICAL FIX 1: Handle Nature of Business multi-line red text
325
+ def handle_nature_business_multiline_fix(cell, flat_json):
326
+ """SURGICAL FIX: Handle multi-line red text in Nature of Business cells"""
327
  if not has_red_text(cell):
328
  return 0
329
 
330
+ # Check if this cell contains "Nature of the Operators Business"
331
  cell_text = get_clean_text(cell).lower()
332
  if "nature of the operators business" not in cell_text and "nature of the operator business" not in cell_text:
333
  return 0
334
 
335
+ print(f" 🎯 SURGICAL FIX: Nature of Business multi-line processing")
336
 
337
+ # Look for sub-fields like "Accreditation Number:" and "Expiry Date:"
338
+ red_segments = extract_red_text_segments(cell)
339
+ replacements_made = 0
 
 
 
 
 
 
340
 
341
+ # Try to replace each segment individually first
342
+ for segment in red_segments:
343
+ segment_text = segment['text'].strip()
344
+ if not segment_text:
345
+ continue
346
+
347
+ json_value = find_matching_json_value(segment_text, flat_json)
348
+ if json_value is not None:
349
+ replacement_text = get_value_as_string(json_value, segment_text)
350
+ success = replace_single_segment(segment, replacement_text)
351
+ if success:
352
+ replacements_made += 1
353
+ print(f" βœ… Fixed segment: '{segment_text[:30]}...'")
354
+
355
+ # If no individual matches, try combined approach
356
+ if replacements_made == 0 and red_segments:
357
+ combined_text = " ".join(seg['text'] for seg in red_segments).strip()
358
+ json_value = find_matching_json_value(combined_text, flat_json)
359
+ if json_value is not None:
360
+ replacement_text = get_value_as_string(json_value, combined_text)
361
+ replacements_made = replace_all_red_segments(red_segments, replacement_text)
362
+ print(f" βœ… Fixed combined text")
363
+
364
+ return replacements_made
365
 
366
+ # 🎯 SURGICAL FIX 2: Handle Operator Declaration table with context awareness
367
+ def handle_operator_declaration_fix(table, flat_json):
368
+ """SURGICAL FIX: Handle Operator Declaration Print Name and Position Title with better context detection"""
369
  replacements_made = 0
370
 
371
+ # Build table context to understand what type of declaration this is
372
+ table_context = ""
373
+ for row in table.rows:
374
+ for cell in row.cells:
375
+ table_context += get_clean_text(cell).lower() + " "
376
+
377
+ # Determine if this is an operator declaration vs auditor declaration
378
+ is_operator_declaration = any(keyword in table_context for keyword in [
379
+ "hereby acknowledge", "findings detailed", "management system",
380
+ "accreditation to be shared", "operator signature"
381
+ ])
382
+
383
+ is_auditor_declaration = any(keyword in table_context for keyword in [
384
+ "nhvas approved auditor", "auditor registration", "hereby certify",
385
+ "auditor signature"
386
+ ])
387
+
388
+ # Process the table based on context
389
  for row_idx, row in enumerate(table.rows):
390
  if len(row.cells) >= 2:
391
  cell1_text = get_clean_text(row.cells[0]).strip()
392
  cell2_text = get_clean_text(row.cells[1]).strip()
393
 
394
+ # Check if this is a header row with Print Name and Position Title
395
+ if ("print name" in cell1_text.lower() and "position title" in cell2_text.lower() and
396
+ len(table.rows) <= 4): # Small table only
397
+
398
+ context_type = "Operator" if is_operator_declaration else ("Auditor" if is_auditor_declaration else "Unknown")
399
+ print(f" 🎯 SURGICAL FIX: {context_type} Declaration table detected")
400
 
401
+ # Look for the data row (should be next row)
402
  if row_idx + 1 < len(table.rows):
403
  data_row = table.rows[row_idx + 1]
404
  if len(data_row.cells) >= 2:
405
  name_cell = data_row.cells[0]
406
  position_cell = data_row.cells[1]
407
 
408
+ # Fix Print Name based on context
409
  if has_red_text(name_cell):
410
  name_value = None
 
 
 
 
411
 
412
+ if is_operator_declaration:
413
+ # Try operator-specific fields first
414
+ for field_attempt in ["Operator Declaration.Print Name", "operator.print name", "Print Name"]:
415
+ name_value = find_matching_json_value(field_attempt, flat_json)
416
+ if name_value is not None:
417
+ break
418
+ elif is_auditor_declaration:
419
+ # Try auditor-specific fields first
420
+ for field_attempt in ["NHVAS Approved Auditor Declaration.Print Name", "auditor name", "auditor", "Print Name"]:
421
+ name_value = find_matching_json_value(field_attempt, flat_json)
422
+ if name_value is not None:
423
+ break
424
+ else:
425
+ # Fallback to generic
426
+ name_value = find_matching_json_value("Print Name", flat_json)
427
+
428
+ if name_value is not None:
429
  name_text = get_value_as_string(name_value)
430
  cell_replacements = replace_red_text_in_cell(name_cell, name_text)
431
  replacements_made += cell_replacements
432
+ print(f" βœ… Fixed {context_type} Print Name: '{name_text}'")
433
 
434
+ # Fix Position Title based on context
435
  if has_red_text(position_cell):
436
  position_value = None
 
 
 
 
437
 
438
+ if is_operator_declaration:
439
+ # Try operator-specific fields first
440
+ for field_attempt in ["Operator Declaration.Position Title", "operator.position title", "Position Title"]:
441
+ position_value = find_matching_json_value(field_attempt, flat_json)
442
+ if position_value is not None:
443
+ break
444
+ elif is_auditor_declaration:
445
+ # Try auditor registration number for auditor declarations
446
+ for field_attempt in ["NHVR or Exemplar Global Auditor Registration Number", "auditor registration", "registration number"]:
447
+ position_value = find_matching_json_value(field_attempt, flat_json)
448
+ if position_value is not None:
449
+ break
450
+ else:
451
+ # Fallback to generic
452
+ position_value = find_matching_json_value("Position Title", flat_json)
453
+
454
+ if position_value is not None:
455
  position_text = get_value_as_string(position_value)
456
  cell_replacements = replace_red_text_in_cell(position_cell, position_text)
457
  replacements_made += cell_replacements
458
+ print(f" βœ… Fixed {context_type} Position/Registration: '{position_text}'")
459
+
460
+ break # Found the table, stop looking
461
+
462
+ return replacements_made
463
+
464
+ def handle_australian_company_number(row, company_numbers):
465
+ """Your original function (unchanged)"""
466
+ replacements_made = 0
467
+ for i, digit in enumerate(company_numbers):
468
+ cell_idx = i + 1
469
+ if cell_idx < len(row.cells):
470
+ cell = row.cells[cell_idx]
471
+ if has_red_text(cell):
472
+ cell_replacements = replace_red_text_in_cell(cell, str(digit))
473
+ replacements_made += cell_replacements
474
+ print(f" -> Placed digit '{digit}' in cell {cell_idx + 1}")
475
+ return replacements_made
476
+
477
+ def handle_vehicle_registration_table(table, flat_json):
478
+ """Your original function (unchanged)"""
479
+ replacements_made = 0
480
+
481
+ # Try to find vehicle registration data
482
+ vehicle_section = None
483
+
484
+ for key, value in flat_json.items():
485
+ if "vehicle registration numbers of records examined" in key.lower():
486
+ if isinstance(value, dict):
487
+ vehicle_section = value
488
+ print(f" βœ… Found vehicle data in key: '{key}'")
489
+ break
490
+
491
+ if not vehicle_section:
492
+ potential_columns = {}
493
+ for key, value in flat_json.items():
494
+ if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
495
+ if "." in key:
496
+ column_name = key.split(".")[-1]
497
+ else:
498
+ column_name = key
499
+ potential_columns[column_name] = value
500
+
501
+ if potential_columns:
502
+ vehicle_section = potential_columns
503
+ print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
504
+ else:
505
+ print(f" ❌ Vehicle registration data not found in JSON")
506
+ return 0
507
+
508
+ print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")
509
+
510
+ # Find header row
511
+ header_row_idx = -1
512
+ header_row = None
513
+
514
+ for row_idx, row in enumerate(table.rows):
515
+ row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
516
+ if "registration" in row_text and "number" in row_text:
517
+ header_row_idx = row_idx
518
+ header_row = row
519
+ break
520
+
521
+ if header_row_idx == -1:
522
+ print(f" ❌ Could not find header row in vehicle table")
523
+ return 0
524
+
525
+ print(f" βœ… Found header row at index {header_row_idx}")
526
+
527
+ # Enhanced column mapping
528
+ column_mapping = {}
529
+ for col_idx, cell in enumerate(header_row.cells):
530
+ header_text = get_clean_text(cell).strip()
531
+ if not header_text or header_text.lower() == "no.":
532
+ continue
533
+
534
+ best_match = None
535
+ best_score = 0
536
+
537
+ normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()
538
+
539
+ for json_key in vehicle_section.keys():
540
+ normalized_json = json_key.lower().strip()
541
+
542
+ if normalized_header == normalized_json:
543
+ best_match = json_key
544
+ best_score = 1.0
545
+ break
546
+
547
+ header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
548
+ json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)
549
+
550
+ if header_words and json_words:
551
+ common_words = header_words.intersection(json_words)
552
+ score = len(common_words) / max(len(header_words), len(json_words))
553
+
554
+ if score > best_score and score >= 0.3:
555
+ best_score = score
556
+ best_match = json_key
557
+
558
+ header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
559
+ json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
560
+
561
+ if header_clean in json_clean or json_clean in header_clean:
562
+ if len(header_clean) > 5 and len(json_clean) > 5:
563
+ substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
564
+ if substring_score > best_score and substring_score >= 0.6:
565
+ best_score = substring_score
566
+ best_match = json_key
567
+
568
+ if best_match:
569
+ column_mapping[col_idx] = best_match
570
+ print(f" πŸ“Œ Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")
571
+
572
+ if not column_mapping:
573
+ print(f" ❌ No column mappings found")
574
+ return 0
575
+
576
+ # Determine data rows needed
577
+ max_data_rows = 0
578
+ for json_key, data in vehicle_section.items():
579
+ if isinstance(data, list):
580
+ max_data_rows = max(max_data_rows, len(data))
581
+
582
+ print(f" πŸ“Œ Need to populate {max_data_rows} data rows")
583
+
584
+ # Process data rows
585
+ for data_row_index in range(max_data_rows):
586
+ table_row_idx = header_row_idx + 1 + data_row_index
587
+
588
+ if table_row_idx >= len(table.rows):
589
+ print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
590
+ print(f" βž• Adding new row for vehicle {data_row_index + 1}")
591
+
592
+ new_row = table.add_row()
593
+ print(f" βœ… Successfully added row {len(table.rows)} to the table")
594
+
595
+ row = table.rows[table_row_idx]
596
+ print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")
597
+
598
+ for col_idx, json_key in column_mapping.items():
599
+ if col_idx < len(row.cells):
600
+ cell = row.cells[col_idx]
601
+
602
+ column_data = vehicle_section.get(json_key, [])
603
+ if isinstance(column_data, list) and data_row_index < len(column_data):
604
+ replacement_value = str(column_data[data_row_index])
605
+
606
+ cell_text = get_clean_text(cell)
607
+ if has_red_text(cell) or not cell_text.strip():
608
+ if not cell_text.strip():
609
+ cell.text = replacement_value
610
+ replacements_made += 1
611
+ print(f" -> Added '{replacement_value}' to empty cell (column '{json_key}')")
612
+ else:
613
+ cell_replacements = replace_red_text_in_cell(cell, replacement_value)
614
+ replacements_made += cell_replacements
615
+ if cell_replacements > 0:
616
+ print(f" -> Replaced red text with '{replacement_value}' (column '{json_key}')")
617
+
618
+ return replacements_made
619
+
620
+ def handle_print_accreditation_section(table, flat_json):
621
+ """Your original function (unchanged)"""
622
+ replacements_made = 0
623
+
624
+ print_data = flat_json.get("print accreditation name.print accreditation name", [])
625
+ if not isinstance(print_data, list) or len(print_data) < 2:
626
+ return 0
627
+
628
+ name_value = print_data[0]
629
+ position_value = print_data[1]
630
+
631
+ print(f" πŸ“‹ Print accreditation data: Name='{name_value}', Position='{position_value}'")
632
+
633
+ for row_idx, row in enumerate(table.rows):
634
+ if len(row.cells) >= 2:
635
+ cell1_text = get_clean_text(row.cells[0]).lower()
636
+ cell2_text = get_clean_text(row.cells[1]).lower()
637
+
638
+ if "print name" in cell1_text and "position title" in cell2_text:
639
+ print(f" πŸ“ Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")
640
+
641
+ if row_idx + 1 < len(table.rows):
642
+ data_row = table.rows[row_idx + 1]
643
+ if len(data_row.cells) >= 2:
644
+ if has_red_text(data_row.cells[0]):
645
+ cell_replacements = replace_red_text_in_cell(data_row.cells[0], name_value)
646
+ replacements_made += cell_replacements
647
+ if cell_replacements > 0:
648
+ print(f" βœ… Replaced Print Name: '{name_value}'")
649
+
650
+ if has_red_text(data_row.cells[1]):
651
+ cell_replacements = replace_red_text_in_cell(data_row.cells[1], position_value)
652
+ replacements_made += cell_replacements
653
+ if cell_replacements > 0:
654
+ print(f" βœ… Replaced Position Title: '{position_value}'")
655
 
656
  break
657
 
658
  return replacements_made
659
 
660
+ def process_single_column_sections(cell, field_name, flat_json):
661
+ """Your original function (unchanged)"""
662
+ json_value = find_matching_json_value(field_name, flat_json)
663
+ if json_value is not None:
664
+ replacement_text = get_value_as_string(json_value, field_name)
665
+ if isinstance(json_value, list) and len(json_value) > 1:
666
+ replacement_text = "\n".join(str(item) for item in json_value)
667
+ if has_red_text(cell):
668
+ print(f" βœ… Replacing red text in single-column section: '{field_name}'")
669
+ print(f" βœ… Replacement text:\n{replacement_text}")
670
+ cell_replacements = replace_red_text_in_cell(cell, replacement_text)
671
+ if cell_replacements > 0:
672
+ print(f" -> Replaced with: '{replacement_text[:100]}...'")
673
+ return cell_replacements
674
+ return 0
675
+
676
  def process_tables(document, flat_json):
677
+ """Your original function with minimal surgical fixes added"""
678
  replacements_made = 0
679
 
680
  for table_idx, table in enumerate(document.tables):
681
  print(f"\nπŸ” Processing table {table_idx + 1}:")
682
 
683
+ # Your original logic
684
+ table_text = ""
685
+ for row in table.rows[:3]:
686
+ for cell in row.cells:
687
+ table_text += get_clean_text(cell).lower() + " "
 
688
 
689
+ # Enhanced vehicle registration detection
690
+ vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
691
+ indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
692
+ if indicator_count >= 2:
693
+ print(f" πŸš— Detected Vehicle Registration table")
694
+ vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
695
+ replacements_made += vehicle_replacements
696
+ continue
697
+
698
+ # Enhanced print accreditation detection
699
+ print_accreditation_indicators = ["print name", "position title"]
700
+ indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
701
+ if indicator_count >= 1:
702
+ print(f" πŸ“‹ Detected Print Accreditation table")
703
+ print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
704
+ replacements_made += print_accreditation_replacements
705
+ continue
706
+
707
+ # Your existing row processing
708
  for row_idx, row in enumerate(table.rows):
709
  if len(row.cells) < 1:
710
  continue
 
717
 
718
  print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")
719
 
 
 
 
 
 
 
 
720
  json_value = find_matching_json_value(key_text, flat_json)
721
 
722
  if json_value is not None:
723
  replacement_text = get_value_as_string(json_value, key_text)
724
 
725
+ # Enhanced ACN handling
726
  if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
727
  cell_replacements = handle_australian_company_number(row, json_value)
728
  replacements_made += cell_replacements
729
+
730
+ # Enhanced section header handling
731
+ elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
732
+ print(f" βœ… Section header detected, checking next row for content...")
733
+ next_row = table.rows[row_idx + 1]
734
+
735
+ for cell_idx, cell in enumerate(next_row.cells):
736
  if has_red_text(cell):
737
+ print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
738
+ if isinstance(json_value, list):
739
+ replacement_text = "\n".join(str(item) for item in json_value)
740
  cell_replacements = replace_red_text_in_cell(cell, replacement_text)
741
  replacements_made += cell_replacements
742
  if cell_replacements > 0:
743
+ print(f" -> Replaced section content with: '{replacement_text[:100]}...'")
744
+
745
+ elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
746
+ if has_red_text(key_cell):
747
+ cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
748
+ replacements_made += cell_replacements
749
+ else:
750
+ for cell_idx in range(1, len(row.cells)):
751
+ value_cell = row.cells[cell_idx]
752
+ if has_red_text(value_cell):
753
+ print(f" βœ… Found red text in column {cell_idx + 1}")
754
+ cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
755
+ replacements_made += cell_replacements
756
  else:
757
+ # Enhanced fallback processing for unmatched keys
758
+ if len(row.cells) == 1 and has_red_text(key_cell):
759
+ red_text = ""
760
+ for paragraph in key_cell.paragraphs:
761
+ for run in paragraph.runs:
762
+ if is_red(run):
763
+ red_text += run.text
764
+ if red_text.strip():
765
+ section_value = find_matching_json_value(red_text.strip(), flat_json)
766
+ if section_value is not None:
767
+ section_replacement = get_value_as_string(section_value, red_text.strip())
768
+ cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
769
+ replacements_made += cell_replacements
770
+
771
+ # Enhanced red text processing for all cells
772
  for cell_idx in range(len(row.cells)):
773
  cell = row.cells[cell_idx]
774
  if has_red_text(cell):
775
+ cell_replacements = handle_multiple_red_segments_in_cell(cell, flat_json)
776
+ replacements_made += cell_replacements
 
 
 
 
777
 
778
+ # 🎯 SURGICAL FIX 1: Only if no replacements were made
779
+ if cell_replacements == 0:
780
+ surgical_fix = handle_nature_business_multiline_fix(cell, flat_json)
781
+ replacements_made += surgical_fix
782
+
783
+ # 🎯 SURGICAL FIX 2: Handle Operator Declaration tables (only check last few tables)
784
+ print(f"\n🎯 SURGICAL FIX: Checking for Operator/Auditor Declaration tables...")
785
+ for table in document.tables[-3:]: # Only check last 3 tables
786
+ if len(table.rows) <= 4: # Only small tables
787
+ declaration_fix = handle_operator_declaration_fix(table, flat_json)
788
+ replacements_made += declaration_fix
789
 
790
  return replacements_made
791
 
792
  def process_paragraphs(document, flat_json):
793
+ """Your original function (unchanged)"""
794
  replacements_made = 0
795
  print(f"\nπŸ” Processing paragraphs:")
796
 
797
  for para_idx, paragraph in enumerate(document.paragraphs):
798
+ red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
799
+ if red_runs:
800
+ full_text = paragraph.text.strip()
801
+ red_text_only = "".join(run.text for run in red_runs).strip()
802
+ print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")
 
 
 
 
 
803
 
804
+ # Your existing matching logic
805
+ json_value = find_matching_json_value(red_text_only, flat_json)
806
 
807
+ if json_value is None:
808
+ # Enhanced pattern matching for signatures and dates
809
+ if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
810
+ json_value = find_matching_json_value("auditor signature", flat_json)
811
+ elif "OPERATOR SIGNATURE" in red_text_only.upper():
812
+ json_value = find_matching_json_value("operator signature", flat_json)
813
+
814
  if json_value is not None:
815
  replacement_text = get_value_as_string(json_value)
816
+ print(f" βœ… Replacing red text with: '{replacement_text}'")
817
+ red_runs[0].text = replacement_text
818
+ red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
819
+ for run in red_runs[1:]:
820
+ run.text = ''
821
+ replacements_made += 1
822
 
823
+ return replacements_made
824
+
825
+ def process_headings(document, flat_json):
826
+ """Your original function (unchanged)"""
827
+ replacements_made = 0
828
+ print(f"\nπŸ” Processing headings:")
829
+
830
+ paragraphs = document.paragraphs
831
+
832
+ for para_idx, paragraph in enumerate(paragraphs):
833
+ paragraph_text = paragraph.text.strip()
834
+
835
+ if not paragraph_text:
836
+ continue
837
+
838
+ # Enhanced heading detection
839
+ matched_heading = None
840
+ for category, patterns in HEADING_PATTERNS.items():
841
+ for pattern in patterns:
842
+ if re.search(pattern, paragraph_text, re.IGNORECASE):
843
+ matched_heading = pattern
844
+ break
845
+ if matched_heading:
846
+ break
847
+
848
+ if matched_heading:
849
+ print(f" πŸ“Œ Found heading at paragraph {para_idx + 1}: '{paragraph_text}'")
850
+
851
+ # Check current heading paragraph
852
+ if has_red_text_in_paragraph(paragraph):
853
+ print(f" πŸ”΄ Found red text in heading itself")
854
+ heading_replacements = process_red_text_in_paragraph(paragraph, paragraph_text, flat_json)
855
+ replacements_made += heading_replacements
856
+
857
+ # Enhanced: Look further ahead for related content
858
+ for next_para_offset in range(1, 6): # Extended range
859
+ next_para_idx = para_idx + next_para_offset
860
+ if next_para_idx >= len(paragraphs):
861
+ break
862
+
863
+ next_paragraph = paragraphs[next_para_idx]
864
+ next_text = next_paragraph.text.strip()
865
+
866
+ if not next_text:
867
+ continue
868
+
869
+ # Stop if we hit another heading
870
+ is_another_heading = False
871
+ for category, patterns in HEADING_PATTERNS.items():
872
+ for pattern in patterns:
873
+ if re.search(pattern, next_text, re.IGNORECASE):
874
+ is_another_heading = True
875
+ break
876
+ if is_another_heading:
877
+ break
878
+
879
+ if is_another_heading:
880
+ break
881
+
882
+ # Process red text with enhanced context
883
+ if has_red_text_in_paragraph(next_paragraph):
884
+ print(f" πŸ”΄ Found red text in paragraph {next_para_idx + 1} after heading: '{next_text[:50]}...'")
885
+
886
+ context_replacements = process_red_text_in_paragraph(
887
+ next_paragraph,
888
+ paragraph_text,
889
+ flat_json
890
+ )
891
+ replacements_made += context_replacements
892
+
893
+ return replacements_made
894
+
895
+ def has_red_text_in_paragraph(paragraph):
896
+ """Your original function (unchanged)"""
897
+ for run in paragraph.runs:
898
+ if is_red(run) and run.text.strip():
899
+ return True
900
+ return False
901
+
902
+ def process_red_text_in_paragraph(paragraph, context_text, flat_json):
903
+ """Your original function (unchanged)"""
904
+ replacements_made = 0
905
+
906
+ red_text_segments = []
907
+ for run in paragraph.runs:
908
+ if is_red(run) and run.text.strip():
909
+ red_text_segments.append(run.text.strip())
910
+
911
+ if not red_text_segments:
912
+ return 0
913
+
914
+ combined_red_text = " ".join(red_text_segments).strip()
915
+ print(f" πŸ” Red text found: '{combined_red_text}'")
916
+
917
+ json_value = None
918
+
919
+ # Strategy 1: Direct matching
920
+ json_value = find_matching_json_value(combined_red_text, flat_json)
921
+
922
+ # Strategy 2: Enhanced context-based matching
923
+ if json_value is None:
924
+ if "NHVAS APPROVED AUDITOR" in context_text.upper():
925
+ auditor_fields = ["auditor name", "auditor", "nhvas auditor", "approved auditor", "print name"]
926
+ for field in auditor_fields:
927
+ json_value = find_matching_json_value(field, flat_json)
928
+ if json_value is not None:
929
+ print(f" βœ… Found auditor match with field: '{field}'")
930
+ break
931
+
932
+ elif "OPERATOR DECLARATION" in context_text.upper():
933
+ operator_fields = ["operator name", "operator", "company name", "organisation name", "print name"]
934
+ for field in operator_fields:
935
+ json_value = find_matching_json_value(field, flat_json)
936
+ if json_value is not None:
937
+ print(f" βœ… Found operator match with field: '{field}'")
938
+ break
939
+
940
+ # Strategy 3: Enhanced context combination
941
+ if json_value is None:
942
+ context_queries = [
943
+ f"{context_text} {combined_red_text}",
944
+ combined_red_text,
945
+ context_text
946
+ ]
947
+
948
+ for query in context_queries:
949
+ json_value = find_matching_json_value(query, flat_json)
950
+ if json_value is not None:
951
+ print(f" βœ… Found match with combined query: '{query[:50]}...'")
952
+ break
953
+
954
+ # Replace if match found
955
+ if json_value is not None:
956
+ replacement_text = get_value_as_string(json_value, combined_red_text)
957
+
958
+ red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
959
+ if red_runs:
960
+ red_runs[0].text = replacement_text
961
+ red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
962
+
963
+ for run in red_runs[1:]:
964
+ run.text = ''
965
+
966
+ replacements_made = 1
967
+ print(f" βœ… Replaced with: '{replacement_text}'")
968
+ else:
969
+ print(f" ❌ No match found for red text: '{combined_red_text}'")
970
 
971
  return replacements_made
972
 
973
  def process_hf(json_file, docx_file, output_file):
974
+ """Your original main function (unchanged)"""
975
  try:
976
  # Load JSON
977
  if hasattr(json_file, "read"):
 
980
  with open(json_file, 'r', encoding='utf-8') as f:
981
  json_data = json.load(f)
982
 
983
+ flat_json = flatten_json(json_data)
 
984
  print("πŸ“„ Available JSON keys (sample):")
985
  for i, (key, value) in enumerate(sorted(flat_json.items())):
986
  if i < 10:
 
993
  else:
994
  doc = Document(docx_file)
995
 
996
+ # Your original processing with surgical fixes
997
+ print("πŸš€ Starting processing with minimal surgical fixes...")
998
 
999
  table_replacements = process_tables(doc, flat_json)
1000
  paragraph_replacements = process_paragraphs(doc, flat_json)
1001
+ heading_replacements = process_headings(doc, flat_json)
1002
 
1003
+ total_replacements = table_replacements + paragraph_replacements + heading_replacements
1004
 
1005
  # Save output
1006
  if hasattr(output_file, "write"):
 
1012
  print(f"βœ… Total replacements: {total_replacements}")
1013
  print(f" πŸ“Š Tables: {table_replacements}")
1014
  print(f" πŸ“ Paragraphs: {paragraph_replacements}")
1015
+ print(f" πŸ“‹ Headings: {heading_replacements}")
1016
  print(f"πŸŽ‰ Processing complete!")
1017
 
1018
  except FileNotFoundError as e:
 
1025
  if __name__ == "__main__":
1026
  import sys
1027
  if len(sys.argv) != 4:
1028
+ print("Usage: python pipeline.py <input_docx> <updated_json> <output_docx>")
1029
  exit(1)
1030
  docx_path = sys.argv[1]
1031
  json_path = sys.argv[2]