Shami96 commited on
Commit
e8b46b5
Β·
verified Β·
1 Parent(s): 6cfe4a2

Upload 4 files

Browse files
extract_pdf_data.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pdfplumber
from pdf2image import convert_from_path
import pytesseract

def extract_pdf_full_text(pdf_path, txt_path):
    """Extract the full text of a PDF into a plain-text file.

    Native ("raw") text is pulled page by page with pdfplumber; pages that
    yield no text (typically scanned images) are rasterized and run through
    Tesseract OCR instead.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the UTF-8 text file to write.
    """
    raw_texts = []
    need_ocr = []
    # Step 1: try native text extraction, recording which pages need OCR.
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            text = page.extract_text() or ""
            if text.strip():
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
            else:
                raw_texts.append(None)  # placeholder: OCR fills this slot below
                need_ocr.append(i)

    # Step 2: OCR only the pages with no native text.  Rasterizing the whole
    # document at 300 dpi is expensive, so skip it entirely when no page
    # needs OCR (the original converted unconditionally).
    if need_ocr:
        print("Running OCR where RAW text is missing...")
        images = convert_from_path(pdf_path, dpi=300)
        for idx in need_ocr:
            ocr_text = pytesseract.image_to_string(images[idx])
            raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"

    # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
    result = [txt for txt in raw_texts if txt]
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"βœ… Saved deduped full text to {txt_path}")

if __name__ == "__main__":
    extract_pdf_full_text("test1.pdf", "pdf_all_text_full.txt")
extract_red_text.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import re, json, sys
3
+ from docx import Document
4
+ from docx.oxml.ns import qn
5
+
6
def is_red_font(run) -> bool:
    """Return True if this run is coloured red-ish.

    Checks the python-docx colour object first, then falls back to the raw
    <w:color w:val="XXXXXX"/> element, since Word sometimes stores colour
    only in the XML.  "Red-ish" means a dominant red channel with clearly
    weaker green and blue channels.
    """
    col = run.font.color
    if col and col.rgb:
        r, g, b = col.rgb[0], col.rgb[1], col.rgb[2]
        if r > 150 and g < 100 and b < 100 and (r - g) > 30 and (r - b) > 30:
            return True
    # fallback: raw <w:color w:val="XXXXXX"/>
    rPr = getattr(run._element, "rPr", None)
    if rPr is not None:
        clr = rPr.find(qn('w:color'))
        if clr is not None:
            val = clr.get(qn('w:val'))
            # w:val may be absent or "auto"; re.fullmatch(pattern, None)
            # raises TypeError, so guard before matching the 6-digit hex form.
            if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):
                rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
                if rr > 150 and gg < 100 and bb < 100 and (rr - gg) > 30 and (rr - bb) > 30:
                    return True
    return False
24
+
25
# ─────────────────────────────────────────────────────────────────────────────
# Your template, mapped 1:1 to doc.tables[0..19].
# Each spec describes one table of the audit document:
#   "name"           – section name used as the output key
#   "labels_on_row1" – labels are the column headers in the first row
#   "labels_on_left" – labels sit in the leftmost column, one per data row
#   (neither flag)   – the whole table is collected under a single label
#   "skip_rows"      – leftmost-cell texts (subheadings/blanks) to ignore
MASTER_TABLES = [
    # Table 0: Tick as appropriate (Mass, Maintenance, etc.)
    {
        "name": "Tick as appropriate",
        "labels_on_row1": True,
        "labels": [
            "Mass", "Maintenance", "Basic Fatigue", "Advanced Fatigue",
            "Entry Audit", "Initial Compliance Audit", "Compliance Audit",
            "Spot Check", "Triggered Audit"
        ]
    },

    # Table 1: Audit Information
    {
        "name": "Audit Information",
        "labels_on_left": True,
        "labels": [
            "Date of Audit",
            "Location of audit",
            "Auditor name",
            "Audit Matrix Identifier (Name or Number)",  # Corrected full label
            "Auditor Exemplar Global Reg No.",
            "expiry Date:",
            "NHVR Auditor Registration Number",
            "expiry Date:"  # Note: Duplicate label, might need special handling
        ]
    },

    # Table 2: Operator Information (including contact details)
    {
        "name": "Operator Information",
        "labels_on_left": True,
        "skip_rows": ["Operator contact details", ""],  # Skip subheading and blank rows
        "labels": [
            "Operator name (Legal entity)",
            "NHVAS Accreditation No. (If applicable)",
            "Registered trading name/s",
            "Australian Company Number",
            "NHVAS Manual (Policies and Procedures) developed by",
            "Operator business address",
            "Operator Postal address",
            "Email address",
            "Operator Telephone Number"
        ]
    },

    # Table 3: Attendance List
    {
        "name": "Attendance List (Names and Position Titles)",
        "labels": ["Attendance List (Names and Position Titles)"]
    },

    # Table 4: Nature of the Operators Business
    {
        "name": "Nature of the Operators Business (Summary)",
        "labels": [
            "Nature of the Operators Business (Summary)",
            "Accreditation Number:",
            "Expiry Date:"
        ]
    },

    # Table 5: Accreditation Vehicle Summary
    {
        "name": "Accreditation Vehicle Summary",
        "labels_on_left": True,
        "labels": [
            "Number of powered vehicles",
            "Number of trailing vehicles"
        ]
    },

    # Table 6: Accreditation Driver Summary
    {
        "name": "Accreditation Driver Summary",
        "labels_on_left": True,
        "labels": [
            "Number of drivers in BFM",
            "Number of drivers in AFM"
        ]
    },

    # Table 7: Compliance Codes
    {
        "name": "Compliance Codes",
        "labels_on_row1": True,
        "labels": ["V", "SFI", "NA", "NC", "NAP"]
    },

    # Table 8: Corrective Action Request Identification
    {
        "name": "Corrective Action Request Identification",
        "labels_on_row1": True,
        "labels": ["Title", "Abbreviation", "Description"]
    },

    # Table 9: MASS MANAGEMENT (Standards 1-8)
    {
        "name": "MASS MANAGEMENT",
        "labels_on_left": True,
        "labels": [
            "Std 1. Responsibilities",
            "Std 2. Vehicle Control",
            "Std 3. Vehicle Use",
            "Std 4. Records and Documentation",
            "Std 5. Verification",
            "Std 6. Internal Review",
            "Std 7. Training and Education",
            "Std 8. Maintenance of Suspension"
        ]
    },

    # Table 10: Mass Management Summary of Audit findings (Standards 1-8)
    {
        "name": "Mass Management Summary of Audit findings",
        "labels_on_left": True,
        "labels": [
            "Std 1. Responsibilities",
            "Std 2. Vehicle Control",
            "Std 3. Vehicle Use",
            "Std 4. Records and Documentation",
            "Std 5. Verification",
            "Std 6. Internal Review",
            "Std 7. Training and Education",
            "Std 8. Maintenance of Suspension"
        ]
    },

    # Table 11: Vehicle Registration Numbers of Records Examined
    {
        "name": "Vehicle Registration Numbers of Records Examined",
        "labels_on_row1": True,
        "labels": [
            "No.", "Registration Number",
            "Sub-contractor (Yes/No)",
            "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
            "Weight Verification Records (Date Range)",
            "RFS Suspension Certification # (N/A if not applicable)",
            "Suspension System Maintenance (Date Range)",
            "Trip Records (Date Range)",
            "Fault Recording/ Reporting on Suspension System (Date Range)"
        ]
    },

    # Table 12: Operator's Name (legal entity) - Signature block
    {
        "name": "Operator’s Name (legal entity)",
        "labels": ["Operator’s Name (legal entity)"]
    },

    # Table 13: Non-conformance type
    {
        "name": "Non-conformance type (please tick)",
        "labels": ["Un-conditional", "Conditional"]
    },

    # Table 14: Non-conformance Information
    {
        "name": "Non-conformance Information",
        "labels_on_row1": True,
        "labels": [
            "Non-conformance agreed close out date",
            "Module and Standard",
            "Corrective Action Request (CAR) Number"
        ]
    },

    # Table 15: Non-conformance and action taken
    {
        "name": "Non-conformance and action taken",
        "labels_on_row1": True,
        "labels": [
            "Observed Non-conformance:",
            "Corrective Action taken or to be taken by operator:",
            "Operator or Representative Signature", "Position", "Date"
        ]
    },

    # Table 16: Print Name / Auditor Reg Number
    {
        "name": "Print Name / Auditor Reg Number",
        "labels_on_row1": True,
        "labels": [
            "Print Name",
            "NHVR or Exemplar Global Auditor Registration Number"
        ]
    },

    # Table 17: Audit Declaration
    {
        "name": "Audit Declaration",
        "labels_on_left": True,
        "labels": [
            "Audit was conducted on",
            "Unconditional CARs closed out on:",
            "Conditional CARs to be closed out by:"
        ]
    },

    # Table 18: print accreditation name
    {
        "name": "print accreditation name",
        "labels": ["print accreditation name"]
    },

    # Table 19: Operator Declaration
    {
        "name": "Operator Declaration",
        "labels_on_row1": True,
        "labels": ["Print Name", "Position Title"]
    }
]
239
+
240
def extract_red_text(path):
    """Collect all red-coloured text from the .docx at *path*.

    Tables are walked in order against MASTER_TABLES: each spec picks the
    labelling orientation (header row, left column, or whole-table), and
    red runs are grouped (and de-duplicated) per label.  Red text in body
    paragraphs is grouped under the nearest non-red paragraph above it.

    Returns a dict: {table name: {label: [red strings]}, "paragraphs": {...}}.
    """
    doc = Document(path)

    # debug print
    print(f"Found {len(doc.tables)} tables:")
    for i,t in enumerate(doc.tables):
        print(f" Table#{i}: β€œ{t.rows[0].cells[0].text.strip()[:30]}…”")
    print()

    out = {}
    for ti, spec in enumerate(MASTER_TABLES):
        # More specs than tables: stop at the document's last table.
        if ti >= len(doc.tables):
            break
        tbl = doc.tables[ti]
        name = spec["name"]

        # prepare container & dedupe sets
        collected = {lbl:[] for lbl in spec["labels"]}
        seen = {lbl:set() for lbl in spec["labels"]}

        # choose orientation
        if spec.get("labels_on_row1"):
            # Labels are column headers; data rows start at row 1.
            headers = spec["labels"]
            rows = tbl.rows[1:]
            col_mode = True
        elif spec.get("labels_on_left"):
            headers = spec["labels"]
            # skip any unwanted header/subheading rows
            rows = [
                row for row in tbl.rows[1:]
                if row.cells[0].text.strip() not in spec.get("skip_rows",[])
            ]
            col_mode = False
        else:
            # Whole table collected under a single label (the table name).
            headers = [name]
            rows = tbl.rows
            col_mode = None

        # scan each cell
        for ri,row in enumerate(rows):
            for ci,cell in enumerate(row.cells):
                # Concatenate only the red runs of this cell.
                red = "".join(
                    run.text for p in cell.paragraphs for run in p.runs
                    if is_red_font(run)
                ).strip()
                if not red: continue

                # assign label (column index, row index, or table name)
                if col_mode is True:
                    lbl = headers[ci] if ci<len(headers) else name
                elif col_mode is False:
                    lbl = headers[ri] if ri<len(headers) else name
                else:
                    lbl = name

                # dedupe & collect
                if red not in seen[lbl]:
                    seen[lbl].add(red)
                    collected[lbl].append(red)

        # only keep non-empty labels
        filtered = {l:collected[l] for l in collected if collected[l]}
        if filtered:
            out[name] = filtered

    # paragraphs: red text outside tables
    paras = {}
    for i,para in enumerate(doc.paragraphs):
        red = "".join(r.text for r in para.runs if is_red_font(r)).strip()
        if not red: continue
        # find nearest non-red above (used as the grouping key)
        lab = None
        for j in range(i-1,-1,-1):
            if any(is_red_font(r) for r in doc.paragraphs[j].runs):
                continue
            txt = doc.paragraphs[j].text.strip()
            if txt:
                lab = txt; break
        key = lab or "(para)"
        paras.setdefault(key,[]).append(red)

    if paras:
        out["paragraphs"] = paras
    return out
324
+
325
if __name__=="__main__":
    # Default to test.docx unless a path was given on the command line.
    if len(sys.argv) > 1:
        fn = sys.argv[1]
    else:
        fn = "test.docx"
    word_data = extract_red_text(fn)

    # Persist the extracted red text as JSON for later reuse.
    with open('word_red_data.json', 'w', encoding='utf-8') as f:
        json.dump(word_data, f, indent=2, ensure_ascii=False)

    # Echo the same JSON to the console for immediate feedback.
    print(json.dumps(word_data, indent=2, ensure_ascii=False))
update_docx_with_pdf.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import openai
import json
import os

# SECURITY: the original file hard-coded a live "sk-..." API key in source.
# A key committed to a repository must be considered compromised and rotated.
# Read it from the environment instead.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    raise SystemExit("OPENAI_API_KEY environment variable is not set")

# Load PDF text
WORD_JSON_FILE = "word_red_data.json"
PDF_TEXT_FILE = "pdf_all_text_full.txt"
OUTPUT_FILE = "updated_word_data1.json"

# --- Load files ---
with open(WORD_JSON_FILE, "r", encoding="utf-8") as f:
    word_json = f.read()
with open(PDF_TEXT_FILE, "r", encoding="utf-8") as f:
    pdf_txt = f.read()

# --- Build prompt ---
user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}

Here is the extracted text from a PDF:
{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
"""

# --- Call OpenAI API ---
client = openai.OpenAI(api_key=OPENAI_API_KEY)
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."},
        {"role": "user", "content": user_prompt}
    ],
    max_tokens=4096,
    temperature=0
)

updated_json_str = response.choices[0].message.content.strip()

# --- Try to parse as JSON ---
try:
    parsed = json.loads(updated_json_str)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2, ensure_ascii=False)
    print("βœ… JSON updated and saved to", OUTPUT_FILE)
except Exception as e:
    print("⚠️ Model did not return valid JSON. Raw output below:\n")
    print(updated_json_str)
    print("\n❌ Failed to parse updated JSON:", e)
updated_word.py ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from docx import Document
3
+ from docx.shared import RGBColor
4
+ import re
5
+
6
def load_json(filepath):
    """Load and return the JSON document stored at *filepath*.

    Opens the file as UTF-8 explicitly (the data contains non-ASCII text,
    and the rest of this module already uses explicit UTF-8) so parsing
    does not depend on the platform's default encoding.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)
9
+
10
def flatten_json(y, prefix=''):
    """Flatten nested dicts into dotted keys; leaves are also exposed bare.

    {"a": {"b": 1}} -> {"a.b": 1, "b": 1}.  The bare key enables loose
    lookups; if several leaves share a name, the last one wins.
    """
    flat = {}
    for key, val in y.items():
        dotted = f"{prefix}.{key}" if prefix else key
        if isinstance(val, dict):
            flat.update(flatten_json(val, dotted))
            continue
        flat[dotted] = val
        flat[key] = val
    return flat
20
+
21
def is_red(run):
    """True-ish when the run is pure red (FF0000) or uses theme colour 1."""
    color = run.font.color
    if not color:
        # Preserve the original short-circuit: a missing colour object is
        # returned as-is (falsy) rather than coerced to False.
        return color
    return color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1
24
+
25
def get_value_as_string(value, field_name=""):
    """Render a JSON value for insertion into the document.

    Lists collapse to "" (empty), the single item, or a space-joined
    string — except Australian Company Number fields, which keep the list
    so each digit can later be placed in its own table cell.
    """
    if not isinstance(value, list):
        return str(value)
    if not value:
        return ""
    if len(value) == 1:
        return str(value[0])
    lowered = field_name.lower()
    if "australian company number" in lowered or "company number" in lowered:
        return value
    return " ".join(str(item) for item in value)
38
+
39
def find_matching_json_value(field_name, flat_json):
    """Find matching JSON value based on field name (key).

    Matching is attempted in priority order: (1) hand-curated manual
    mappings for known document phrases, (2) exact key, (3) case-insensitive
    key, (4) dotted-suffix key, (5) key equality after stripping
    parenthesised text, (6) fuzzy word-overlap scoring (>= 50%).
    Returns the matched value, or None when nothing matches.
    """
    field_name = field_name.strip()

    # Manual mapping for specific sections that need special handling.
    # Keys are lower-cased document text (including red placeholder values
    # from a previous audit); values are flattened-JSON keys to pull from.
    manual_mappings = {
        "attendance list name and position title": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
        "attendance list (names and position titles)": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
        "nature of the operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        "nature of the operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        "nature of operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        "nature of operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        # Paragraph-level mappings
        "mass management": "paragraphs.MASS MANAGEMENT",
        "liam herbig": "paragraphs.MASS MANAGEMENT",  # Name should be replaced with company name
        "date": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
        # Date-related mappings
        "13.11.2024": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
        "auditor signature": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
        "operator signature": "paragraphs.I hereby consent to information relating to my Accreditation to be shared with other law enforcement agencies, including a service provider authorised under the Heavy Vehicle National Law.",
        # Specific data mappings
        "jodie jones": "Audit Information.Auditor name",
        "13th november 2024": "Audit Information.Date of Audit",
        "adelaide barossa transport & warehousing pty ltd": "Operator Information.Operator name (Legal entity)",
        "manager": "Operator Information.Operator name (Legal entity)",  # Replace manager title with company name
        "liam herbig –manager": "Operator Information.Operator name (Legal entity)",
        "liam herbig – manager": "Operator Information.Operator name (Legal entity)",
        "deborah herbig – manager": "Operator Information.Operator name (Legal entity)",
        # Contact information mappings (old data in red text -> new data from JSON)
        "141 sitz road callington sa 5254": "Operator Information.Operator business address",  # Replace old address with new
        "po box 743 mt barker sa": "Operator Information.Operator Postal address",  # Replace old postal with new
        "debherbig@bigpond.com": "Operator Information.Email address",  # Replace old email with new
        "0447 710 602": "Operator Information.Operator Telephone Number",  # Replace old phone with new
        # Manual/Version mappings (old version -> new version)
        "mahlo 092021v1": "Operator Information.NHVAS Manual (Policies and Procedures) developed by",  # Replace old manual with new
        # These should stay as they are (no replacement needed, just different format)
        "511840": "Operator Information.NHVAS Accreditation No. (If applicable)",  # Keep accreditation number
        "26th october 2023": "Audit Information.Date of Audit",  # Use audit date instead
        # Std 5 and Std 6 mappings
        "the latest verification was dated 23rdnovember 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
        "the latest verification was dated 23rd november 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
        "internal review was dated 23rd august 2023 with 0 ncr": "Mass Management Summary of Audit findings.Std 6. Internal Review",
        "23rd august2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
        "23rd august 2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
    }

    # Check manual mappings first
    normalized_field = field_name.lower().strip()
    if normalized_field in manual_mappings:
        mapped_key = manual_mappings[normalized_field]
        if mapped_key in flat_json:
            print(f" βœ… Manual mapping found for '{field_name}' -> '{mapped_key}'")
            return flat_json[mapped_key]

    # Try exact match first
    if field_name in flat_json:
        print(f" Direct match found for key '{field_name}'")
        return flat_json[field_name]

    # Try case-insensitive exact match
    for key, value in flat_json.items():
        if key.lower() == field_name.lower():
            print(f" Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
            return value

    # Try to find a key that ends with this field name
    for key, value in flat_json.items():
        if key.endswith('.' + field_name):
            print(f" Suffix match found for key '{field_name}' with JSON key '{key}'")
            return value

    # Try partial matching for fields with parentheses or additional text
    clean_field = re.sub(r'\s*\([^)]*\)', '', field_name).strip()  # Remove parentheses content
    for key, value in flat_json.items():
        clean_key = re.sub(r'\s*\([^)]*\)', '', key).strip()
        if clean_field.lower() == clean_key.lower():
            print(f" Clean match found for key '{field_name}' with JSON key '{key}'")
            return value

    # Try word-based matching - more flexible approach
    # Words shorter than 3 characters are ignored as noise.
    field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
    best_match = None
    best_score = 0

    for key, value in flat_json.items():
        key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
        # Calculate how many words match
        common_words = field_words.intersection(key_words)
        if common_words:
            score = len(common_words) / max(len(field_words), len(key_words))  # Normalized score
            if score > best_score:
                best_score = score
                best_match = (key, value)

    if best_match and best_score >= 0.5:  # At least 50% word overlap
        print(f" Word-based match found for key '{field_name}' with JSON key '{best_match[0]}' (score: {best_score:.2f})")
        return best_match[1]

    # No match found
    print(f" ❌ No match found for '{field_name}'")
    return None
140
+
141
def get_clean_text(cell):
    """Return the cell's run-level text, concatenated and stripped."""
    pieces = [
        run.text
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    ]
    return "".join(pieces).strip()
147
+
148
def has_red_text(cell):
    """True when any run in the cell is red and holds non-whitespace text."""
    return any(
        is_red(run) and run.text.strip()
        for paragraph in cell.paragraphs
        for run in paragraph.runs
    )
154
+
155
def replace_red_text_in_cell(cell, replacement_text):
    """Replace all red text in *cell* with *replacement_text* (in black).

    The replacement goes into the first red run found; every other red run
    in the cell is emptied so no stale red text survives.  Returns 1 when a
    replacement was written, 0 when the cell contained no red runs.
    """
    replacements_made = 0

    # First, collect all red text to show what we're replacing
    all_red_text = ""
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            if is_red(run):
                all_red_text += run.text

    if all_red_text.strip():
        print(f" βœ… Replacing red text: '{all_red_text[:50]}...' β†’ '{replacement_text[:50]}...'")

    # Now replace all red text in the cell with the replacement text.
    # Order matters: only the very first red run (across all paragraphs)
    # receives the new text; the rest are cleared.
    first_replacement_done = False
    for paragraph in cell.paragraphs:
        red_runs = [run for run in paragraph.runs if is_red(run)]
        if red_runs:
            if not first_replacement_done:
                # Replace the first red run with our text
                red_runs[0].text = replacement_text
                red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
                first_replacement_done = True
                replacements_made = 1
            else:
                # Clear the first red run since we already replaced content
                red_runs[0].text = ''

            # Clear all other red runs in this paragraph
            for run in red_runs[1:]:
                run.text = ''

    return replacements_made
188
+
189
def handle_australian_company_number(row, company_numbers):
    """Write each company-number digit into its own cell of *row*.

    Cells are offset by one (cell 0 holds the label); only cells that still
    contain red placeholder text are touched.  Returns the number of
    replacements performed.
    """
    replaced = 0
    for offset, digit in enumerate(company_numbers, start=1):
        if offset >= len(row.cells):
            continue
        target = row.cells[offset]
        if not has_red_text(target):
            continue
        replaced += replace_red_text_in_cell(target, str(digit))
        print(f" -> Placed digit '{digit}' in cell {offset + 1}")
    return replaced
200
+
201
def handle_vehicle_registration_table(table, flat_json):
    """Handle the Vehicle Registration Numbers table with column-based data.

    Locates the vehicle-registration section in the flattened JSON (either
    as a nested dict or by gathering matching flattened keys), fuzzy-maps
    table header cells to JSON column keys, then fills one table row per
    JSON array entry — adding rows to the table when it is too short.
    Returns the number of cells written.
    """
    replacements_made = 0

    # Look for the vehicle registration data in the flattened JSON
    vehicle_section = None

    # Try to find the vehicle registration section
    for key, value in flat_json.items():
        if "vehicle registration numbers of records examined" in key.lower():
            if isinstance(value, dict):  # This should be the nested structure
                vehicle_section = value
                print(f" βœ… Found vehicle data in key: '{key}'")
                break

    if not vehicle_section:
        # Try alternative approach - look for individual column keys
        potential_columns = {}
        for key, value in flat_json.items():
            if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
                # Extract the column name from the flattened key
                if "." in key:
                    column_name = key.split(".")[-1]
                else:
                    column_name = key
                potential_columns[column_name] = value

        if potential_columns:
            vehicle_section = potential_columns
            print(f" βœ… Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
        else:
            print(f" ❌ Vehicle registration data not found in JSON")
            return 0

    print(f" βœ… Found vehicle registration data with {len(vehicle_section)} columns")

    # Find header row (usually row 0 or 1)
    header_row_idx = -1
    header_row = None

    for row_idx, row in enumerate(table.rows):
        row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
        if "registration" in row_text and "number" in row_text:
            header_row_idx = row_idx
            header_row = row
            break

    if header_row_idx == -1:
        print(f" ❌ Could not find header row in vehicle table")
        return 0

    print(f" βœ… Found header row at index {header_row_idx}")

    # Create mapping between column indices and JSON keys
    column_mapping = {}
    for col_idx, cell in enumerate(header_row.cells):
        header_text = get_clean_text(cell).strip()
        # The "No." numbering column carries no data to map.
        if not header_text or header_text.lower() == "no.":
            continue

        # Try to match header text with JSON keys
        best_match = None
        best_score = 0

        # Normalize header text for better matching
        normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()

        for json_key in vehicle_section.keys():
            normalized_json = json_key.lower().strip()

            # Try exact match first (after normalization)
            if normalized_header == normalized_json:
                best_match = json_key
                best_score = 1.0
                break

            # Try word-based matching (words under 3 chars ignored as noise)
            header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
            json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)

            if header_words and json_words:
                common_words = header_words.intersection(json_words)
                score = len(common_words) / max(len(header_words), len(json_words))

                if score > best_score and score >= 0.3:  # At least 30% match
                    best_score = score
                    best_match = json_key

            # Try substring matching for cases like "RegistrationNumber" vs "Registration Number"
            header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
            json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")

            if header_clean in json_clean or json_clean in header_clean:
                if len(header_clean) > 5 and len(json_clean) > 5:  # Only for meaningful matches
                    substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
                    if substring_score > best_score and substring_score >= 0.6:
                        best_score = substring_score
                        best_match = json_key

        if best_match:
            column_mapping[col_idx] = best_match
            print(f" πŸ“Œ Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")

    if not column_mapping:
        print(f" ❌ No column mappings found")
        return 0

    # Determine how many data rows we need based on the JSON arrays
    max_data_rows = 0
    for json_key, data in vehicle_section.items():
        if isinstance(data, list):
            max_data_rows = max(max_data_rows, len(data))

    print(f" πŸ“Œ Need to populate {max_data_rows} data rows")

    # Process all required data rows
    for data_row_index in range(max_data_rows):
        table_row_idx = header_row_idx + 1 + data_row_index

        # Check if this table row exists, if not, add it
        if table_row_idx >= len(table.rows):
            print(f" ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
            print(f" βž• Adding new row for vehicle {data_row_index + 1}")

            # Add a new row to the table (appended at the end; the row object
            # itself is fetched below through table.rows)
            new_row = table.add_row()
            print(f" βœ… Successfully added row {len(table.rows)} to the table")

        row = table.rows[table_row_idx]
        print(f" πŸ“Œ Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")

        # Fill in data for each mapped column
        for col_idx, json_key in column_mapping.items():
            if col_idx < len(row.cells):
                cell = row.cells[col_idx]

                # Get the data for this column and row
                column_data = vehicle_section.get(json_key, [])
                if isinstance(column_data, list) and data_row_index < len(column_data):
                    replacement_value = str(column_data[data_row_index])

                    # Check if cell has red text or is empty (needs data)
                    cell_text = get_clean_text(cell)
                    if has_red_text(cell) or not cell_text.strip():
                        # If cell is empty, add the text directly
                        if not cell_text.strip():
                            cell.text = replacement_value
                            replacements_made += 1
                            print(f" -> Added '{replacement_value}' to empty cell (column '{json_key}')")
                        else:
                            # If cell has red text, replace it
                            cell_replacements = replace_red_text_in_cell(cell, replacement_value)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f" -> Replaced red text with '{replacement_value}' (column '{json_key}')")

    return replacements_made
358
+
359
def handle_print_accreditation_section(table, flat_json):
    """Handle the special case of print accreditation name with 2 values.

    Expects the flattened JSON to carry a two-element list under
    "print accreditation name.print accreditation name": the signer's name
    followed by their position title. Locates the "Print Name" /
    "Position Title" header row and writes both values into the red cells
    of the row immediately below it.

    Returns the number of red-text replacements performed.
    """
    replacements_made = 0

    values = flat_json.get("print accreditation name.print accreditation name", [])
    # Nothing to do unless we have both the name and the position.
    if not isinstance(values, list) or len(values) < 2:
        return 0

    name_value, position_value = values[0], values[1]
    print(f" πŸ“‹ Print accreditation data: Name='{name_value}', Position='{position_value}'")

    for row_idx, row in enumerate(table.rows):
        if len(row.cells) < 2:
            continue

        first_header = get_clean_text(row.cells[0]).lower()
        second_header = get_clean_text(row.cells[1]).lower()
        if "print name" not in first_header or "position title" not in second_header:
            continue

        print(f" πŸ“ Found header row {row_idx + 1}: '{first_header}' | '{second_header}'")

        # The values belong in the row directly under the header row.
        if row_idx + 1 < len(table.rows):
            data_row = table.rows[row_idx + 1]
            if len(data_row.cells) >= 2:
                # (cell index, replacement value, label used in the log line)
                targets = (
                    (0, name_value, "Print Name"),
                    (1, position_value, "Position Title"),
                )
                for cell_no, value, label in targets:
                    target_cell = data_row.cells[cell_no]
                    if has_red_text(target_cell):
                        done = replace_red_text_in_cell(target_cell, value)
                        replacements_made += done
                        if done > 0:
                            print(f" βœ… Replaced {label}: '{value}'")

        break  # Found the section, no need to continue

    return replacements_made
404
+
405
def process_single_column_sections(cell, field_name, flat_json):
    """Replace red text in a single-column section cell.

    Looks up `field_name` in the flattened JSON; multi-item list values are
    joined with newlines, everything else is rendered via
    get_value_as_string(). Returns the number of replacements made
    (0 when there is no matching key or no red text in the cell).
    """
    matched = find_matching_json_value(field_name, flat_json)
    if matched is None:
        return 0

    text = get_value_as_string(matched, field_name)
    # Lists with several entries are laid out one item per line.
    if isinstance(matched, list) and len(matched) > 1:
        text = "\n".join(str(item) for item in matched)

    if not has_red_text(cell):
        return 0

    print(f" βœ… Replacing red text in single-column section: '{field_name}'")
    print(f" βœ… Replacement text:\n{text}")
    made = replace_red_text_in_cell(cell, text)
    if made > 0:
        print(f" -> Replaced with: '{text[:100]}...'")
    return made
419
+
420
def process_tables(document, flat_json):
    """Process tables to find key-value pairs and replace red values.

    Walks every table in the document. Two special table layouts are
    dispatched to dedicated handlers (vehicle registration and print
    accreditation); all remaining tables are treated as key/value rows
    where column 0 holds the key and later columns may hold red
    placeholder text to replace.

    Returns the total number of red-text replacements made across all
    tables.
    """
    replacements_made = 0

    for table_idx, table in enumerate(document.tables):
        print(f"\nπŸ” Processing table {table_idx + 1}:")

        # Check if this is the vehicle registration table.
        # Fingerprint the table by concatenating the lowercased text of the
        # first few rows, then count indicator phrases in it.
        table_text = ""
        for row in table.rows[:3]:  # Check first 3 rows
            for cell in row.cells:
                table_text += get_clean_text(cell).lower() + " "

        # Look for vehicle registration indicators (need multiple indicators to avoid false positives)
        vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
        indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
        if indicator_count >= 3:  # Require at least 3 indicators to be sure it's a vehicle table
            print(f" πŸš— Detected Vehicle Registration table")
            vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
            replacements_made += vehicle_replacements
            continue  # Skip normal processing for this table

        # Check if this is the print accreditation table
        print_accreditation_indicators = ["print name", "position title"]
        indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
        if indicator_count >= 2:  # Require at least 2 indicators to be sure it's a print accreditation table
            print(f" πŸ“‹ Detected Print Accreditation table")
            print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
            replacements_made += print_accreditation_replacements
            continue  # Skip normal processing for this table

        # Generic key/value table: column 0 is the key, later columns hold
        # the (possibly red) values.
        for row_idx, row in enumerate(table.rows):
            if len(row.cells) < 1:  # Skip empty rows
                continue

            # Get the key from the first column
            key_cell = row.cells[0]
            key_text = get_clean_text(key_cell)

            if not key_text:
                continue

            print(f" πŸ“Œ Row {row_idx + 1}: Key = '{key_text}'")

            # Check if this key exists in our JSON
            json_value = find_matching_json_value(key_text, flat_json)

            if json_value is not None:
                replacement_text = get_value_as_string(json_value, key_text)

                # Special handling for Australian Company Number
                # (its digits are spread across several cells in the row).
                if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                    cell_replacements = handle_australian_company_number(row, json_value)
                    replacements_made += cell_replacements

                # Handle section headers (like Attendance List, Nature of Business) where content is in next row
                elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
                    print(f" βœ… Section header detected, checking next row for content...")
                    next_row = table.rows[row_idx + 1]

                    # Check all cells in the next row for red text
                    for cell_idx, cell in enumerate(next_row.cells):
                        if has_red_text(cell):
                            print(f" βœ… Found red text in next row, cell {cell_idx + 1}")
                            # For list values, join with line breaks
                            if isinstance(json_value, list):
                                replacement_text = "\n".join(str(item) for item in json_value)
                            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f" -> Replaced section content with: '{replacement_text[:100]}...'")

                # Single-column section, or a multi-column row whose value
                # cells contain no red text: the red placeholder (if any)
                # lives in the key cell itself.
                elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
                    if has_red_text(key_cell):
                        cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
                        replacements_made += cell_replacements
                else:
                    # Normal case: replace red text in each value column.
                    for cell_idx in range(1, len(row.cells)):
                        value_cell = row.cells[cell_idx]
                        if has_red_text(value_cell):
                            print(f" βœ… Found red text in column {cell_idx + 1}")
                            cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
                            replacements_made += cell_replacements
            else:
                # Key didn't match; the red text itself may be the lookup key
                # (e.g. a placeholder token occupying a single-cell row).
                if len(row.cells) == 1 and has_red_text(key_cell):
                    red_text = ""
                    for paragraph in key_cell.paragraphs:
                        for run in paragraph.runs:
                            if is_red(run):
                                red_text += run.text
                    if red_text.strip():
                        section_value = find_matching_json_value(red_text.strip(), flat_json)
                        if section_value is not None:
                            section_replacement = get_value_as_string(section_value, red_text.strip())
                            cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
                            replacements_made += cell_replacements

            # Handle tables where red text appears in multiple columns (like contact info tables)
            # NOTE(review): this sweep runs for every row, including rows the
            # branches above already processed; replace_red_text_in_cell
            # presumably finds no red text left on a second pass — confirm.
            for cell_idx in range(len(row.cells)):
                cell = row.cells[cell_idx]
                if has_red_text(cell):
                    # Get the red text from this cell
                    red_text = ""
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            if is_red(run):
                                red_text += run.text

                    if red_text.strip():
                        # Try to find a direct mapping for this red text
                        section_value = find_matching_json_value(red_text.strip(), flat_json)
                        if section_value is not None:
                            section_replacement = get_value_as_string(section_value, red_text.strip())
                            cell_replacements = replace_red_text_in_cell(cell, section_replacement)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f" βœ… Replaced red text '{red_text.strip()[:30]}...' with '{section_replacement[:30]}...' in cell {cell_idx + 1}")

    return replacements_made
539
+
540
def process_paragraphs(document, flat_json):
    """Replace red-colored run text in body paragraphs with JSON values.

    For each paragraph containing red runs, the concatenated red text is
    looked up in `flat_json`; signature/date fallback keys are tried when
    no direct match exists. On a match the first red run receives the
    replacement text (recolored to black) and the remaining red runs are
    blanked so no stale red text survives.

    Returns the number of paragraphs whose red text was replaced.
    """
    replacements_made = 0
    print(f"\nπŸ” Processing paragraphs:")
    for para_idx, paragraph in enumerate(document.paragraphs):
        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
        if not red_runs:
            continue

        # FIX: dropped the unused `full_text` local the original computed.
        red_text_only = "".join(run.text for run in red_runs).strip()
        print(f" πŸ“Œ Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")

        # Try to match the red text specifically first
        json_value = find_matching_json_value(red_text_only, flat_json)

        # If no match, try some common signature/date patterns
        if json_value is None:
            upper = red_text_only.upper()
            if "AUDITOR SIGNATURE" in upper or "DATE" in upper:
                json_value = find_matching_json_value("auditor signature", flat_json)
            elif "OPERATOR SIGNATURE" in upper:
                json_value = find_matching_json_value("operator signature", flat_json)

        if json_value is not None:
            replacement_text = get_value_as_string(json_value)
            print(f" βœ… Replacing red text with: '{replacement_text}'")
            # Put the whole replacement in the first red run, recolored to
            # black, then clear the remaining red runs.
            red_runs[0].text = replacement_text
            red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
            for run in red_runs[1:]:
                run.text = ''
            replacements_made += 1
    return replacements_made
570
+
571
def main():
    """Load the JSON data, fill red placeholders in the DOCX, and save.

    Reads 'updated_word_data.json' and 'test.docx' from the working
    directory and writes the filled document to 'updated_reportv1.docx'.
    Errors are reported to stdout rather than raised (top-level boundary).
    """
    json_path = 'updated_word_data.json'
    docx_path = 'test.docx'
    output_path = 'updated_reportv1.docx'

    try:
        json_data = load_json(json_path)
        flat_json = flatten_json(json_data)

        # Show a small sample of the flattened keys for debugging.
        print("πŸ“„ Available JSON keys (sample):")
        sample = sorted(flat_json.items())[:10]
        for key, value in sample:
            print(f" - {key}: {value}")
        # FIX: the original subtracted the loop counter — which ended up equal
        # to the total key count — from the total, so it always reported
        # "... and 0 more keys". Report the real remainder instead.
        print(f" ... and {len(flat_json) - len(sample)} more keys\n")

        doc = Document(docx_path)

        table_replacements = process_tables(doc, flat_json)
        paragraph_replacements = process_paragraphs(doc, flat_json)
        total_replacements = table_replacements + paragraph_replacements

        doc.save(output_path)
        print(f"\nβœ… Document saved as: {output_path}")
        print(f"βœ… Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")

    except FileNotFoundError as e:
        print(f"❌ File not found: {e}")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()