Shami96 committed on
Commit
89ec944
·
verified ·
1 Parent(s): f486b52

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +172 -184
update_docx_with_pdf.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- update_docx_with_pdf.py
4
  """
5
 
6
  import os
@@ -10,7 +10,6 @@ import time
10
  import re
11
  from typing import Optional
12
 
13
- # Try to import OpenAI client in the style used previously
14
  try:
15
  from openai import OpenAI
16
  except Exception:
@@ -18,12 +17,11 @@ except Exception:
18
 
19
  # Config
20
  RETRIES = 3
21
- RETRY_DELAY = 1.0 # seconds between retries
22
  DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
23
  MAX_TOKENS = 4096
24
  TEMPERATURE = 0.0
25
 
26
-
27
  def read_any(path_or_file):
28
  """Read content from file path or file-like object."""
29
  if hasattr(path_or_file, "read"):
@@ -36,26 +34,22 @@ def read_any(path_or_file):
36
  with open(path_or_file, "r", encoding="utf-8") as fh:
37
  return fh.read()
38
 
39
-
40
  def find_first_balanced_json(s: str) -> Optional[str]:
41
- """
42
- Scan the input string and return the first substring that is a balanced JSON object
43
- starting with '{' and ending with the matching '}' that parses successfully.
44
- """
45
  if not s:
46
  return None
47
- # Find all possible '{' positions
48
  for m in re.finditer(r"\{", s):
49
  start = m.start()
50
  depth = 0
51
  in_str = False
52
  escape = False
 
53
  for i in range(start, len(s)):
54
  ch = s[i]
55
  if ch == '"' and not escape:
56
  in_str = not in_str
57
  if in_str:
58
- # handle escape toggling but don't treat braces inside strings
59
  if ch == "\\" and not escape:
60
  escape = True
61
  else:
@@ -71,41 +65,11 @@ def find_first_balanced_json(s: str) -> Optional[str]:
71
  json.loads(candidate)
72
  return candidate
73
  except Exception:
74
- # candidate not valid JSON (maybe trailing commas etc.) -> continue searching
75
  break
76
  return None
77
 
78
-
79
- def extract_json_substring(s: str) -> Optional[str]:
80
- """
81
- Wrapper for find_first_balanced_json kept for compatibility with existing naming.
82
- """
83
- return find_first_balanced_json(s)
84
-
85
-
86
- def try_parse_json_str(s: str):
87
- """Attempt to parse JSON string, raising the same exceptions as json.loads."""
88
- return json.loads(s)
89
-
90
-
91
- def safe_write(path: str, data):
92
- with open(path, "w", encoding="utf-8") as f:
93
- json.dump(data, f, indent=2, ensure_ascii=False)
94
-
95
-
96
- def save_raw(path: str, text: str):
97
- try:
98
- with open(path, "w", encoding="utf-8") as f:
99
- f.write(text)
100
- except Exception:
101
- # best-effort; don't crash
102
- pass
103
-
104
-
105
  def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
106
- """
107
- Call the model and return raw text content. Support variations in SDK response shape.
108
- """
109
  resp = client.chat.completions.create(
110
  model=model_name,
111
  messages=[
@@ -116,32 +80,83 @@ def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: s
116
  temperature=TEMPERATURE,
117
  )
118
 
119
- # Try to extract raw text in a few shapes
120
- raw_text = ""
121
  try:
122
- # New-style: resp.choices[0].message.content
123
  raw_text = resp.choices[0].message.content
124
  except Exception:
125
  try:
126
- # Older shape: resp.choices[0].text
127
  raw_text = resp.choices[0].text
128
  except Exception:
129
  raw_text = str(resp)
 
130
  if isinstance(raw_text, bytes):
131
  raw_text = raw_text.decode("utf-8", errors="replace")
132
  return (raw_text or "").strip()
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
136
- # --- load inputs ---
137
  word_json_text = read_any(word_json_file)
138
  pdf_txt = read_any(pdf_txt_file)
139
 
140
  try:
141
  word_json = json.loads(word_json_text)
142
- except Exception:
143
- # If the input word_json isn't valid JSON, abort early but write original to output
144
- print("⚠️ Input word_json is not valid JSON. Writing raw input to output and exiting.")
145
  if hasattr(output_file, "write"):
146
  output_file.write(word_json_text)
147
  output_file.flush()
@@ -150,194 +165,168 @@ def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
150
  f.write(word_json_text)
151
  return
152
 
153
- # --- build base prompts ---
154
- system_msg = (
155
- "You are a strict JSON extraction assistant. Only output valid JSON with no surrounding text, "
156
- "no markdown, no explanation. The JSON must be parseable by json.loads()."
157
- )
158
-
159
- user_prompt_template = (
160
- "Here is a JSON template that must be updated (DO NOT change structure or keys):\n\n"
161
- "{word_json}\n\n"
162
- "Here is the extracted text from a PDF (use this to fill/update fields):\n\n"
163
- "{pdf_text}\n\n"
164
- "Instructions:\n"
165
- "- ONLY update fields that already exist in the JSON template using evidence from the PDF text.\n"
166
- "- DO NOT add new top-level keys or alter the structure.\n"
167
- "- If you cannot find a value for an existing field, leave it unchanged.\n"
168
- "- OUTPUT EXACTLY one JSON object and NOTHING else.\n"
169
- )
170
-
171
- user_prompt = user_prompt_template.format(
172
- word_json=json.dumps(word_json, ensure_ascii=False),
173
- pdf_text=(pdf_txt or "")[:120000], # cap size to avoid truncation/hitting token limits
174
- )
175
-
176
  api_key = os.environ.get("OPENAI_API_KEY")
177
  if not api_key:
178
- print("⚠️ OPENAI_API_KEY not found in environment variables! Writing original JSON to output and exiting.")
179
  if hasattr(output_file, "write"):
180
  json.dump(word_json, output_file, indent=2, ensure_ascii=False)
181
  output_file.flush()
182
  else:
183
- safe_write(output_file, word_json)
 
184
  return
185
 
186
  if OpenAI is None:
187
- print("⚠️ OpenAI SDK not available (could not import OpenAI). Writing original JSON to output and exiting.")
188
  if hasattr(output_file, "write"):
189
  json.dump(word_json, output_file, indent=2, ensure_ascii=False)
190
  output_file.flush()
191
  else:
192
- safe_write(output_file, word_json)
 
193
  return
194
 
195
- # Create client (constructor signature can be adapted if your OpenAI wrapper differs)
196
- client = OpenAI(api_key=api_key)
 
 
 
197
 
 
198
  model_name = DEFAULT_MODEL
199
 
200
- raw_outputs = []
201
- parsed = None
202
-
203
- # Try multiple attempts (progressive instructions)
204
  for attempt in range(1, RETRIES + 1):
205
- variant_user_prompt = user_prompt
206
- # On later attempts, append stricter instruction
 
 
207
  if attempt == 2:
208
- variant_user_prompt += "\nIMPORTANT: Return ONLY valid JSON. If you cannot find new values, keep existing template values."
209
  elif attempt >= 3:
210
- variant_user_prompt += "\nLAST ATTEMPT: Output exactly one JSON object and nothing else. If uncertain, keep fields unchanged."
211
 
212
- print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
213
  try:
214
- raw_text = call_model_and_get_raw(client, model_name, system_msg, variant_user_prompt)
215
- raw_outputs.append(raw_text)
216
- # Save raw model output for diagnostics
217
  out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
218
  raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
219
- save_raw(raw_save_path, raw_text)
 
 
 
 
220
 
221
- # Try parse as JSON directly
222
  try:
223
- parsed = try_parse_json_str(raw_text)
224
- print("✅ Model returned valid JSON.")
225
- # write and return
 
 
 
 
 
 
 
226
  if hasattr(output_file, "write"):
227
  json.dump(parsed, output_file, indent=2, ensure_ascii=False)
228
  output_file.flush()
229
  else:
230
- safe_write(output_file, parsed)
 
231
  return parsed
232
- except Exception:
233
- # try extracting a balanced JSON substring
234
- candidate = extract_json_substring(raw_text)
 
 
 
235
  if candidate:
236
  try:
237
- parsed = try_parse_json_str(candidate)
238
- print("✅ Successfully extracted and parsed JSON substring from model output.")
 
 
239
  if hasattr(output_file, "write"):
240
  json.dump(parsed, output_file, indent=2, ensure_ascii=False)
241
  output_file.flush()
242
  else:
243
- safe_write(output_file, parsed)
 
244
  return parsed
245
- except Exception:
246
- print("⚠️ Extracted substring was not valid JSON after parsing attempt.")
247
- else:
248
- print("⚠️ Could not find a balanced JSON substring in the model output.")
249
-
250
- # If we get here, the model output is not parseable - attempt a repair pass once per attempt
251
- print("🔧 Attempting repair pass: sending model its raw output and asking for VALID JSON only...")
252
- repair_system = "You are a JSON repair assistant. The previous model output (possibly with commentary) is provided. Extract and return a single VALID JSON object and NOTHING else. If you cannot produce valid JSON, return {}."
253
- # Provide the model its own raw output for repair
254
- repair_user = f"Raw model output:\n\n{raw_text}\n\nReturn only valid JSON object."
255
- repair_raw = ""
256
- try:
257
- repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
258
- # Save repair output
259
- repair_save_path = f"{out_base}.model_raw_attempt{attempt}_repair.txt"
260
- save_raw(repair_save_path, repair_raw)
261
-
262
- # Try parse repair output
263
  try:
264
- parsed = try_parse_json_str(repair_raw)
265
- print("✅ Repair pass succeeded with valid JSON.")
 
 
 
266
  if hasattr(output_file, "write"):
267
- json.dump(parsed, output_file, indent=2, ensure_ascii=False)
268
  output_file.flush()
269
  else:
270
- safe_write(output_file, parsed)
271
- return parsed
272
- except Exception:
273
- candidate = extract_json_substring(repair_raw)
274
- if candidate:
275
- try:
276
- parsed = try_parse_json_str(candidate)
277
- print("✅ Successfully extracted JSON substring from repair output.")
278
- if hasattr(output_file, "write"):
279
- json.dump(parsed, output_file, indent=2, ensure_ascii=False)
280
- output_file.flush()
281
- else:
282
- safe_write(output_file, parsed)
283
- return parsed
284
- except Exception:
285
- print("⚠️ Repair output contained JSON-like substring but could not be parsed.")
286
- else:
287
- print("⚠️ Repair pass did not produce a parseable JSON substring.")
288
- except Exception as rep_err:
289
- print(f"⚠️ Exception during repair pass: {rep_err}")
290
 
291
- except Exception as call_err:
292
- print(f"⚠️ Exception while calling model: {call_err}")
293
 
294
  # Wait before next attempt
295
- time.sleep(RETRY_DELAY)
296
-
297
- # If we've reached here, all attempts failed
298
- print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
299
- try:
300
- out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
301
- raw_path = f"{out_base}.model_raw.txt"
302
- with open(raw_path, "w", encoding="utf-8") as rf:
303
- rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
304
- for i, out in enumerate(raw_outputs, start=1):
305
- rf.write(f"--- ATTEMPT {i} ---\n")
306
- rf.write((out or "") + "\n\n")
307
- rf.write("\n=== END ===\n\n")
308
- rf.write("\n\n=== PDF TEXT USED (truncated) ===\n\n")
309
- rf.write((pdf_txt or "")[:20000])
310
- print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
311
- except Exception as e:
312
- print(f"⚠️ Failed to save raw model outputs: {e}")
313
 
314
- # Also create a salvage bundle for manual inspection
315
- try:
316
- salvage_path = f"{out_base}.salvage.json"
317
- salvage_bundle = {
318
- "original_word_json": word_json,
319
- "pdf_text_sample": (pdf_txt or "")[:2000],
320
- "raw_outputs_path": raw_path,
321
- }
322
- with open(salvage_path, "w", encoding="utf-8") as sf:
323
- json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
324
- print(f"ℹ️ Salvage bundle saved to: {salvage_path}")
325
- except Exception as e:
326
- print(f"⚠️ Failed to save salvage bundle: {e}")
327
-
328
- # Write original JSON to output to avoid failing the calling process
329
  try:
330
  if hasattr(output_file, "write"):
331
  json.dump(word_json, output_file, indent=2, ensure_ascii=False)
332
  output_file.flush()
333
  else:
334
- safe_write(output_file, word_json)
335
- print("✅ Original JSON template written to output (no updates applied).")
 
336
  except Exception as e:
337
- print(f"⚠️ Failed to write original JSON to output: {e}")
338
 
339
  return None
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
  if __name__ == "__main__":
343
  if len(sys.argv) != 4:
@@ -347,8 +336,7 @@ if __name__ == "__main__":
347
  try:
348
  update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
349
  except Exception as e:
350
- # Top-level catch to avoid crashing the pipeline; write original input as fallback.
351
- print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
352
  try:
353
  with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
354
  outf.write(inf.read())
 
1
  #!/usr/bin/env python3
2
  """
3
+ Enhanced update_docx_with_pdf.py with better JSON structure handling
4
  """
5
 
6
  import os
 
10
  import re
11
  from typing import Optional
12
 
 
13
  try:
14
  from openai import OpenAI
15
  except Exception:
 
17
 
18
  # Config
19
  RETRIES = 3
20
+ RETRY_DELAY = 1.0
21
  DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
22
  MAX_TOKENS = 4096
23
  TEMPERATURE = 0.0
24
 
 
25
  def read_any(path_or_file):
26
  """Read content from file path or file-like object."""
27
  if hasattr(path_or_file, "read"):
 
34
  with open(path_or_file, "r", encoding="utf-8") as fh:
35
  return fh.read()
36
 
 
37
  def find_first_balanced_json(s: str) -> Optional[str]:
38
+ """Find the first valid JSON object in the string"""
 
 
 
39
  if not s:
40
  return None
41
+
42
  for m in re.finditer(r"\{", s):
43
  start = m.start()
44
  depth = 0
45
  in_str = False
46
  escape = False
47
+
48
  for i in range(start, len(s)):
49
  ch = s[i]
50
  if ch == '"' and not escape:
51
  in_str = not in_str
52
  if in_str:
 
53
  if ch == "\\" and not escape:
54
  escape = True
55
  else:
 
65
  json.loads(candidate)
66
  return candidate
67
  except Exception:
 
68
  break
69
  return None
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
72
+ """Call the model and return raw text content"""
 
 
73
  resp = client.chat.completions.create(
74
  model=model_name,
75
  messages=[
 
80
  temperature=TEMPERATURE,
81
  )
82
 
 
 
83
  try:
 
84
  raw_text = resp.choices[0].message.content
85
  except Exception:
86
  try:
 
87
  raw_text = resp.choices[0].text
88
  except Exception:
89
  raw_text = str(resp)
90
+
91
  if isinstance(raw_text, bytes):
92
  raw_text = raw_text.decode("utf-8", errors="replace")
93
  return (raw_text or "").strip()
94
 
95
def create_enhanced_prompt(word_json, pdf_text, max_structure_lines=50, max_pdf_chars=100000):
    """Build the (system message, user prompt) pair for the extraction call.

    Args:
        word_json: Parsed JSON template whose fields should be filled.
        pdf_text: Text extracted from the PDF. ``None`` is treated as an
            empty string.
        max_structure_lines: Maximum number of structure-analysis lines
            included in the prompt.
        max_pdf_chars: Maximum number of PDF characters included in the
            prompt.

    Returns:
        A ``(system_msg, user_prompt)`` tuple of strings.
    """
    structure_analysis = []

    def analyze_structure(obj, path=""):
        # Only dict nodes are walked; lists are summarised by their length.
        if not isinstance(obj, dict):
            return
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key
            if isinstance(value, dict):
                structure_analysis.append(f" {current_path} (nested object)")
                analyze_structure(value, current_path)
            elif isinstance(value, list):
                structure_analysis.append(f" {current_path} (list with {len(value)} items)")
            elif value is None or str(value).strip() == "":
                structure_analysis.append(f" {current_path} (EMPTY - needs data)")
            else:
                text = str(value)
                # Mark the preview as shortened only when it really was cut.
                preview = text[:50] + ("..." if len(text) > 50 else "")
                structure_analysis.append(f" {current_path} (has data: {preview})")

    analyze_structure(word_json)

    system_msg = "\n".join([
        "You are a precise JSON data extraction assistant.",
        "",
        "CRITICAL RULES:",
        "1. Output ONLY valid JSON - no markdown, no explanations, no extra text",
        "2. Maintain the EXACT structure provided in the template",
        "3. Only UPDATE fields that are empty or null - do not change existing data",
        "4. Extract data accurately from the PDF text provided",
        "5. If you cannot find data for a field, leave it as null or empty string",
        "6. Ensure all nested objects and arrays maintain their structure",
    ])

    template_json = json.dumps(word_json, indent=2, ensure_ascii=False)
    # The limits are applied here, outside the prompt text, so that no
    # code-comment residue is sent to the model.
    analysis_text = "\n".join(structure_analysis[:max_structure_lines])
    pdf_excerpt = (pdf_text or "")[:max_pdf_chars]

    guidelines = "\n".join([
        '- For "Operator name (Legal entity)" or similar: Extract the company name',
        '- For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)',
        '- For "Auditor name": Extract auditor\'s name',
        '- For "Attendance List": Extract names and positions, format as list',
        "- For vehicle data: Extract registration numbers, maintenance info, etc.",
        "- For management summaries: Extract compliance details and findings",
    ])

    user_prompt = "\n\n".join([
        "TASK: Update this JSON template with data from the PDF text.",
        "JSON TEMPLATE TO UPDATE:\n" + template_json,
        "STRUCTURE ANALYSIS:\n" + analysis_text,
        "PDF SOURCE TEXT:\n" + pdf_excerpt,
        "EXTRACTION GUIDELINES:\n" + guidelines,
        "CRITICAL: Return ONLY the updated JSON object. No other text whatsoever.",
    ])

    return system_msg, user_prompt
149
 
150
  def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
151
+ # Load inputs
152
  word_json_text = read_any(word_json_file)
153
  pdf_txt = read_any(pdf_txt_file)
154
 
155
  try:
156
  word_json = json.loads(word_json_text)
157
+ except Exception as e:
158
+ print(f"⚠️ Input word_json is not valid JSON: {e}")
159
+ print("Writing original to output and exiting.")
160
  if hasattr(output_file, "write"):
161
  output_file.write(word_json_text)
162
  output_file.flush()
 
165
  f.write(word_json_text)
166
  return
167
 
168
+ # Check API key
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  api_key = os.environ.get("OPENAI_API_KEY")
170
  if not api_key:
171
+ print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
172
  if hasattr(output_file, "write"):
173
  json.dump(word_json, output_file, indent=2, ensure_ascii=False)
174
  output_file.flush()
175
  else:
176
+ with open(output_file, "w", encoding="utf-8") as f:
177
+ json.dump(word_json, f, indent=2, ensure_ascii=False)
178
  return
179
 
180
  if OpenAI is None:
181
+ print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
182
  if hasattr(output_file, "write"):
183
  json.dump(word_json, output_file, indent=2, ensure_ascii=False)
184
  output_file.flush()
185
  else:
186
+ with open(output_file, "w", encoding="utf-8") as f:
187
+ json.dump(word_json, f, indent=2, ensure_ascii=False)
188
  return
189
 
190
+ # Create enhanced prompts
191
+ system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)
192
+
193
+ print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
194
+ print(f"📊 PDF text has {len(pdf_txt)} characters")
195
 
196
+ client = OpenAI(api_key=api_key)
197
  model_name = DEFAULT_MODEL
198
 
199
+ # Try multiple attempts with different strategies
 
 
 
200
  for attempt in range(1, RETRIES + 1):
201
+ print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
202
+
203
+ # Modify prompt for different attempts
204
+ current_user_prompt = user_prompt
205
  if attempt == 2:
206
+ current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
207
  elif attempt >= 3:
208
+ current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."
209
 
 
210
  try:
211
+ raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)
212
+
213
+ # Save raw output for debugging
214
  out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
215
  raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
216
+ try:
217
+ with open(raw_save_path, "w", encoding="utf-8") as f:
218
+ f.write(raw_text)
219
+ except:
220
+ pass
221
 
222
+ # Try to parse directly
223
  try:
224
+ parsed = json.loads(raw_text)
225
+ print("✅ Model returned valid JSON directly.")
226
+
227
+ # Validate structure matches original
228
+ if validate_json_structure(parsed, word_json):
229
+ print("✅ JSON structure validation passed.")
230
+ else:
231
+ print("⚠️ JSON structure differs from template, but proceeding...")
232
+
233
+ # Write output
234
  if hasattr(output_file, "write"):
235
  json.dump(parsed, output_file, indent=2, ensure_ascii=False)
236
  output_file.flush()
237
  else:
238
+ with open(output_file, "w", encoding="utf-8") as f:
239
+ json.dump(parsed, f, indent=2, ensure_ascii=False)
240
  return parsed
241
+
242
+ except Exception as parse_error:
243
+ print(f"⚠️ Direct parsing failed: {parse_error}")
244
+
245
+ # Try to extract JSON substring
246
+ candidate = find_first_balanced_json(raw_text)
247
  if candidate:
248
  try:
249
+ parsed = json.loads(candidate)
250
+ print("✅ Successfully extracted JSON substring from model output.")
251
+
252
+ # Write output
253
  if hasattr(output_file, "write"):
254
  json.dump(parsed, output_file, indent=2, ensure_ascii=False)
255
  output_file.flush()
256
  else:
257
+ with open(output_file, "w", encoding="utf-8") as f:
258
+ json.dump(parsed, f, indent=2, ensure_ascii=False)
259
  return parsed
260
+
261
+ except Exception as sub_parse_error:
262
+ print(f"⚠️ Substring parsing also failed: {sub_parse_error}")
263
+
264
+ # Try repair pass
265
+ print("🔧 Attempting JSON repair...")
266
+ repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."
267
+ repair_user = f"Fix this JSON:\n\n{raw_text}"
268
+
 
 
 
 
 
 
 
 
 
269
  try:
270
+ repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
271
+ repair_parsed = json.loads(repair_raw)
272
+ print("✅ Repair pass succeeded.")
273
+
274
+ # Write output
275
  if hasattr(output_file, "write"):
276
+ json.dump(repair_parsed, output_file, indent=2, ensure_ascii=False)
277
  output_file.flush()
278
  else:
279
+ with open(output_file, "w", encoding="utf-8") as f:
280
+ json.dump(repair_parsed, f, indent=2, ensure_ascii=False)
281
+ return repair_parsed
282
+
283
+ except Exception as repair_error:
284
+ print(f"⚠️ Repair pass failed: {repair_error}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
+ except Exception as call_error:
287
+ print(f"⚠️ Exception while calling model: {call_error}")
288
 
289
  # Wait before next attempt
290
+ if attempt < RETRIES:
291
+ time.sleep(RETRY_DELAY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
+ # All attempts failed
294
+ print("❗ All LLM attempts failed. Writing original JSON to output.")
295
+
 
 
 
 
 
 
 
 
 
 
 
 
296
  try:
297
  if hasattr(output_file, "write"):
298
  json.dump(word_json, output_file, indent=2, ensure_ascii=False)
299
  output_file.flush()
300
  else:
301
+ with open(output_file, "w", encoding="utf-8") as f:
302
+ json.dump(word_json, f, indent=2, ensure_ascii=False)
303
+ print("✅ Original JSON template written to output.")
304
  except Exception as e:
305
+ print(f"⚠️ Failed to write original JSON: {e}")
306
 
307
  return None
308
 
309
def validate_json_structure(parsed_json, original_json):
    """Check that *parsed_json* keeps the container structure of the template.

    Rules applied at every node of *original_json*:

    - A ``None`` template value is a placeholder and accepts any value.
    - A dict must stay a dict and keep every template key. Extra keys in
      the parsed data are tolerated.
    - A list must stay a list. Elements are compared pairwise over the
      common prefix, so a list may grow or shrink.
    - Any other scalar may change value or scalar type (e.g. ``""`` -> ``12``
      or ``"x"`` -> ``None``) but must not turn into a dict or list.

    Args:
        parsed_json: Data decoded from the model output.
        original_json: The template the model was asked to fill.

    Returns:
        ``True`` when the structure is preserved, ``False`` otherwise. The
        first mismatch found is printed.
    """

    def compare_structure(parsed, original, path):
        label = path or "<root>"

        if original is None:
            return True

        if isinstance(original, dict):
            if not isinstance(parsed, dict):
                print(f"⚠️ Type mismatch at {label}: {type(parsed)} vs {type(original)}")
                return False
            for key, template_value in original.items():
                child_path = f"{path}.{key}" if path else str(key)
                if key not in parsed:
                    print(f"⚠️ Missing key at {child_path}")
                    return False
                if not compare_structure(parsed[key], template_value, child_path):
                    return False
            return True

        if isinstance(original, list):
            if not isinstance(parsed, list):
                print(f"⚠️ Type mismatch at {label}: {type(parsed)} vs {type(original)}")
                return False
            for index, (parsed_item, template_item) in enumerate(zip(parsed, original)):
                if not compare_structure(parsed_item, template_item, f"{path}[{index}]"):
                    return False
            return True

        # Scalar template leaf: filling or clearing it is fine, replacing it
        # with a container is a structural change.
        if isinstance(parsed, (dict, list)):
            print(f"⚠️ Type mismatch at {label}: {type(parsed)} vs {type(original)}")
            return False
        return True

    try:
        return compare_structure(parsed_json, original_json, "")
    except RecursionError:
        # Pathologically deep input: report as invalid rather than crash.
        return False
330
 
331
  if __name__ == "__main__":
332
  if len(sys.argv) != 4:
 
336
  try:
337
  update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
338
  except Exception as e:
339
+ print(f"Unexpected exception: {e}")
 
340
  try:
341
  with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
342
  outf.write(inf.read())