Shami96 committed on
Commit
5244c54
·
verified ·
1 Parent(s): 0322667

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +58 -336
update_docx_with_pdf.py CHANGED
@@ -1,359 +1,81 @@
1
- #!/usr/bin/env python3
2
- """
3
- Enhanced update_docx_with_pdf.py with better JSON structure handling
4
- """
5
-
6
- import os
7
- import sys
8
  import json
9
- import time
10
- import re
11
- from typing import Optional
12
-
13
- try:
14
- from openai import OpenAI
15
- except Exception:
16
- OpenAI = None
17
-
18
- # Config
19
- RETRIES = 3
20
- RETRY_DELAY = 1.0
21
- DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
22
- MAX_TOKENS = 4096
23
- TEMPERATURE = 0.0
24
-
25
- def read_any(path_or_file):
26
- """Read content from file path or file-like object."""
27
- if hasattr(path_or_file, "read"):
28
- path_or_file.seek(0)
29
- content = path_or_file.read()
30
- if isinstance(content, bytes):
31
- content = content.decode("utf-8")
32
- return content
33
- else:
34
- with open(path_or_file, "r", encoding="utf-8") as fh:
35
- return fh.read()
36
-
37
- def find_first_balanced_json(s: str) -> Optional[str]:
38
- """Find the first valid JSON object in the string"""
39
- if not s:
40
- return None
41
-
42
- for m in re.finditer(r"\{", s):
43
- start = m.start()
44
- depth = 0
45
- in_str = False
46
- escape = False
47
-
48
- for i in range(start, len(s)):
49
- ch = s[i]
50
- if ch == '"' and not escape:
51
- in_str = not in_str
52
- if in_str:
53
- if ch == "\\" and not escape:
54
- escape = True
55
- else:
56
- escape = False
57
- continue
58
- if ch == "{":
59
- depth += 1
60
- elif ch == "}":
61
- depth -= 1
62
- if depth == 0:
63
- candidate = s[start : i + 1]
64
- try:
65
- json.loads(candidate)
66
- return candidate
67
- except Exception:
68
- break
69
- return None
70
-
71
- def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
72
- """Call the model and return raw text content"""
73
- resp = client.chat.completions.create(
74
- model=model_name,
75
- messages=[
76
- {"role": "system", "content": system_msg},
77
- {"role": "user", "content": user_msg},
78
- ],
79
- max_tokens=MAX_TOKENS,
80
- temperature=TEMPERATURE,
81
- )
82
-
83
- try:
84
- raw_text = resp.choices[0].message.content
85
- except Exception:
86
- try:
87
- raw_text = resp.choices[0].text
88
- except Exception:
89
- raw_text = str(resp)
90
-
91
- if isinstance(raw_text, bytes):
92
- raw_text = raw_text.decode("utf-8", errors="replace")
93
- return (raw_text or "").strip()
94
-
95
- def create_enhanced_prompt(word_json, pdf_text):
96
- """Create an enhanced prompt that ensures proper JSON structure"""
97
-
98
- # Analyze the word_json structure to understand what needs to be filled
99
- structure_analysis = []
100
-
101
- def analyze_structure(obj, path=""):
102
- if isinstance(obj, dict):
103
- for key, value in obj.items():
104
- current_path = f"{path}.{key}" if path else key
105
- if isinstance(value, dict):
106
- structure_analysis.append(f" {current_path} (nested object)")
107
- analyze_structure(value, current_path)
108
- elif isinstance(value, list):
109
- structure_analysis.append(f" {current_path} (list with {len(value)} items)")
110
- elif value is None or str(value).strip() == "":
111
- structure_analysis.append(f" {current_path} (EMPTY - needs data)")
112
- else:
113
- structure_analysis.append(f" {current_path} (has data: {str(value)[:50]}...)")
114
-
115
- analyze_structure(word_json)
116
-
117
- system_msg = """You are a precise JSON data extraction assistant.
118
-
119
- CRITICAL RULES:
120
- 1. Output ONLY valid JSON - no markdown, no explanations, no extra text
121
- 2. Maintain the EXACT structure provided in the template
122
- 3. Only UPDATE fields that are empty or null - do not change existing data
123
- 4. Extract data accurately from the PDF text provided
124
- 5. If you cannot find data for a field, leave it as null or empty string
125
- 6. Ensure all nested objects and arrays maintain their structure"""
126
-
127
- user_prompt = f"""TASK: Update this JSON template with data from the PDF text.
128
-
129
- JSON TEMPLATE TO UPDATE:
130
- {json.dumps(word_json, indent=2, ensure_ascii=False)}
131
-
132
- STRUCTURE ANALYSIS:
133
- {chr(10).join(structure_analysis[:50])} # Limit to first 50 for brevity
134
-
135
- PDF SOURCE TEXT:
136
- {pdf_text[:100000]} # Truncate very long text
137
 
138
- EXTRACTION GUIDELINES:
139
- - For "Operator name (Legal entity)" or similar: Extract the company name
140
- - For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)
141
- - For "Auditor name": Extract auditor's name
142
- - For "Attendance List": Extract names and positions, format as list
143
- - For vehicle data: Extract registration numbers, maintenance info, etc.
144
- - For management summaries: Extract compliance details and findings
 
 
 
 
 
 
 
 
 
 
145
 
146
- CRITICAL: Return ONLY the updated JSON object. No other text whatsoever."""
 
147
 
148
- return system_msg, user_prompt
 
 
149
 
150
- def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
151
- # Load inputs
152
- word_json_text = read_any(word_json_file)
153
- pdf_txt = read_any(pdf_txt_file)
154
 
155
- try:
156
- word_json = json.loads(word_json_text)
157
- except Exception as e:
158
- print(f"⚠️ Input word_json is not valid JSON: {e}")
159
- print("Writing original to output and exiting.")
160
- if hasattr(output_file, "write"):
161
- output_file.write(word_json_text)
162
- output_file.flush()
163
- else:
164
- with open(output_file, "w", encoding="utf-8") as f:
165
- f.write(word_json_text)
166
- return
167
 
168
- # Check API key
169
  api_key = os.environ.get("OPENAI_API_KEY")
170
  if not api_key:
171
- print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
172
- if hasattr(output_file, "write"):
173
- json.dump(word_json, output_file, indent=2, ensure_ascii=False)
174
- output_file.flush()
175
- else:
176
- with open(output_file, "w", encoding="utf-8") as f:
177
- json.dump(word_json, f, indent=2, ensure_ascii=False)
178
- return
179
-
180
- if OpenAI is None:
181
- print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
182
- if hasattr(output_file, "write"):
183
- json.dump(word_json, output_file, indent=2, ensure_ascii=False)
184
- output_file.flush()
185
- else:
186
- with open(output_file, "w", encoding="utf-8") as f:
187
- json.dump(word_json, f, indent=2, ensure_ascii=False)
188
- return
189
-
190
- # Create enhanced prompts
191
- system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)
192
-
193
- print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
194
- print(f"📊 PDF text has {len(pdf_txt)} characters")
195
 
196
  client = OpenAI(api_key=api_key)
197
- model_name = DEFAULT_MODEL
198
-
199
- # Try multiple attempts with different strategies
200
- for attempt in range(1, RETRIES + 1):
201
- print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
202
-
203
- # Modify prompt for different attempts
204
- current_user_prompt = user_prompt
205
- if attempt == 2:
206
- current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
207
- elif attempt >= 3:
208
- current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."
209
-
210
- try:
211
- raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)
212
-
213
- # Save raw output for debugging
214
- out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
215
- raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
216
- try:
217
- with open(raw_save_path, "w", encoding="utf-8") as f:
218
- f.write(raw_text)
219
- except:
220
- pass
221
-
222
- # Try to parse directly
223
- try:
224
- parsed = json.loads(raw_text)
225
- print("✅ Model returned valid JSON directly.")
226
-
227
- # Validate structure matches original
228
- if validate_json_structure(parsed, word_json):
229
- print("✅ JSON structure validation passed.")
230
- else:
231
- print("⚠️ JSON structure differs from template, but proceeding...")
232
-
233
- # Write output
234
- if hasattr(output_file, "write"):
235
- json.dump(parsed, output_file, indent=2, ensure_ascii=False)
236
- output_file.flush()
237
- else:
238
- with open(output_file, "w", encoding="utf-8") as f:
239
- json.dump(parsed, f, indent=2, ensure_ascii=False)
240
- return parsed
241
-
242
- except Exception as parse_error:
243
- print(f"⚠️ Direct parsing failed: {parse_error}")
244
-
245
- # Try to extract JSON substring
246
- candidate = find_first_balanced_json(raw_text)
247
- if candidate:
248
- try:
249
- parsed = json.loads(candidate)
250
- print("✅ Successfully extracted JSON substring from model output.")
251
-
252
- # Write output
253
- if hasattr(output_file, "write"):
254
- json.dump(parsed, output_file, indent=2, ensure_ascii=False)
255
- output_file.flush()
256
- else:
257
- with open(output_file, "w", encoding="utf-8") as f:
258
- json.dump(parsed, f, indent=2, ensure_ascii=False)
259
- return parsed
260
-
261
- except Exception as sub_parse_error:
262
- print(f"⚠️ Substring parsing also failed: {sub_parse_error}")
263
-
264
- # Try repair pass
265
- print("🔧 Attempting JSON repair...")
266
- repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."
267
- repair_user = f"Fix this JSON:\n\n{raw_text}"
268
-
269
- try:
270
- repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
271
- repair_parsed = json.loads(repair_raw)
272
- print("✅ Repair pass succeeded.")
273
-
274
- # Write output
275
- if hasattr(output_file, "write"):
276
- json.dump(repair_parsed, output_file, indent=2, ensure_ascii=False)
277
- output_file.flush()
278
- else:
279
- with open(output_file, "w", encoding="utf-8") as f:
280
- json.dump(repair_parsed, f, indent=2, ensure_ascii=False)
281
- return repair_parsed
282
-
283
- except Exception as repair_error:
284
- print(f"⚠️ Repair pass failed: {repair_error}")
285
-
286
- except Exception as call_error:
287
- print(f"⚠️ Exception while calling model: {call_error}")
288
 
289
- # Wait before next attempt
290
- if attempt < RETRIES:
291
- time.sleep(RETRY_DELAY)
292
 
293
- # All attempts failed
294
- print("❗ All LLM attempts failed. Writing original JSON to output.")
295
-
296
  try:
 
297
  if hasattr(output_file, "write"):
298
- json.dump(word_json, output_file, indent=2, ensure_ascii=False)
299
  output_file.flush()
300
  else:
301
  with open(output_file, "w", encoding="utf-8") as f:
302
- json.dump(word_json, f, indent=2, ensure_ascii=False)
303
- print("✅ Original JSON template written to output.")
304
  except Exception as e:
305
- print(f"⚠️ Failed to write original JSON: {e}")
306
-
307
- return None
308
-
309
- def validate_json_structure(parsed_json, original_json):
310
- """Validate that the parsed JSON maintains the original structure"""
311
- try:
312
- def compare_structure(parsed, original, path=""):
313
- if type(parsed) != type(original):
314
- print(f"⚠️ Type mismatch at {path}: {type(parsed)} vs {type(original)}")
315
- return False
316
-
317
- if isinstance(original, dict):
318
- for key in original.keys():
319
- if key not in parsed:
320
- print(f"⚠️ Missing key at {path}.{key}")
321
- return False
322
- if not compare_structure(parsed[key], original[key], f"{path}.{key}"):
323
- return False
324
-
325
- return True
326
-
327
- return compare_structure(parsed_json, original_json)
328
- except Exception:
329
- return False
330
 
331
  if __name__ == "__main__":
 
332
  if len(sys.argv) != 4:
333
  print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
334
- sys.exit(0)
335
-
336
- try:
337
- update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
338
- except Exception as e:
339
- print(f"Unexpected exception: {e}")
340
- try:
341
- with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
342
- outf.write(inf.read())
343
- print("Wrote original input JSON to output due to exception.")
344
- except Exception:
345
- pass
346
- sys.exit(0)
347
-
348
- # ADD THIS LINE:
349
- with open(sys.argv[3], 'r') as f: print(f"\n📄 LLM_UPDATE OUTPUT:\n{f.read()}")
350
-
351
- except Exception as e:
352
- print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
353
- try:
354
- with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
355
- outf.write(inf.read())
356
- print("Wrote original input JSON to output due to exception.")
357
- except Exception:
358
- pass
359
- sys.exit(0)
 
1
+ from openai import OpenAI
 
 
 
 
 
 
2
  import json
3
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
def _read_text(source):
    """Return the full text of *source* (a path or a readable file-like object)."""
    if hasattr(source, "read"):
        # Rewind so an already-consumed stream is read from the beginning.
        source.seek(0)
        content = source.read()
        if isinstance(content, bytes):
            content = content.decode("utf-8")
        return content
    with open(source, "r", encoding="utf-8") as fh:
        return fh.read()


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    first_newline = stripped.find("\n")
    if first_newline == -1:
        # A lone fence line with no body: nothing sensible to unwrap.
        return stripped
    # Drop the opening fence line (which may carry a language tag like "json").
    body = stripped[first_newline + 1:].rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file, model="gpt-4o"):
    """Ask an OpenAI chat model to fill a JSON template from PDF text and save it.

    Args:
        word_json_file: file-like object or file path (docx extraction JSON).
        pdf_txt_file: file-like object or file path (PDF plain text).
        output_file: file-like object (opened for text writing) or file path.
        model: name of the chat model to call; defaults to "gpt-4o".

    Returns:
        The parsed JSON object that was written to ``output_file``, or ``None``
        when the model reply could not be parsed or the output could not be
        written (a diagnostic is printed in both cases).

    Raises:
        RuntimeError: if the OPENAI_API_KEY environment variable is not set.
    """
    # --- Load files ---
    word_json = _read_text(word_json_file)
    pdf_txt = _read_text(pdf_txt_file)

    # --- Build prompt ---
    user_prompt = f"""Here is a JSON template. It contains only the fields that need updating:
{word_json}

Here is the extracted text from a PDF:
{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Update ALL nested sections properly (like "Operator Declaration" with its "Print Name" and "Position Title")
- Make sure to update both the main sections AND the flattened keys (like "Operator Declaration.Print Name")
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
- Pay special attention to updating operator names, auditor names, and all personal details consistently throughout all sections."""

    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")

    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON. Update ALL sections consistently with the same data."},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=4096,
        temperature=0,
    )

    # The API may return no content (None), so guard before any string handling.
    raw_reply = response.choices[0].message.content or ""
    # Models sometimes wrap the JSON in a code fence despite the instructions.
    updated_json_str = _strip_code_fence(raw_reply)

    # --- Try to parse as JSON ---
    try:
        parsed = json.loads(updated_json_str)
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(raw_reply.strip())
        print("\n❌ Failed to parse updated JSON:", e)
        return None

    # --- Write output ---
    # Kept separate from parsing so an I/O problem is not misreported as a
    # model/JSON problem.
    try:
        if hasattr(output_file, "write"):
            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)
    except (OSError, TypeError, ValueError) as e:
        print("\n❌ Failed to write updated JSON:", e)
        return None

    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
    return parsed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
if __name__ == "__main__":
    # Command-line entry point: expects exactly three path arguments.
    import sys

    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and can be missing (e.g. python -S, frozen apps).
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])