File size: 13,490 Bytes
3edd648
f486b52
89ec944
f486b52
 
876a319
3edd648
 
8001b1f
3edd648
 
 
 
 
f486b52
3edd648
e8b46b5
f486b52
3edd648
89ec944
f486b52
 
 
 
3edd648
 
 
 
 
8001b1f
 
 
 
3edd648
8001b1f
25603c9
f486b52
89ec944
3edd648
 
89ec944
f486b52
 
 
 
 
89ec944
f486b52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3edd648
 
f486b52
89ec944
f486b52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89ec944
f486b52
 
 
 
89ec944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f486b52
8001b1f
89ec944
3edd648
25603c9
e8b46b5
3edd648
 
89ec944
 
 
3edd648
 
 
 
 
 
 
 
89ec944
876a319
 
89ec944
3edd648
 
 
 
89ec944
 
3edd648
 
 
89ec944
3edd648
 
 
 
89ec944
 
3edd648
 
89ec944
 
 
 
 
25603c9
89ec944
f486b52
f9fae18
89ec944
f486b52
89ec944
 
 
 
f486b52
89ec944
f486b52
89ec944
f486b52
8001b1f
89ec944
 
 
f486b52
 
89ec944
 
 
 
 
f486b52
89ec944
3edd648
89ec944
 
 
 
 
 
 
 
 
 
3edd648
 
 
 
89ec944
 
3edd648
89ec944
 
 
 
 
 
3edd648
 
89ec944
 
 
 
3edd648
 
 
 
89ec944
 
3edd648
89ec944
 
 
 
 
 
 
 
 
f486b52
89ec944
 
 
 
 
f486b52
89ec944
f486b52
 
89ec944
 
 
 
 
 
f486b52
89ec944
 
f486b52
 
89ec944
 
3edd648
89ec944
 
 
3edd648
 
 
 
 
89ec944
 
 
3edd648
89ec944
3edd648
 
876a319
89ec944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f486b52
876a319
 
 
3edd648
 
 
 
 
89ec944
3edd648
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#!/usr/bin/env python3
"""
Enhanced update_docx_with_pdf.py with better JSON structure handling
"""

import os
import sys
import json
import time
import re
from typing import Optional

try:
    from openai import OpenAI
except Exception:
    OpenAI = None

# Config
# Maximum number of LLM attempts made by update_json_with_pdf before it falls
# back to writing the original JSON.
RETRIES = 3
# Seconds slept between two consecutive attempts.
RETRY_DELAY = 1.0
# Chat model name; taken from the OPENAI_MODEL environment variable when set.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
# Passed as max_tokens on every chat completion request.
MAX_TOKENS = 4096
# Passed as temperature on every chat completion request.
TEMPERATURE = 0.0

def read_any(path_or_file):
    """Return the full text content of a file path or a file-like object.

    Args:
        path_or_file: Either an object exposing ``read()`` (text or binary
            mode) or a filesystem path accepted by ``open()``.

    Returns:
        The content as ``str``. Bytes returned by a file-like object are
        decoded as UTF-8; paths are opened in text mode as UTF-8.

    Raises:
        OSError: If the path cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if not hasattr(path_or_file, "read"):
        with open(path_or_file, "r", encoding="utf-8") as fh:
            return fh.read()

    # Rewind so a handle that was already consumed is read from the start.
    # Streams that cannot seek (pipes, stdin, minimal read-only wrappers) are
    # read from their current position instead of failing the whole call.
    rewind = getattr(path_or_file, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except OSError:
            # io.UnsupportedOperation is an OSError subclass.
            pass

    content = path_or_file.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content

def find_first_balanced_json(s: str) -> Optional[str]:
    """Return the first substring of *s* that is a valid JSON object.

    Every ``{`` in *s* is tried, left to right, as the start of an object. The
    standard-library decoder does the parsing, so string literals, escapes and
    nesting are handled by the JSON grammar itself rather than by a manual
    brace counter.

    Args:
        s: Arbitrary text that may embed a JSON object (for example model
            output wrapped in prose or markdown fences).

    Returns:
        The exact source text of the first parseable object, or ``None`` when
        *s* is empty or contains no valid JSON object.
    """
    if not s:
        return None

    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            # raw_decode parses one document beginning at `start` and reports
            # the index just past it, ignoring whatever text follows.
            _, end = decoder.raw_decode(s, start)
        except (ValueError, RecursionError):
            # Not an object at this brace (or nested too deeply); try the next.
            pass
        else:
            return s[start:end]
        start = s.find("{", start + 1)
    return None

def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
    """Send one system+user exchange to the chat model and return its text.

    The reply text is looked up first as ``choices[0].message.content``, then
    as ``choices[0].text``; if neither lookup works the string form of the
    whole response is used. Bytes are decoded as UTF-8 (invalid sequences
    replaced), an empty reply becomes ``""``, and the result is stripped.
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    resp = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
    )

    # Ordered lookups for the reply text; the first one that works wins.
    accessors = (
        lambda r: r.choices[0].message.content,
        lambda r: r.choices[0].text,
    )
    for accessor in accessors:
        try:
            text = accessor(resp)
        except Exception:
            continue
        break
    else:
        # Neither response shape matched: fall back to the raw representation.
        text = str(resp)

    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="replace")
    if not text:
        return ""
    return text.strip()

def create_enhanced_prompt(word_json, pdf_text):
    """Build the system and user prompts for filling a JSON template from PDF text.

    Args:
        word_json: The parsed JSON template. When it is a dict, its keys are
            walked recursively (nested dicts only; lists are reported but not
            descended into) to produce a per-field summary.
        pdf_text: The extracted PDF text used as the data source.

    Returns:
        A ``(system_msg, user_prompt)`` tuple of strings. The user prompt
        embeds the template serialized as indented JSON, at most the first 50
        lines of the field summary, and at most the first 100000 characters of
        *pdf_text*.
    """
    # One human-readable line per template field, in traversal order.
    structure_analysis = []

    def analyze_structure(obj, path=""):
        if not isinstance(obj, dict):
            return
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key
            if isinstance(value, dict):
                structure_analysis.append(f"  {current_path} (nested object)")
                analyze_structure(value, current_path)
            elif isinstance(value, list):
                structure_analysis.append(f"  {current_path} (list with {len(value)} items)")
            elif value is None or str(value).strip() == "":
                structure_analysis.append(f"  {current_path} (EMPTY - needs data)")
            else:
                structure_analysis.append(f"  {current_path} (has data: {str(value)[:50]}...)")

    analyze_structure(word_json)

    # Keep the prompt bounded. These limits are applied here, in code, so that
    # no explanatory note ends up inside the text sent to the model.
    structure_summary = "\n".join(structure_analysis[:50])  # first 50 lines only
    pdf_excerpt = pdf_text[:100000]  # truncate very long documents
    template_json = json.dumps(word_json, indent=2, ensure_ascii=False)

    system_msg = """You are a precise JSON data extraction assistant. 

CRITICAL RULES:
1. Output ONLY valid JSON - no markdown, no explanations, no extra text
2. Maintain the EXACT structure provided in the template
3. Only UPDATE fields that are empty or null - do not change existing data
4. Extract data accurately from the PDF text provided
5. If you cannot find data for a field, leave it as null or empty string
6. Ensure all nested objects and arrays maintain their structure"""

    user_prompt = f"""TASK: Update this JSON template with data from the PDF text.

JSON TEMPLATE TO UPDATE:
{template_json}

STRUCTURE ANALYSIS:
{structure_summary}

PDF SOURCE TEXT:
{pdf_excerpt}

EXTRACTION GUIDELINES:
- For "Operator name (Legal entity)" or similar: Extract the company name
- For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)
- For "Auditor name": Extract auditor's name
- For "Attendance List": Extract names and positions, format as list
- For vehicle data: Extract registration numbers, maintenance info, etc.
- For management summaries: Extract compliance details and findings

CRITICAL: Return ONLY the updated JSON object. No other text whatsoever."""

    return system_msg, user_prompt

def _write_output(output_file, payload, raw=False):
    """Write *payload* to *output_file*, a path or a writable file-like object.

    With ``raw=True`` the payload is a string written verbatim; otherwise it is
    serialized as indented JSON with non-ASCII characters preserved. File-like
    targets are flushed but left open for the caller.
    """
    def _emit(handle):
        if raw:
            handle.write(payload)
        else:
            json.dump(payload, handle, indent=2, ensure_ascii=False)

    if hasattr(output_file, "write"):
        _emit(output_file)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            _emit(f)


def _validate_and_write(parsed, template, output_file):
    """Report whether *parsed* keeps the template's structure, write it, return it."""
    if validate_json_structure(parsed, template):
        print("✅ JSON structure validation passed.")
    else:
        # A differing structure is reported but deliberately not rejected.
        print("⚠️ JSON structure differs from template, but proceeding...")
    _write_output(output_file, parsed)
    return parsed


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with data extracted from PDF text by an LLM.

    The template and PDF text are read, the model is asked to return the
    updated JSON, and the first reply that can be turned into JSON is written
    to *output_file*. For each of up to ``RETRIES`` attempts the reply is
    parsed directly, then searched for an embedded JSON object, then sent
    through a model "repair" pass. Whenever no usable reply is obtained the
    original template is written instead, so the output always receives
    content unless writing itself fails.

    Side effect: each attempt's raw reply is saved, best-effort, to
    ``<output>.model_raw_attempt<N>.txt`` for debugging.

    Args:
        word_json_file: Path or readable file-like object with the JSON template.
        pdf_txt_file: Path or readable file-like object with the PDF text.
        output_file: Path or writable file-like object receiving the result.

    Returns:
        The parsed JSON value produced by the model on success; ``None`` when
        the input is not valid JSON, the API key or SDK is missing, or every
        attempt failed.
    """
    # Load inputs
    word_json_text = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    try:
        word_json = json.loads(word_json_text)
    except Exception as e:
        print(f"⚠️ Input word_json is not valid JSON: {e}")
        print("Writing original to output and exiting.")
        # The text is not JSON, so it is passed through byte-for-byte.
        _write_output(output_file, word_json_text, raw=True)
        return

    # Check API key
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
        _write_output(output_file, word_json)
        return

    if OpenAI is None:
        print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
        _write_output(output_file, word_json)
        return

    # Create enhanced prompts
    system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)

    print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
    print(f"📊 PDF text has {len(pdf_txt)} characters")

    client = OpenAI(api_key=api_key)
    model_name = DEFAULT_MODEL

    # Base name for the debug dumps. Path-like targets keep their full path;
    # file objects contribute their ``name`` attribute when they have one.
    if isinstance(output_file, (str, os.PathLike)):
        out_base = os.fspath(output_file)
    else:
        out_base = getattr(output_file, "name", "output")

    # Try multiple attempts with different strategies
    for attempt in range(1, RETRIES + 1):
        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")

        # Later attempts add a nudge that favours validity over completeness.
        current_user_prompt = user_prompt
        if attempt == 2:
            current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
        elif attempt >= 3:
            current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."

        try:
            raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)

            # Save raw output for debugging. This is best-effort: a failure
            # here is reported but must not abort the attempt.
            raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
            try:
                with open(raw_save_path, "w", encoding="utf-8") as f:
                    f.write(raw_text)
            except Exception as save_error:
                print(f"⚠️ Could not save raw model output to {raw_save_path}: {save_error}")

            # 1) Try to parse the reply directly.
            try:
                parsed = json.loads(raw_text)
            except Exception as parse_error:
                print(f"⚠️ Direct parsing failed: {parse_error}")
            else:
                print("✅ Model returned valid JSON directly.")
                return _validate_and_write(parsed, word_json, output_file)

            # 2) Try to extract a JSON object embedded in surrounding text.
            candidate = find_first_balanced_json(raw_text)
            if candidate:
                try:
                    parsed = json.loads(candidate)
                except Exception as sub_parse_error:
                    print(f"⚠️ Substring parsing also failed: {sub_parse_error}")
                else:
                    print("✅ Successfully extracted JSON substring from model output.")
                    return _validate_and_write(parsed, word_json, output_file)

            # 3) Ask the model to repair its own output.
            print("🔧 Attempting JSON repair...")
            repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."
            repair_user = f"Fix this JSON:\n\n{raw_text}"

            try:
                repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
                try:
                    repair_parsed = json.loads(repair_raw)
                except ValueError:
                    # The repaired reply may still be wrapped in extra text;
                    # accept an embedded object before giving up on it.
                    embedded = find_first_balanced_json(repair_raw)
                    if not embedded:
                        raise
                    repair_parsed = json.loads(embedded)
            except Exception as repair_error:
                print(f"⚠️ Repair pass failed: {repair_error}")
            else:
                print("✅ Repair pass succeeded.")
                return _validate_and_write(repair_parsed, word_json, output_file)

        except Exception as call_error:
            print(f"⚠️ Exception while calling model: {call_error}")

        # Wait before next attempt
        if attempt < RETRIES:
            time.sleep(RETRY_DELAY)

    # All attempts failed
    print("❗ All LLM attempts failed. Writing original JSON to output.")

    try:
        _write_output(output_file, word_json)
        print("✅ Original JSON template written to output.")
    except Exception as e:
        print(f"⚠️ Failed to write original JSON: {e}")

    return None

def validate_json_structure(parsed_json, original_json):
    """Check that *parsed_json* keeps the container structure of *original_json*.

    Only the shape is compared, not the values, because the template's
    placeholders are expected to be replaced with extracted data:

    * a dict in the template must still be a dict containing every template
      key (extra keys are allowed), checked recursively;
    * a list in the template must still be a list (items are not compared);
    * ``None`` in the template accepts any value;
    * any other scalar in the template accepts any scalar or ``None``, but not
      a dict or list.

    The first mismatch found is printed with its dotted path.

    Args:
        parsed_json: The JSON value returned by the model.
        original_json: The template the model was asked to fill.

    Returns:
        ``True`` when the structure is preserved, ``False`` otherwise
        (including when the data is nested too deeply to compare).
    """
    def report_type_mismatch(parsed, original, path):
        where = path or "<root>"
        print(f"⚠️ Type mismatch at {where}: {type(parsed)} vs {type(original)}")
        return False

    def compare_structure(parsed, original, path=""):
        if isinstance(original, dict):
            if not isinstance(parsed, dict):
                return report_type_mismatch(parsed, original, path)
            for key, expected in original.items():
                child_path = f"{path}.{key}" if path else str(key)
                if key not in parsed:
                    print(f"⚠️ Missing key at {child_path}")
                    return False
                if not compare_structure(parsed[key], expected, child_path):
                    return False
            return True

        if isinstance(original, list):
            if not isinstance(parsed, list):
                return report_type_mismatch(parsed, original, path)
            return True

        # A null placeholder carries no type information, so anything may
        # legitimately replace it.
        if original is None:
            return True

        # Scalar leaf: the value (and its scalar type) may change, but turning
        # a leaf into a container alters the structure.
        if isinstance(parsed, (dict, list)):
            return report_type_mismatch(parsed, original, path)
        return True

    try:
        return compare_structure(parsed_json, original_json)
    except RecursionError:
        return False

if __name__ == "__main__":
    # Command-line entry point: <word_json_file> <pdf_txt_file> <output_json_file>.
    # The process exits with status 0 on every path, including bad usage and
    # unexpected failures.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        sys.exit(0)

    word_json_path, pdf_txt_path, output_json_path = cli_args

    try:
        update_json_with_pdf(word_json_path, pdf_txt_path, output_json_path)
    except Exception as e:
        print(f"Unexpected exception: {e}")
        # Last resort: copy the input template to the output unchanged. Any
        # failure while doing so is ignored.
        try:
            with open(word_json_path, "r", encoding="utf-8") as inf:
                with open(output_json_path, "w", encoding="utf-8") as outf:
                    outf.write(inf.read())
                    print("Wrote original input JSON to output due to exception.")
        except Exception:
            pass
        sys.exit(0)