Spaces:
Running
Running
File size: 13,490 Bytes
3edd648 f486b52 89ec944 f486b52 876a319 3edd648 8001b1f 3edd648 f486b52 3edd648 e8b46b5 f486b52 3edd648 89ec944 f486b52 3edd648 8001b1f 3edd648 8001b1f 25603c9 f486b52 89ec944 3edd648 89ec944 f486b52 89ec944 f486b52 3edd648 f486b52 89ec944 f486b52 89ec944 f486b52 89ec944 f486b52 8001b1f 89ec944 3edd648 25603c9 e8b46b5 3edd648 89ec944 3edd648 89ec944 876a319 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 89ec944 25603c9 89ec944 f486b52 f9fae18 89ec944 f486b52 89ec944 f486b52 89ec944 f486b52 89ec944 f486b52 8001b1f 89ec944 f486b52 89ec944 f486b52 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 89ec944 f486b52 89ec944 f486b52 89ec944 f486b52 89ec944 f486b52 89ec944 f486b52 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 89ec944 3edd648 876a319 89ec944 f486b52 876a319 3edd648 89ec944 3edd648 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 |
#!/usr/bin/env python3
"""
Enhanced update_docx_with_pdf.py with better JSON structure handling
"""
import os
import sys
import json
import time
import re
from typing import Optional
try:
from openai import OpenAI
except Exception:
OpenAI = None
# Config
# Number of times the main extraction call is attempted before the original
# JSON is written back unchanged.
RETRIES = 3
# Seconds slept between two consecutive attempts.
RETRY_DELAY = 1.0
# Chat model name; overridable through the OPENAI_MODEL environment variable.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
# Passed as ``max_tokens`` to every chat completion request.
MAX_TOKENS = 4096
# Passed as ``temperature`` to every chat completion request.
TEMPERATURE = 0.0
def read_any(path_or_file):
    """Return the full text content of a path or of an open file-like object.

    Args:
        path_or_file: Either a filesystem path accepted by ``open`` or any
            object exposing a ``read()`` method.

    Returns:
        The content as ``str``. Bytes returned by a binary stream are decoded
        as UTF-8.

    Raises:
        OSError: If a path cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if not hasattr(path_or_file, "read"):
        with open(path_or_file, "r", encoding="utf-8") as fh:
            return fh.read()

    # Rewind so a stream that was already consumed is read from the start.
    # The rewind is best-effort: objects without ``seek`` and unseekable
    # streams (pipes, sockets) are simply read from their current position.
    rewind = getattr(path_or_file, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except (OSError, ValueError):
            # io.UnsupportedOperation derives from both OSError and ValueError.
            pass

    content = path_or_file.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content
def find_first_balanced_json(s: str) -> Optional[str]:
    """Return the first substring of *s* that parses as a JSON object.

    Every ``{`` in *s* is tried, left to right, as the start of an object.
    The standard-library decoder does the brace/string/escape bookkeeping and
    stops at the end of the object, so surrounding text (markdown fences,
    explanations) is ignored. A failed candidate is rejected at its first
    syntax error instead of being scanned to the end of the string.

    Args:
        s: Arbitrary text that may embed a JSON object.

    Returns:
        The exact substring holding the first valid JSON object, or ``None``
        when *s* is empty or contains no valid object.
    """
    if not s:
        return None

    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(s, start)
        except (ValueError, RecursionError):
            # Not a valid object from this brace; try the next one.
            pass
        else:
            return s[start:end]
        start = s.find("{", start + 1)
    return None
def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
    """Send one system/user exchange to the chat model and return its text.

    The reply text is taken from the first choice's ``message.content``; if
    that access fails, the first choice's ``text`` is used, and as a last
    resort the string form of the whole response. The result is always a
    stripped ``str`` (empty when the model returned no content).
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    response = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
    )

    def _first_choice_text():
        # Chat-style response shape first.
        try:
            return response.choices[0].message.content
        except Exception:
            pass
        # Completion-style response shape next, then the raw response repr.
        try:
            return response.choices[0].text
        except Exception:
            return str(response)

    reply = _first_choice_text()
    if isinstance(reply, bytes):
        reply = reply.decode("utf-8", errors="replace")
    if not reply:
        reply = ""
    return reply.strip()
def create_enhanced_prompt(word_json, pdf_text, *, max_structure_lines=50, max_pdf_chars=100000):
    """Build the system and user messages for the JSON-filling request.

    The user message contains the JSON template, a per-field summary of which
    template fields are empty and which already hold data, the PDF text, and
    extraction guidelines.

    Args:
        word_json: Parsed JSON template. Only dictionaries are walked for the
            field summary; lists are reported with their length.
        pdf_text: Text extracted from the PDF. ``None`` is treated as empty.
        max_structure_lines: Maximum number of field-summary lines included.
        max_pdf_chars: Maximum number of PDF characters included.

    Returns:
        A ``(system_msg, user_prompt)`` tuple of strings.
    """
    structure_analysis = []

    def analyze_structure(obj, path=""):
        if not isinstance(obj, dict):
            return
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key
            if isinstance(value, dict):
                structure_analysis.append(f" {current_path} (nested object)")
                analyze_structure(value, current_path)
            elif isinstance(value, list):
                structure_analysis.append(f" {current_path} (list with {len(value)} items)")
            elif value is None or str(value).strip() == "":
                structure_analysis.append(f" {current_path} (EMPTY - needs data)")
            else:
                structure_analysis.append(f" {current_path} (has data: {str(value)[:50]}...)")

    analyze_structure(word_json)

    system_msg = """You are a precise JSON data extraction assistant.
CRITICAL RULES:
1. Output ONLY valid JSON - no markdown, no explanations, no extra text
2. Maintain the EXACT structure provided in the template
3. Only UPDATE fields that are empty or null - do not change existing data
4. Extract data accurately from the PDF text provided
5. If you cannot find data for a field, leave it as null or empty string
6. Ensure all nested objects and arrays maintain their structure"""

    # The pieces are computed outside the f-string so that the notes about
    # limiting/truncating stay real comments and are not sent to the model.
    template_text = json.dumps(word_json, indent=2, ensure_ascii=False)
    # Limit the summary to keep the prompt short.
    analysis_text = "\n".join(structure_analysis[:max_structure_lines])
    # Truncate very long PDF text.
    pdf_excerpt = (pdf_text or "")[:max_pdf_chars]

    user_prompt = f"""TASK: Update this JSON template with data from the PDF text.
JSON TEMPLATE TO UPDATE:
{template_text}
STRUCTURE ANALYSIS:
{analysis_text}
PDF SOURCE TEXT:
{pdf_excerpt}
EXTRACTION GUIDELINES:
- For "Operator name (Legal entity)" or similar: Extract the company name
- For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)
- For "Auditor name": Extract auditor's name
- For "Attendance List": Extract names and positions, format as list
- For vehicle data: Extract registration numbers, maintenance info, etc.
- For management summaries: Extract compliance details and findings
CRITICAL: Return ONLY the updated JSON object. No other text whatsoever."""
    return system_msg, user_prompt
def _write_json_output(output_file, payload):
    """Write *payload* as indented JSON to a path or to a writable object."""
    if hasattr(output_file, "write"):
        json.dump(payload, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)


def _save_raw_model_output(output_file, attempt, raw_text):
    """Best-effort dump of the raw model reply next to the output, for debugging."""
    if isinstance(output_file, (str, os.PathLike)):
        # Keep the full path for path-like outputs, not just their basename.
        out_base = os.fspath(output_file)
    else:
        out_base = getattr(output_file, "name", "output")
    raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
    try:
        with open(raw_save_path, "w", encoding="utf-8") as f:
            f.write(raw_text)
    except (OSError, ValueError):
        # The dump is only a debugging aid; failing to write it must not
        # affect the extraction itself.
        pass


def _parse_model_output(client, model_name, raw_text, word_json):
    """Turn a model reply into parsed JSON, or return None if every strategy fails.

    Strategies, in order: parse the reply as-is, parse the first embedded JSON
    object, then ask the model to repair the reply and parse that.
    """
    try:
        parsed = json.loads(raw_text)
    except (ValueError, RecursionError) as parse_error:
        print(f"⚠️ Direct parsing failed: {parse_error}")
    else:
        print("✅ Model returned valid JSON directly.")
        # The structure check is advisory only: a mismatch is reported but
        # the model's answer is still used.
        if validate_json_structure(parsed, word_json):
            print("✅ JSON structure validation passed.")
        else:
            print("⚠️ JSON structure differs from template, but proceeding...")
        return parsed

    candidate = find_first_balanced_json(raw_text)
    if candidate:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError) as sub_parse_error:
            print(f"⚠️ Substring parsing also failed: {sub_parse_error}")
        else:
            print("✅ Successfully extracted JSON substring from model output.")
            return parsed

    print("🔧 Attempting JSON repair...")
    repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."
    repair_user = f"Fix this JSON:\n\n{raw_text}"
    try:
        repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
        repair_parsed = json.loads(repair_raw)
    except Exception as repair_error:
        # Covers both API failures and an unparseable repair reply.
        print(f"⚠️ Repair pass failed: {repair_error}")
        return None
    print("✅ Repair pass succeeded.")
    return repair_parsed


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with data from PDF text using the chat model.

    Args:
        word_json_file: Path or readable object holding the JSON template.
        pdf_txt_file: Path or readable object holding the extracted PDF text.
        output_file: Path or writable object that receives the result.

    Returns:
        The parsed JSON produced by the model when an attempt succeeds.
        ``None`` when the template is not valid JSON, when the API key or the
        OpenAI SDK is missing, or when every attempt fails; in all of those
        cases the original template is written to *output_file* instead.

    Side effects:
        Prints progress messages and, for each attempt, tries to save the raw
        model reply to ``<output>.model_raw_attempt<N>.txt``.
    """
    # Load inputs
    word_json_text = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    try:
        word_json = json.loads(word_json_text)
    except (ValueError, RecursionError) as e:
        print(f"⚠️ Input word_json is not valid JSON: {e}")
        print("Writing original to output and exiting.")
        # The input could not be parsed, so it is passed through verbatim.
        if hasattr(output_file, "write"):
            output_file.write(word_json_text)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(word_json_text)
        return None

    # Check API key
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
        _write_json_output(output_file, word_json)
        return None

    if OpenAI is None:
        print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
        _write_json_output(output_file, word_json)
        return None

    # Create enhanced prompts
    system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)
    print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
    print(f"📊 PDF text has {len(pdf_txt)} characters")

    client = OpenAI(api_key=api_key)
    model_name = DEFAULT_MODEL

    # Try multiple attempts with different strategies
    for attempt in range(1, RETRIES + 1):
        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")

        # Later attempts nudge the model toward validity over completeness.
        current_user_prompt = user_prompt
        if attempt == 2:
            current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
        elif attempt >= 3:
            current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."

        try:
            raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)
            _save_raw_model_output(output_file, attempt, raw_text)
            parsed = _parse_model_output(client, model_name, raw_text, word_json)
            if parsed is not None:
                # Writing happens outside the parse handlers so that an I/O
                # failure is not mistaken for a parse failure (which would
                # trigger a pointless repair request).
                _write_json_output(output_file, parsed)
                return parsed
        except Exception as call_error:
            print(f"⚠️ Exception while calling model: {call_error}")

        # Wait before next attempt
        if attempt < RETRIES:
            time.sleep(RETRY_DELAY)

    # All attempts failed
    print("❗ All LLM attempts failed. Writing original JSON to output.")
    try:
        _write_json_output(output_file, word_json)
        print("✅ Original JSON template written to output.")
    except Exception as e:
        print(f"⚠️ Failed to write original JSON: {e}")
    return None
def validate_json_structure(parsed_json, original_json):
    """Check that *parsed_json* keeps the shape of the template *original_json*.

    Rules applied recursively through dictionaries:
      * every key of the template must be present (extra keys are allowed);
      * a value must have the same type as the template value, except that
        a ``null`` template value accepts anything (it is an unfilled
        placeholder) and a blank-string template value may stay ``null``.

    Lists are compared by type only; their items are not inspected.
    The first violation is printed with its dotted path.

    Args:
        parsed_json: JSON value produced by the model.
        original_json: JSON template the model was asked to fill.

    Returns:
        ``True`` when the structure matches, ``False`` otherwise (including
        when the comparison itself raises).
    """

    def compare_structure(parsed, original, path=""):
        # A null placeholder carries no type to enforce: filling it with any
        # value is the expected outcome, not a mismatch.
        if original is None:
            return True
        # A blank placeholder that could not be filled may come back as null.
        if parsed is None and isinstance(original, str) and not original.strip():
            return True
        if type(parsed) is not type(original):
            print(f"⚠️ Type mismatch at {path or '<root>'}: {type(parsed)} vs {type(original)}")
            return False
        if isinstance(original, dict):
            for key, expected in original.items():
                child_path = f"{path}.{key}" if path else str(key)
                if key not in parsed:
                    print(f"⚠️ Missing key at {child_path}")
                    return False
                if not compare_structure(parsed[key], expected, child_path):
                    return False
        return True

    try:
        return compare_structure(parsed_json, original_json)
    except Exception:
        # Validation is advisory; it must never abort the caller.
        return False
# Command-line entry point: <word_json_file> <pdf_txt_file> <output_json_file>.
# NOTE(review): the script exits with status 0 on every path, including the
# usage error; this is kept as-is in case callers rely on it — confirm.
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        sys.exit(0)
    try:
        update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
    except Exception as e:
        print(f"Unexpected exception: {e}")
        # Last resort: pass the input through unchanged so an output exists.
        try:
            with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                outf.write(inf.read())
            print("Wrote original input JSON to output due to exception.")
        except (OSError, UnicodeError) as copy_error:
            # Report the failure instead of hiding it: the output file may be
            # missing or incomplete at this point.
            print(f"⚠️ Could not write original input JSON to output: {copy_error}")
    sys.exit(0)