Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

PDF-Data_Extractor / update_docx_with_pdf.py

Shami96

Update update_docx_with_pdf.py

89ec944 verified 3 months ago

raw

history blame

13.5 kB

	#!/usr/bin/env python3
	"""
	Enhanced update_docx_with_pdf.py with better JSON structure handling
	"""

	import os
	import sys
	import json
	import time
	import re
	from typing import Optional

	try:
	from openai import OpenAI
	except Exception:
	OpenAI = None

	# Config
	# Number of model attempts update_json_with_pdf makes before falling back
	# to writing the unmodified template.
	RETRIES = 3
	# Pause between failed attempts; passed straight to time.sleep(), so seconds.
	RETRY_DELAY = 1.0
	# Chat model name; overridable through the OPENAI_MODEL environment variable.
	DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
	# Forwarded as max_tokens to the chat completion call (caps the reply length).
	MAX_TOKENS = 4096
	# Forwarded as temperature to the chat completion call.
	TEMPERATURE = 0.0

	def read_any(path_or_file):
	    """Return the full content of a file path or an open file-like object.

	    Args:
	        path_or_file: Either a filesystem path (opened as UTF-8 text) or any
	            object exposing ``read()``.

	    Returns:
	        The content as ``str``. Bytes returned by a binary stream are decoded
	        as UTF-8.

	    Raises:
	        OSError: If a path cannot be opened or read.
	        UnicodeDecodeError: If the content is not valid UTF-8.
	    """
	    if not hasattr(path_or_file, "read"):
	        with open(path_or_file, "r", encoding="utf-8") as fh:
	            return fh.read()

	    # Rewind when possible so a previously consumed stream is read in full.
	    # Pipes, sockets and minimal read-only wrappers either lack seek() or
	    # raise io.UnsupportedOperation (an OSError and ValueError subclass);
	    # for those we simply read from the current position.
	    rewind = getattr(path_or_file, "seek", None)
	    if callable(rewind):
	        try:
	            rewind(0)
	        except (OSError, ValueError):
	            pass

	    content = path_or_file.read()
	    if isinstance(content, (bytes, bytearray)):
	        content = bytes(content).decode("utf-8")
	    return content

	def find_first_balanced_json(s: str) -> Optional[str]:
	    """Return the first substring of ``s`` that is a valid JSON object.

	    Every ``{`` in ``s`` is tried, left to right, as the start of an object.
	    The standard-library decoder parses from that position and reports where
	    the object ends, so braces and escaped quotes inside JSON strings are
	    handled by the real JSON grammar instead of a hand-written scanner.

	    Args:
	        s: Arbitrary text, for example a model reply that wraps JSON in
	            prose or markdown fences. May be empty or ``None``.

	    Returns:
	        The exact substring holding the first decodable JSON object, or
	        ``None`` when ``s`` is empty or contains no valid object.
	    """
	    if not s:
	        return None

	    decoder = json.JSONDecoder()
	    start = s.find("{")
	    while start != -1:
	        try:
	            # raw_decode ignores trailing text and returns the end offset.
	            _, end = decoder.raw_decode(s, start)
	        except (ValueError, RecursionError):
	            # Not a valid object from this brace; try the next one.
	            start = s.find("{", start + 1)
	        else:
	            return s[start:end]
	    return None

	def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
	    """Send one system+user exchange to the chat model and return its text.

	    The reply text is read from ``choices[0].message.content``; if that
	    access fails, ``choices[0].text`` is tried, and as a last resort the
	    string form of the whole response is used. Bytes are decoded as UTF-8
	    with replacement, a missing reply becomes ``""``, and surrounding
	    whitespace is stripped.
	    """
	    conversation = [
	        {"role": "system", "content": system_msg},
	        {"role": "user", "content": user_msg},
	    ]
	    response = client.chat.completions.create(
	        model=model_name,
	        messages=conversation,
	        max_tokens=MAX_TOKENS,
	        temperature=TEMPERATURE,
	    )

	    # Accessors in order of preference; the first one that does not raise wins.
	    accessors = (
	        lambda r: r.choices[0].message.content,
	        lambda r: r.choices[0].text,
	    )
	    for read_text in accessors:
	        try:
	            text = read_text(response)
	        except Exception:
	            continue
	        break
	    else:
	        text = str(response)

	    if isinstance(text, bytes):
	        text = text.decode("utf-8", errors="replace")
	    if not text:
	        return ""
	    return text.strip()

	def create_enhanced_prompt(word_json, pdf_text, max_structure_lines=50, max_pdf_chars=100000):
	    """Build the system and user prompts for the JSON-filling request.

	    Args:
	        word_json: The decoded JSON template whose empty fields should be
	            filled in.
	        pdf_text: Text extracted from the PDF; ``None`` is treated as empty.
	        max_structure_lines: Maximum number of structure-analysis lines
	            included in the prompt.
	        max_pdf_chars: Maximum number of PDF characters included in the
	            prompt.

	    Returns:
	        A ``(system_msg, user_prompt)`` tuple of strings.
	    """
	    structure_analysis = []

	    def analyze_structure(obj, path=""):
	        # Only dictionaries are descended into; lists are summarised by length.
	        if not isinstance(obj, dict):
	            return
	        for key, value in obj.items():
	            current_path = f"{path}.{key}" if path else key
	            if isinstance(value, dict):
	                structure_analysis.append(f" {current_path} (nested object)")
	                analyze_structure(value, current_path)
	            elif isinstance(value, list):
	                structure_analysis.append(f" {current_path} (list with {len(value)} items)")
	            elif value is None or str(value).strip() == "":
	                structure_analysis.append(f" {current_path} (EMPTY - needs data)")
	            else:
	                structure_analysis.append(f" {current_path} (has data: {str(value)[:50]}...)")

	    analyze_structure(word_json)

	    # The prompts are assembled from explicit lines so that no source-level
	    # indentation or comment text can leak into what the model receives.
	    system_msg = "\n".join([
	        "You are a precise JSON data extraction assistant.",
	        "",
	        "CRITICAL RULES:",
	        "1. Output ONLY valid JSON - no markdown, no explanations, no extra text",
	        "2. Maintain the EXACT structure provided in the template",
	        "3. Only UPDATE fields that are empty or null - do not change existing data",
	        "4. Extract data accurately from the PDF text provided",
	        "5. If you cannot find data for a field, leave it as null or empty string",
	        "6. Ensure all nested objects and arrays maintain their structure",
	    ])

	    template_dump = json.dumps(word_json, indent=2, ensure_ascii=False)
	    # Limit the analysis to its first lines for brevity.
	    analysis_excerpt = "\n".join(structure_analysis[:max_structure_lines])
	    # Truncate very long PDF text so the request stays within model limits.
	    pdf_excerpt = (pdf_text or "")[:max_pdf_chars]

	    user_prompt = "\n".join([
	        "TASK: Update this JSON template with data from the PDF text.",
	        "",
	        "JSON TEMPLATE TO UPDATE:",
	        template_dump,
	        "",
	        "STRUCTURE ANALYSIS:",
	        analysis_excerpt,
	        "",
	        "PDF SOURCE TEXT:",
	        pdf_excerpt,
	        "",
	        "EXTRACTION GUIDELINES:",
	        '- For "Operator name (Legal entity)" or similar: Extract the company name',
	        '- For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)',
	        '- For "Auditor name": Extract auditor\'s name',
	        '- For "Attendance List": Extract names and positions, format as list',
	        "- For vehicle data: Extract registration numbers, maintenance info, etc.",
	        "- For management summaries: Extract compliance details and findings",
	        "",
	        "CRITICAL: Return ONLY the updated JSON object. No other text whatsoever.",
	    ])

	    return system_msg, user_prompt

	def _write_output(output_file, payload, raw=False):
	    """Write ``payload`` to ``output_file`` (a path or a writable object).

	    With ``raw=True`` the payload is written verbatim as text; otherwise it
	    is serialized as indented JSON with non-ASCII characters preserved.
	    """
	    def emit(fh):
	        if raw:
	            fh.write(payload)
	        else:
	            json.dump(payload, fh, indent=2, ensure_ascii=False)

	    if hasattr(output_file, "write"):
	        emit(output_file)
	        output_file.flush()
	    else:
	        with open(output_file, "w", encoding="utf-8") as fh:
	            emit(fh)


	def _parse_model_output(raw_text):
	    """Decode model output as JSON, directly or via an embedded object.

	    Returns ``(True, value)`` on success and ``(False, None)`` otherwise; the
	    flag is needed because a decoded value may itself be ``None``.
	    """
	    try:
	        parsed = json.loads(raw_text)
	    except Exception as parse_error:
	        print(f"⚠️ Direct parsing failed: {parse_error}")
	    else:
	        print("✅ Model returned valid JSON directly.")
	        return True, parsed

	    # The model may have wrapped the JSON in prose or markdown fences.
	    candidate = find_first_balanced_json(raw_text)
	    if candidate:
	        try:
	            parsed = json.loads(candidate)
	        except Exception as sub_parse_error:
	            print(f"⚠️ Substring parsing also failed: {sub_parse_error}")
	        else:
	            print("✅ Successfully extracted JSON substring from model output.")
	            return True, parsed
	    return False, None


	def _repair_json(client, model_name, raw_text):
	    """Ask the model to fix malformed JSON; return ``(ok, value)``."""
	    print("🔧 Attempting JSON repair...")
	    repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."
	    repair_user = f"Fix this JSON:\n\n{raw_text}"

	    try:
	        repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
	        try:
	            repair_parsed = json.loads(repair_raw)
	        except ValueError:
	            # Tolerate fenced or annotated repair output the same way the
	            # main extraction path does; re-raise when nothing is embedded.
	            candidate = find_first_balanced_json(repair_raw)
	            if not candidate:
	                raise
	            repair_parsed = json.loads(candidate)
	    except Exception as repair_error:
	        print(f"⚠️ Repair pass failed: {repair_error}")
	        return False, None

	    print("✅ Repair pass succeeded.")
	    return True, repair_parsed


	def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
	    """Fill the empty fields of a JSON template using text extracted from a PDF.

	    The template and the PDF text are sent to the chat model, which is asked
	    to return the completed JSON. Up to ``RETRIES`` attempts are made; each
	    attempt tries direct parsing, then extraction of an embedded JSON object,
	    then a model-driven repair pass. The raw reply of every attempt is saved
	    next to the output for debugging (best effort).

	    Whenever no usable result can be produced (invalid template, missing
	    ``OPENAI_API_KEY``, OpenAI SDK not importable, or all attempts failed)
	    the original template is written to the output instead.

	    Args:
	        word_json_file: Path or readable object holding the JSON template.
	        pdf_txt_file: Path or readable object holding the PDF text.
	        output_file: Path or writable object receiving the result.

	    Returns:
	        The decoded JSON produced by the model, or ``None`` when the
	        original template was written instead.
	    """
	    # Load inputs
	    word_json_text = read_any(word_json_file)
	    pdf_txt = read_any(pdf_txt_file)

	    try:
	        word_json = json.loads(word_json_text)
	    except Exception as e:
	        print(f"⚠️ Input word_json is not valid JSON: {e}")
	        print("Writing original to output and exiting.")
	        _write_output(output_file, word_json_text, raw=True)
	        return

	    # Check API key
	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
	        _write_output(output_file, word_json)
	        return

	    if OpenAI is None:
	        print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
	        _write_output(output_file, word_json)
	        return

	    # Create enhanced prompts
	    system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)

	    print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
	    print(f"📊 PDF text has {len(pdf_txt)} characters")

	    client = OpenAI(api_key=api_key)
	    model_name = DEFAULT_MODEL

	    # Base name for the per-attempt debug dumps; does not change per attempt.
	    out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")

	    # Try multiple attempts with different strategies
	    for attempt in range(1, RETRIES + 1):
	        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")

	        # Modify prompt for different attempts
	        current_user_prompt = user_prompt
	        if attempt == 2:
	            current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
	        elif attempt >= 3:
	            current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."

	        try:
	            raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)

	            # Save raw output for debugging; a failure here must never
	            # abort the attempt, but it should not be invisible either.
	            raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
	            try:
	                with open(raw_save_path, "w", encoding="utf-8") as f:
	                    f.write(raw_text)
	            except Exception as save_error:
	                print(f"⚠️ Could not save raw model output to {raw_save_path}: {save_error}")

	            ok, parsed = _parse_model_output(raw_text)
	            if not ok:
	                if raw_text:
	                    ok, parsed = _repair_json(client, model_name, raw_text)
	                else:
	                    # Nothing to repair: asking the model to "fix" empty
	                    # text would only invite an invented document.
	                    print("⚠️ Model returned empty output; skipping repair pass.")

	            if ok:
	                # Validate structure matches original (advisory only).
	                if validate_json_structure(parsed, word_json):
	                    print("✅ JSON structure validation passed.")
	                else:
	                    print("⚠️ JSON structure differs from template, but proceeding...")

	                _write_output(output_file, parsed)
	                return parsed

	        except Exception as call_error:
	            print(f"⚠️ Exception while calling model: {call_error}")

	        # Wait before next attempt
	        if attempt < RETRIES:
	            time.sleep(RETRY_DELAY)

	    # All attempts failed
	    print("❗ All LLM attempts failed. Writing original JSON to output.")

	    try:
	        _write_output(output_file, word_json)
	        print("✅ Original JSON template written to output.")
	    except Exception as e:
	        print(f"⚠️ Failed to write original JSON: {e}")

	    return None

	def validate_json_structure(parsed_json, original_json):
	    """Check that ``parsed_json`` keeps the container layout of the template.

	    Only structure is compared, never content:

	    * where the template has an object, the result must have an object
	      containing at least the same keys (extra keys are allowed);
	    * where the template has a list, the result must have a list (its
	      items are not inspected, so lists may grow or shrink);
	    * where the template has ``null``, any value is accepted, since that is
	      a field waiting to be filled;
	    * where the template has another scalar, the result must also be a
	      scalar, but its type may differ (``""`` -> ``"text"``, ``1`` -> ``1.0``).

	    A diagnostic line is printed for the first difference found.

	    Args:
	        parsed_json: Decoded JSON produced by the model.
	        original_json: Decoded JSON template.

	    Returns:
	        ``True`` when the layout is preserved, ``False`` otherwise.
	    """
	    def report_mismatch(parsed, original, path):
	        print(f"⚠️ Type mismatch at {path or '<root>'}: {type(parsed)} vs {type(original)}")
	        return False

	    def compare_structure(parsed, original, path=""):
	        if isinstance(original, dict):
	            if not isinstance(parsed, dict):
	                return report_mismatch(parsed, original, path)
	            for key, template_value in original.items():
	                child_path = f"{path}.{key}" if path else str(key)
	                if key not in parsed:
	                    print(f"⚠️ Missing key at {child_path}")
	                    return False
	                if not compare_structure(parsed[key], template_value, child_path):
	                    return False
	            return True

	        if isinstance(original, list):
	            if not isinstance(parsed, list):
	                return report_mismatch(parsed, original, path)
	            return True

	        if original is None:
	            # An empty template slot may be filled with anything.
	            return True

	        # Scalar in the template: a container in its place changes the shape.
	        if isinstance(parsed, (dict, list)):
	            return report_mismatch(parsed, original, path)
	        return True

	    try:
	        return compare_structure(parsed_json, original_json)
	    except RecursionError:
	        # Pathologically deep nesting: treat as not validated, never raise.
	        return False

	if __name__ == "__main__":
	    # Command-line entry point:
	    #   update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>
	    if len(sys.argv) != 4:
	        # Wrong invocation is a caller error: report it on stderr with the
	        # conventional non-zero usage status.
	        print(
	            "Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>",
	            file=sys.stderr,
	        )
	        sys.exit(2)

	    try:
	        update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
	    except Exception as e:
	        # Best effort: on any unexpected failure, pass the input template
	        # through unchanged so an output file still exists.
	        print(f"Unexpected exception: {e}")
	        try:
	            with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
	                outf.write(inf.read())
	            print("Wrote original input JSON to output due to exception.")
	        except Exception as copy_error:
	            print(f"Could not copy original input JSON to output: {copy_error}")

	    # A processing failure is deliberately not reflected in the exit status.
	    sys.exit(0)