Spaces:
Running
Running
File size: 4,070 Bytes
5244c54 3edd648 5244c54 89ec944 5244c54 89ec944 5244c54 89ec944 5244c54 543101d 5244c54 f486b52 5244c54 543101d 5244c54 e8b46b5 5244c54 f5393f7 5244c54 543101d 3edd648 5244c54 876a319 5244c54 25603c9 89ec944 5244c54 f486b52 5244c54 3edd648 5244c54 3edd648 5244c54 3edd648 5244c54 3edd648 89ec944 5244c54 3edd648 5244c54 f486b52 876a319 5244c54 876a319 5244c54 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
from openai import OpenAI
import json
import os
def _read_any(f):
    """Return the full text of *f*, given a file-like object or a file path.

    File-like objects are rewound when possible and their bytes content is
    decoded as UTF-8; paths are opened as UTF-8 text.
    """
    if not hasattr(f, "read"):
        with open(f, "r", encoding="utf-8") as fh:
            return fh.read()
    # Rewind so an already-consumed stream is read from the start. Streams
    # that cannot seek (pipes, minimal wrappers exposing only read()) are
    # simply read from their current position instead of crashing.
    try:
        f.seek(0)
    except (AttributeError, OSError, ValueError):
        pass
    content = f.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content


def _strip_code_fences(text):
    """Remove one surrounding Markdown code fence (``` or ```json), if any."""
    if not text.startswith("```"):
        return text
    first_newline = text.find("\n")
    if first_newline == -1:
        return text
    body = text[first_newline + 1:].rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def _build_prompt(word_json, pdf_txt):
    """Return the user prompt embedding the JSON template and the PDF text."""
    return f"""Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Update ALL nested sections properly (like "Operator Declaration" with its "Print Name" and "Position Title")
- Make sure to update both the main sections AND the flattened keys (like "Operator Declaration.Print Name")
CRITICAL - For Operator Declaration section:
- The "Print Name" should be the OPERATOR/COMPANY REPRESENTATIVE's name, NOT the auditor's name
- Look for the "OPERATOR DECLARATION" section at the end of the document
- The person signing the operator declaration is usually someone from the company like a manager, compliance officer, or director
- Common examples: "Peter Sheppard", "Jeff Nitschke", etc.
- AVOID using the auditor's name (typically "Greg Dyer" in these documents)
- The "Position Title" should be their job role (e.g., "Compliance Officer", "Director", "Manager", "WHSE Compliance Officer")
For Attendance List:
- Extract all people listed with their roles (e.g., "Peter Sheppard - Compliance", "Greg Dyer - Auditor")
- Include both operator staff and auditor in the attendance list
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
- Update operator names, auditor names, and all personal details consistently throughout all sections."""


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with data from PDF text using the OpenAI chat API.

    Args:
        word_json_file: file-like object or file path (docx extraction JSON).
        pdf_txt_file: file-like object or file path (PDF plain text).
        output_file: file-like object (opened for writing) or file path.

    Raises:
        RuntimeError: if the OPENAI_API_KEY environment variable is not set.

    If the model's reply cannot be parsed as JSON, the raw reply and the
    parse error are printed and nothing is written. Errors raised while
    writing the output are not caught here.
    """
    # --- Load files ---
    word_json = _read_any(word_json_file)
    pdf_txt = _read_any(pdf_txt_file)

    # --- Build prompt ---
    user_prompt = _build_prompt(word_json, pdf_txt)

    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON. Update ALL sections consistently with the same data."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=4096,
        temperature=0
    )
    # The message content may be None (the API declares it nullable); treat
    # that as an empty reply so it is reported as invalid JSON below.
    updated_json_str = (response.choices[0].message.content or "").strip()

    # --- Try to parse as JSON ---
    try:
        # Tolerate a reply wrapped in a Markdown code fence even though the
        # prompt forbids it.
        parsed = json.loads(_strip_code_fences(updated_json_str))
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str)
        print("\n❌ Failed to parse updated JSON:", e)
        return

    # --- Write result ---
    # Kept outside the try block so an I/O failure is not misreported as a
    # JSON parsing problem.
    if hasattr(output_file, "write"):
        json.dump(parsed, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(parsed, f, indent=2, ensure_ascii=False)
    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
if __name__ == "__main__":
    # Command-line entry point: expects exactly three path arguments.
    import sys

    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # sys.exit is used instead of the site-provided exit() helper, which
        # is not guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])