File size: 4,070 Bytes
5244c54
3edd648
5244c54
89ec944
5244c54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89ec944
5244c54
 
89ec944
5244c54
 
543101d
5244c54
f486b52
5244c54
543101d
5244c54
e8b46b5
5244c54
 
 
 
 
f5393f7
 
 
 
 
 
 
 
 
 
 
 
 
5244c54
 
543101d
3edd648
5244c54
876a319
 
5244c54
25603c9
89ec944
5244c54
 
 
 
 
 
 
 
 
 
f486b52
5244c54
3edd648
5244c54
3edd648
5244c54
3edd648
5244c54
3edd648
 
89ec944
5244c54
 
3edd648
5244c54
 
 
f486b52
876a319
5244c54
876a319
 
5244c54
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from openai import OpenAI
import json
import os

def _read_any(source):
    """Return the full text of *source* (a path or a readable file-like object).

    File-like objects are rewound to the start when they support seeking;
    streams that cannot seek are read from their current position. Bytes
    content is decoded as UTF-8.
    """
    if not hasattr(source, "read"):
        with open(source, "r", encoding="utf-8") as fh:
            return fh.read()

    try:
        source.seek(0)
    except (AttributeError, OSError):
        # Non-seekable stream (pipe, network body, minimal wrapper). The io
        # classes raise UnsupportedOperation here, which is an OSError
        # subclass. Fall back to reading whatever is available.
        pass

    content = source.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content


def _strip_code_fences(text):
    """Remove one surrounding Markdown code fence (``` or ```json) if present."""
    text = text.strip()
    if len(text) < 6 or not (text.startswith("```") and text.endswith("```")):
        return text

    inner = text[3:-3]
    first_line, newline, rest = inner.partition("\n")
    tag = first_line.strip()
    # Drop the opening fence line when it is empty or just a language tag.
    if newline and (not tag or tag.isalnum()):
        inner = rest
    return inner.strip()


def _build_prompt(word_json, pdf_txt):
    """Build the user prompt embedding the JSON template and the PDF text."""
    return f"""Here is a JSON template. It contains only the fields that need updating:

{word_json}

Here is the extracted text from a PDF:

{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Update ALL nested sections properly (like "Operator Declaration" with its "Print Name" and "Position Title")
- Make sure to update both the main sections AND the flattened keys (like "Operator Declaration.Print Name")

CRITICAL - For Operator Declaration section:
- The "Print Name" should be the OPERATOR/COMPANY REPRESENTATIVE's name, NOT the auditor's name
- Look for the "OPERATOR DECLARATION" section at the end of the document
- The person signing the operator declaration is usually someone from the company like a manager, compliance officer, or director
- Common examples: "Peter Sheppard", "Jeff Nitschke", etc.
- AVOID using the auditor's name (typically "Greg Dyer" in these documents)
- The "Position Title" should be their job role (e.g., "Compliance Officer", "Director", "Manager", "WHSE Compliance Officer")

For Attendance List:
- Extract all people listed with their roles (e.g., "Peter Sheppard - Compliance", "Greg Dyer - Auditor")
- Include both operator staff and auditor in the attendance list

- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
- Update operator names, auditor names, and all personal details consistently throughout all sections."""


def _write_json(parsed, output_file):
    """Serialize *parsed* as indented UTF-8 JSON to a path or a writable object."""
    if hasattr(output_file, "write"):
        json.dump(parsed, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(parsed, f, indent=2, ensure_ascii=False)


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with data from PDF text using the OpenAI chat API.

    The template and the PDF text are embedded in a prompt sent to the
    ``gpt-4o`` model. The reply is parsed as JSON and written to
    *output_file*. Failures to parse the reply or to write the result are
    reported on stdout rather than raised, so nothing is written when the
    reply is not valid JSON.

    Args:
        word_json_file: file-like object or file path (docx extraction JSON).
        pdf_txt_file: file-like object or file path (PDF plain text).
        output_file: file-like object (opened for writing) or file path.

    Raises:
        RuntimeError: if ``OPENAI_API_KEY`` is not set in the environment.
    """
    # --- Load files ---
    word_json = _read_any(word_json_file)
    pdf_txt = _read_any(pdf_txt_file)

    # --- Build prompt ---
    user_prompt = _build_prompt(word_json, pdf_txt)

    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")

    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON. Update ALL sections consistently with the same data."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=4096,
        temperature=0
    )

    # The message content can be missing (e.g. a refusal); treat that as an
    # empty reply so it takes the normal "invalid JSON" path below.
    updated_json_str = (response.choices[0].message.content or "").strip()

    # --- Try to parse as JSON ---
    # Models sometimes wrap the reply in a Markdown fence despite the
    # instructions; tolerate that instead of rejecting usable JSON.
    try:
        parsed = json.loads(_strip_code_fences(updated_json_str))
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str)
        print("\n❌ Failed to parse updated JSON:", e)
        return

    # --- Save result ---
    # Handled separately from parsing so a write failure is not misreported
    # as an invalid model reply.
    try:
        _write_json(parsed, output_file)
    except (OSError, TypeError, ValueError) as e:
        print("❌ Failed to write updated JSON:", e)
        return

    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))

# Command-line entry point: expects exactly three positional arguments
# (template JSON path, PDF text path, output JSON path).
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing under `python -S` or frozen builds.
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])