File size: 2,225 Bytes
e8b46b5
 
876a319
e8b46b5
876a319
 
 
 
 
 
e8b46b5
f9fae18
 
e8b46b5
 
 
 
 
 
 
 
 
 
 
f9fae18
876a319
 
 
 
 
 
 
 
 
 
 
 
 
e8b46b5
f9fae18
 
 
 
 
 
 
 
 
 
 
 
876a319
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import openai
import json
import os

def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if present.

    Handles the multi-line form where the reply starts with a fence line
    (optionally tagged, e.g. ```json) and ends with a closing fence line.
    Any other text is returned stripped of surrounding whitespace only.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    lines = stripped.splitlines()
    if len(lines) < 2:
        # Single-line fenced reply: nothing safe to unwrap, leave it alone.
        return stripped
    # Drop the opening fence line, which may carry a language tag.
    lines = lines[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template from PDF text using the OpenAI chat API.

    Reads the JSON template and the extracted PDF text, asks the model to
    update the template's fields from the PDF text, and writes the parsed
    result to ``output_file`` as indented UTF-8 JSON.

    Args:
        word_json_file: Path to the JSON template file (read as UTF-8 text).
        pdf_txt_file: Path to the extracted PDF text file (read as UTF-8).
        output_file: Path the updated JSON is written to on success.

    Returns:
        None. If the model reply cannot be parsed as JSON, the raw reply and
        the parse error are printed and no output file is written.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
        OSError: If an input file cannot be read or the output file cannot
            be written.
    """
    # --- Load files ---
    with open(word_json_file, "r", encoding="utf-8") as f:
        word_json = f.read()
    with open(pdf_txt_file, "r", encoding="utf-8") as f:
        pdf_txt = f.read()

    # --- Build prompt ---
    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
"""

    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = openai.OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=4096,
        temperature=0
    )

    # The message content can be None (e.g. when the model refuses), so
    # fall back to an empty string instead of crashing on .strip().
    updated_json_str = (response.choices[0].message.content or "").strip()

    # --- Try to parse as JSON ---
    # Models sometimes wrap the reply in a Markdown code fence despite the
    # instructions; unwrap it so otherwise valid JSON is not rejected.
    try:
        parsed = json.loads(_strip_code_fences(updated_json_str))
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str)
        print("\n❌ Failed to parse updated JSON:", e)
        return

    # Written outside the try block so that a filesystem error is not
    # misreported as invalid model output.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2, ensure_ascii=False)
    print("✅ JSON updated and saved to", output_file)

if __name__ == "__main__":
    import sys

    # Command-line entry point: expects exactly three path arguments
    # (JSON template, extracted PDF text, output JSON).
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # Use sys.exit: the bare exit() helper is installed by the site
        # module for interactive use and may be missing (python -S, frozen apps).
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])