import json
import os
import sys

from openai import OpenAI

# System message sent with every request; it tells the model to answer with
# bare JSON only.
_SYSTEM_PROMPT = (
    "You are a data extraction assistant. Only reply with valid JSON. "
    "Do not add any extra text or formatting. "
    "Do NOT use markdown/code blocks, just output JSON."
)


def _read_any(source):
    """Return the full text of *source*, a file-like object or a file path.

    File-like objects are rewound first when they support seeking; bytes
    content is decoded as UTF-8. Paths are opened as UTF-8 text.
    """
    if hasattr(source, "read"):
        # Pipes and similar streams expose seek() but raise when it is called,
        # so only rewind when the object does not report itself non-seekable.
        seekable = getattr(source, "seekable", None)
        if seekable is None or seekable():
            source.seek(0)
        content = source.read()
        if isinstance(content, bytes):
            content = content.decode("utf-8")
        return content
    with open(source, "r", encoding="utf-8") as fh:
        return fh.read()


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    lines = text.splitlines()
    if len(lines) < 2 or not lines[0].startswith("```"):
        return text
    # Drop the opening fence line (it may carry a language tag such as "json")
    # and the closing fence line when present.
    lines = lines[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def _write_json(data, output_file):
    """Serialize *data* as indented JSON to a writable object or a file path."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=2, ensure_ascii=False)


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with values found in PDF text, using the OpenAI API.

    Both inputs are read in full and embedded in a prompt sent to the
    ``gpt-4o`` chat model. The reply is parsed as JSON and, when parsing
    succeeds, written to *output_file* with two-space indentation.

    Args:
        word_json_file: file-like object or file path (docx extraction JSON).
        pdf_txt_file: file-like object or file path (PDF plain text).
        output_file: file-like object (opened for writing) or file path.

    Returns:
        True when the reply was valid JSON and has been written, False when
        the reply could not be parsed (the raw reply is printed instead and
        nothing is written).

    Raises:
        RuntimeError: if the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
        OSError: if an input cannot be read or the output cannot be written.
    """
    # --- Load files ---
    word_json = _read_any(word_json_file)
    pdf_txt = _read_any(pdf_txt_file)

    # --- Build prompt ---
    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}

Here is the extracted text from a PDF:
{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
"""

    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")

    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=4096,
        temperature=0,
    )
    # The API declares message.content as nullable; treat a missing reply as
    # empty text so it is reported through the invalid-JSON path below.
    updated_json_str = (response.choices[0].message.content or "").strip()

    # --- Try to parse as JSON ---
    # Only parsing is guarded here: a failure to write the output file must
    # not be reported as "invalid JSON from the model".
    try:
        parsed = json.loads(_strip_code_fence(updated_json_str))
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str)
        print("\n❌ Failed to parse updated JSON:", e)
        return False

    _write_json(parsed, output_file)
    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
    return True


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print(
            "Usage: python update_docx_with_pdf.py "
            "<word_json_file> <pdf_txt_file> <output_file>"
        )
        sys.exit(1)
    succeeded = update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
    # Exit non-zero when the model reply was unusable so shell callers notice.
    sys.exit(0 if succeeded else 1)