PDF-Data_Extractor / update_docx_with_pdf.py
Shami96's picture
Update update_docx_with_pdf.py
25603c9 verified
raw
history blame
2.89 kB
from openai import OpenAI
import json
import os
def _read_any(source):
    """Return the full text of *source*, a readable object or a file path."""
    if not hasattr(source, "read"):
        with open(source, "r", encoding="utf-8") as fh:
            return fh.read()
    # Rewind so an already-consumed handle is read from the start. Streams
    # that report themselves as non-seekable (pipes, stdin) are read from
    # their current position instead of crashing on seek().
    seekable = getattr(source, "seekable", None)
    if seekable is None or seekable():
        source.seek(0)
    content = source.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content


def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    first_newline = text.find("\n")
    if first_newline == -1:
        return text
    # Drop the opening fence line (which may carry a language tag such as
    # "json") and, when present, the closing fence.
    body = text[first_newline + 1:].rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def _write_json(data, output_file):
    """Serialize *data* as indented JSON to a writable object or a file path."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=2, ensure_ascii=False)


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file, *, model="gpt-4o"):
    """Fill a JSON template with values taken from PDF text via the OpenAI API.

    Both inputs are read as text, embedded in a prompt, and sent to the chat
    completions endpoint. If the reply parses as JSON it is written to
    *output_file*; otherwise the raw reply and the parse error are printed and
    nothing is written.

    Args:
        word_json_file: file-like object or file path (docx extraction JSON).
        pdf_txt_file: file-like object or file path (PDF plain text).
        output_file: file-like object (opened for writing) or file path.
        model: name of the chat model to call; defaults to "gpt-4o".

    Returns:
        None.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set in the environment.
        OSError: if an input cannot be read or the output cannot be written.
            Write failures are no longer reported as "invalid JSON".
    """
    # --- Load files ---
    word_json = _read_any(word_json_file)
    pdf_txt = _read_any(pdf_txt_file)

    # --- Build prompt ---
    # The prompt lines sit at column 0 so that no source indentation leaks
    # into the text sent to the model.
    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
"""

    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=4096,
        temperature=0,
    )

    # The API types `content` as optional; treat a missing reply as empty so
    # it is reported as unparsable instead of raising AttributeError.
    raw_reply = (response.choices[0].message.content or "").strip()
    # Models sometimes wrap the answer in a ``` fence despite the instructions.
    updated_json_str = _strip_code_fences(raw_reply)

    # --- Try to parse as JSON ---
    # Only the parse step is guarded: a failure to write the output file is a
    # different problem and must not be misreported as a bad model reply.
    try:
        parsed = json.loads(updated_json_str)
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(raw_reply)
        print("\n❌ Failed to parse updated JSON:", e)
        return

    _write_json(parsed, output_file)
    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
if __name__ == "__main__":
    # Command-line entry point: expects exactly three positional arguments
    # (template JSON path, PDF text path, output JSON path).
    import sys

    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # sys.exit rather than the bare exit() helper: the latter is injected
        # by the `site` module and is absent under `python -S` or when frozen.
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])