Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 3,111 Bytes

# update_docx_with_pdf.py
from openai import OpenAI
import json
import os
import time

def read_any(f):
    """Return the full text content of *f*.

    Args:
        f: Either an object exposing ``read()`` (text or binary stream) or
            something ``open()`` accepts as a path.

    Returns:
        The content as ``str``. Bytes returned by a stream are decoded as
        UTF-8; paths are opened in text mode with UTF-8 encoding.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
        OSError: If a path cannot be opened or read.
    """
    if not hasattr(f, "read"):
        with open(f, "r", encoding="utf-8") as fh:
            return fh.read()

    # Rewind so a stream that was already consumed is read from the start.
    # Rewinding is best-effort: pipes/stdin raise io.UnsupportedOperation
    # (an OSError subclass) and minimal wrappers may not define seek() at
    # all; in both cases read from the current position instead of crashing.
    try:
        f.seek(0)
    except (AttributeError, OSError):
        pass

    content = f.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content

def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line, which may carry a language tag ("```json").
    _, _, body = text.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def _write_json(data, output_file):
    """Write *data* as indented JSON to a path or a writable file-like object."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file, *,
                         model="gpt-4o", max_attempts=3):
    """Fill a JSON template from PDF text via the OpenAI chat API and save it.

    The template and the PDF text are embedded in a prompt; the model's reply
    is parsed as JSON, top-level keys not present in the template are pruned,
    and the result is written to *output_file*.

    Args:
        word_json_file: Path or readable object holding the JSON template.
        pdf_txt_file: Path or readable object holding the extracted PDF text.
        output_file: Path or writable object that receives the updated JSON.
        model: Chat model name passed to the API.
        max_attempts: How many times to query the model before giving up.

    Raises:
        RuntimeError: If the template is not valid JSON, if the
            ``OPENAI_API_KEY`` environment variable is missing, or if no
            attempt produced usable JSON.
    """
    word_json = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    # Validate the template once, before any API call: a broken template is
    # a caller error and must not be reported (and retried) as a model failure.
    try:
        template = json.loads(word_json)
    except json.JSONDecodeError as err:
        raise RuntimeError(f"Template JSON is not valid JSON: {err}") from err
    # Key pruning only makes sense for object templates; None disables it.
    template_keys = set(template) if isinstance(template, dict) else None

    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- If a field cannot be populated, keep its original value.
"""

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)

    # Try a small number of attempts if the model returns text instead of JSON
    for attempt in range(max_attempts):
        if attempt:
            # Pause between attempts only; never after the final failure.
            time.sleep(1)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role":"system","content":"You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting."},
                {"role":"user","content":user_prompt}
            ],
            max_tokens=4096,
            temperature=0
        )
        # content may be None; treat that as an empty reply so it is
        # handled by the normal invalid-JSON path below.
        raw_output = (response.choices[0].message.content or "").strip()

        try:
            # Tolerate a Markdown fence around otherwise valid JSON.
            parsed = json.loads(_strip_code_fences(raw_output))
        except json.JSONDecodeError:
            print("⚠️ Model output was not valid JSON. Raw output (truncated):")
            print(raw_output[:2000])
            continue

        if template_keys is not None:
            if not isinstance(parsed, dict):
                # The template is an object, so any other JSON type means the
                # model changed the structure; treat it as a failed attempt.
                print("⚠️ Model returned JSON that is not an object. Raw output (truncated):")
                print(raw_output[:2000])
                continue
            added = parsed.keys() - template_keys
            if added:
                print("⚠️ Model returned extra top-level keys; pruning:", added)
                for ak in added:
                    parsed.pop(ak, None)

        _write_json(parsed, output_file)
        print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
        return

    raise RuntimeError("Model failed to return valid JSON after retries.")

if __name__ == "__main__":
    import sys

    # CLI entry point: expects exactly three arguments after the script name
    # (template JSON path, PDF text path, output JSON path).
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # Use sys.exit, not the bare exit(): the latter is injected by the
        # `site` module for interactive use and is missing under `python -S`
        # or in frozen/embedded interpreters.
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])