Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 4,070 Bytes

from openai import OpenAI
import json
import os

def _read_text(source) -> str:
    """Return the full text of *source*, a file path or a file-like object.

    File-like objects are rewound to the start when they support it; streams
    that cannot seek (pipes, stdin) are read from their current position
    instead of failing. Bytes content is decoded as UTF-8.
    """
    if not hasattr(source, "read"):
        with open(source, "r", encoding="utf-8") as fh:
            return fh.read()

    try:
        source.seek(0)
    except (AttributeError, OSError, ValueError):
        # Either there is no seek() at all, or the stream is not seekable
        # (io.UnsupportedOperation derives from both OSError and ValueError).
        # Reading from the current position is the best we can do; a truly
        # broken stream will still fail on read() below.
        pass

    content = source.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content


def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one.

    Handles both a bare opening fence and one carrying a language tag (for
    example a fence line ending in ``json``). Text that does not start with
    a fence is returned stripped but otherwise unchanged.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped

    # Drop the opening fence line (it may carry a language tag) and, when
    # present, the closing fence line.
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def _write_json(parsed, output_file) -> None:
    """Write *parsed* as indented UTF-8 JSON to a path or a writable stream."""
    if not hasattr(output_file, "write"):
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(parsed, fh, indent=2, ensure_ascii=False)
        return

    text = json.dumps(parsed, indent=2, ensure_ascii=False)
    try:
        output_file.write(text)
    except TypeError:
        # Binary-mode stream: it only accepts bytes, so encode the same way
        # the path branch above does.
        output_file.write(text.encode("utf-8"))

    flush = getattr(output_file, "flush", None)
    if callable(flush):
        flush()


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with values taken from PDF text via the OpenAI API.

    The template and the PDF text are embedded in a prompt sent to the
    ``gpt-4o`` chat model. If the reply parses as JSON it is written to
    ``output_file``; otherwise the raw reply and the parse error are printed
    and nothing is written.

    Args:
        word_json_file: file-like object or file path (docx extraction JSON).
        pdf_txt_file: file-like object or file path (PDF plain text).
        output_file: file-like object (opened for writing, text or binary)
            or file path.

    Returns:
        None.

    Raises:
        RuntimeError: if ``OPENAI_API_KEY`` is not set in the environment.
        OSError: if an input cannot be read or the output cannot be written.

    Errors raised by the OpenAI client itself are not caught here.
    """
    # --- Load files ---
    word_json = _read_text(word_json_file)
    pdf_txt = _read_text(pdf_txt_file)

    # --- Build prompt ---
    user_prompt = f"""Here is a JSON template. It contains only the fields that need updating:

{word_json}

Here is the extracted text from a PDF:

{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Update ALL nested sections properly (like "Operator Declaration" with its "Print Name" and "Position Title")
- Make sure to update both the main sections AND the flattened keys (like "Operator Declaration.Print Name")

CRITICAL - For Operator Declaration section:
- The "Print Name" should be the OPERATOR/COMPANY REPRESENTATIVE's name, NOT the auditor's name
- Look for the "OPERATOR DECLARATION" section at the end of the document
- The person signing the operator declaration is usually someone from the company like a manager, compliance officer, or director
- Common examples: "Peter Sheppard", "Jeff Nitschke", etc.
- AVOID using the auditor's name (typically "Greg Dyer" in these documents)
- The "Position Title" should be their job role (e.g., "Compliance Officer", "Director", "Manager", "WHSE Compliance Officer")

For Attendance List:
- Extract all people listed with their roles (e.g., "Peter Sheppard - Compliance", "Greg Dyer - Auditor")
- Include both operator staff and auditor in the attendance list

- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
- Update operator names, auditor names, and all personal details consistently throughout all sections."""

    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")

    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON. Update ALL sections consistently with the same data."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=4096,
        temperature=0
    )

    # The message content can be None (e.g. a refusal), so normalise it to
    # an empty string and let the JSON parse below report the failure.
    updated_json_str = (response.choices[0].message.content or "").strip()

    # --- Try to parse as JSON ---
    # Models sometimes wrap the answer in a Markdown fence despite the
    # instructions; parse the unwrapped text but keep the raw reply for
    # diagnostics.
    try:
        parsed = json.loads(_strip_code_fence(updated_json_str))
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str)
        print("\n❌ Failed to parse updated JSON:", e)
        return

    # Writing happens outside the try block so that an I/O failure is not
    # misreported as invalid model output.
    _write_json(parsed, output_file)
    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))

if __name__ == "__main__":
    import sys

    # Command-line entry point: expects exactly three arguments, the template
    # JSON path, the PDF text path and the output JSON path, in that order.
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # Use sys.exit rather than the bare exit() helper: exit() is injected
        # by the site module for interactive use and is not guaranteed to
        # exist (e.g. under `python -S`).
        sys.exit(1)
    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])