Spaces:
Running
Running
File size: 2,007 Bytes
e8b46b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
import json
import os

import openai
# The API key is read from the environment so that no secret lives in the
# source file. SECURITY: an earlier revision of this script embedded a literal
# key at this spot; that key must be treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Input and output file locations (resolved against the working directory).
WORD_JSON_FILE = "word_red_data.json"
PDF_TEXT_FILE = "pdf_all_text_full.txt"
OUTPUT_FILE = "updated_word_data1.json"

# System message sent with every request.
SYSTEM_PROMPT = (
    "You are a data extraction assistant. Only reply with valid JSON. "
    "Do not add any extra text or formatting. "
    "Do NOT use markdown/code blocks, just output JSON."
)


def read_text(path: str) -> str:
    """Return the full contents of the UTF-8 text file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def build_user_prompt(word_json: str, pdf_txt: str) -> str:
    """Return the user message embedding the JSON template and the PDF text.

    Both arguments are inserted verbatim; no validation is performed here.
    """
    return f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
"""


def strip_code_fence(text: str) -> str:
    """Return *text* stripped of whitespace and of one surrounding ``` fence.

    The fence is removed only when the first line starts with ``` and the last
    line is exactly ```; anything else is returned merely whitespace-stripped.
    """
    stripped = text.strip()
    lines = stripped.splitlines()
    fenced = (
        len(lines) >= 2
        and lines[0].startswith("```")
        and lines[-1].strip() == "```"
    )
    if fenced:
        return "\n".join(lines[1:-1]).strip()
    return stripped


def main() -> None:
    """Ask the model to fill the JSON template from the PDF text and save it.

    Reads WORD_JSON_FILE and PDF_TEXT_FILE, sends one chat completion request,
    and writes the parsed reply to OUTPUT_FILE. If the reply is not valid JSON
    the raw reply and the parse error are printed and nothing is written.

    Raises:
        SystemExit: if the OPENAI_API_KEY environment variable is not set.
    """
    if not OPENAI_API_KEY:
        raise SystemExit("❌ OPENAI_API_KEY environment variable is not set.")

    # --- Load files ---
    word_json = read_text(WORD_JSON_FILE)
    pdf_txt = read_text(PDF_TEXT_FILE)

    # --- Build prompt ---
    user_prompt = build_user_prompt(word_json, pdf_txt)

    # --- Call OpenAI API ---
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=4096,
        temperature=0,
    )
    choice = response.choices[0]
    # content can be None (no text returned); treat that as an empty reply
    # instead of crashing on .strip().
    updated_json_str = strip_code_fence(choice.message.content or "")

    # --- Try to parse as JSON ---
    # Only the parse is guarded, so a failure while writing OUTPUT_FILE is not
    # misreported as a bad model reply.
    try:
        parsed = json.loads(updated_json_str)
    except json.JSONDecodeError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(updated_json_str)
        print("\n❌ Failed to parse updated JSON:", e)
        if choice.finish_reason == "length":
            # The reply hit max_tokens, so the JSON is most likely cut short.
            print("ℹ️ The reply was truncated at the max_tokens limit.")
        return

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2, ensure_ascii=False)
    print("✅ JSON updated and saved to", OUTPUT_FILE)


if __name__ == "__main__":
    main()