Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

PDF-Data_Extractor / update_docx_with_pdf.py

Shami96

Upload 4 files

e8b46b5 verified 4 months ago

raw

history blame

2.01 kB

	import openai
	import json

	import os

	# --- Configuration ---
	# The OpenAI API key is taken from the OPENAI_API_KEY environment variable so
	# that no credential lives in the source tree. If the variable is unset this
	# is None; the OpenAI client is then expected to report the missing key
	# itself when it is constructed.
	# NOTE(security): a literal key used to be committed on this line. Treat that
	# key as leaked and revoke it in the OpenAI dashboard.
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

	# File names are relative to the current working directory.
	WORD_JSON_FILE = "word_red_data.json"  # JSON template whose fields get updated
	PDF_TEXT_FILE = "pdf_all_text_full.txt"  # text that is embedded in the prompt
	OUTPUT_FILE = "updated_word_data1.json"  # destination for the parsed reply

	# --- Load files ---
	# Both inputs are read whole as UTF-8 text. The JSON template is kept as a
	# raw string (it is not parsed here) because it is pasted into the prompt.
	with open(WORD_JSON_FILE, "r", encoding="utf-8") as template_fh, \
	        open(PDF_TEXT_FILE, "r", encoding="utf-8") as pdf_fh:
	    word_json = template_fh.read()
	    pdf_txt = pdf_fh.read()

	# --- Build prompt ---
	# Fixed wording of the user message. The layout contains no literal braces,
	# so str.format only fills in the two named fields below.
	_PROMPT_LAYOUT = """
	Here is a JSON template. It contains only the fields that need updating:
	{word_json}

	Here is the extracted text from a PDF:
	{pdf_txt}

	Instructions:
	- ONLY update the fields present in the JSON template, using information from the PDF text.
	- DO NOT add any extra fields, and do not change the JSON structure.
	- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
	- Make sure the JSON is valid and ready to use.
	"""

	# Insert the template and the PDF text verbatim into the layout.
	user_prompt = _PROMPT_LAYOUT.format(word_json=word_json, pdf_txt=pdf_txt)

	# --- Call OpenAI API ---
	# One chat completion request: a system message that pins the reply format
	# and a user message carrying the template, the PDF text and the rules.
	client = openai.OpenAI(api_key=OPENAI_API_KEY)
	response = client.chat.completions.create(
	    model="gpt-4o",
	    messages=[
	        {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."},
	        {"role": "user", "content": user_prompt},
	    ],
	    max_tokens=4096,
	    temperature=0,
	)

	reply = response.choices[0]

	# A reply cut off at max_tokens is almost never valid JSON; say so up front
	# so the parse failure that follows is not mistaken for a formatting issue.
	if reply.finish_reason == "length":
	    print("⚠️ Model output hit the max_tokens limit and is probably truncated.")

	# `content` is optional in the API response; fall back to an empty string
	# so that .strip() cannot raise AttributeError on None.
	updated_json_str = (reply.message.content or "").strip()

	# --- Try to parse as JSON ---
	# Only the decode step is guarded: a failure here means the model reply was
	# not valid JSON, so the raw reply is printed for inspection. Errors while
	# writing the output file are deliberately NOT caught, so they surface as
	# what they are instead of being reported as a model formatting problem.
	try:
	    parsed = json.loads(updated_json_str)
	except json.JSONDecodeError as err:
	    print("⚠️ Model did not return valid JSON. Raw output below:\n")
	    print(updated_json_str)
	    print("\n❌ Failed to parse updated JSON:", err)
	else:
	    # Re-serialise with indentation; ensure_ascii=False keeps non-ASCII
	    # characters readable in the UTF-8 output file.
	    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_fh:
	        json.dump(parsed, out_fh, indent=2, ensure_ascii=False)
	    print("✅ JSON updated and saved to", OUTPUT_FILE)