Spaces:
Running
Running
Update update_docx_with_pdf.py
Browse files- update_docx_with_pdf.py +30 -9
update_docx_with_pdf.py
CHANGED
|
@@ -1,20 +1,36 @@
|
|
| 1 |
-
import
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
|
| 5 |
def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
# --- Load files ---
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# --- Build prompt ---
|
| 13 |
user_prompt = f"""
|
| 14 |
Here is a JSON template. It contains only the fields that need updating:
|
| 15 |
{word_json}
|
|
|
|
| 16 |
Here is the extracted text from a PDF:
|
| 17 |
{pdf_txt}
|
|
|
|
| 18 |
Instructions:
|
| 19 |
- ONLY update the fields present in the JSON template, using information from the PDF text.
|
| 20 |
- DO NOT add any extra fields, and do not change the JSON structure.
|
|
@@ -26,7 +42,8 @@ Instructions:
|
|
| 26 |
api_key = os.environ.get("OPENAI_API_KEY")
|
| 27 |
if not api_key:
|
| 28 |
raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
|
| 29 |
-
client =
|
|
|
|
| 30 |
response = client.chat.completions.create(
|
| 31 |
model="gpt-4o",
|
| 32 |
messages=[
|
|
@@ -42,9 +59,13 @@ Instructions:
|
|
| 42 |
# --- Try to parse as JSON ---
|
| 43 |
try:
|
| 44 |
parsed = json.loads(updated_json_str)
|
| 45 |
-
|
| 46 |
-
json.dump(parsed,
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
except Exception as e:
|
| 49 |
print("⚠️ Model did not return valid JSON. Raw output below:\n")
|
| 50 |
print(updated_json_str)
|
|
|
|
| 1 |
+
from openai import OpenAI
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
|
| 5 |
def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
|
| 6 |
+
"""
|
| 7 |
+
word_json_file: file-like object or file path (docx extraction JSON)
|
| 8 |
+
pdf_txt_file: file-like object or file path (PDF plain text)
|
| 9 |
+
output_file: file-like object (opened for writing) or file path
|
| 10 |
+
"""
|
| 11 |
# --- Load files ---
|
| 12 |
+
def read_any(f):
    """Return the entire content of *f* as a ``str``.

    Args:
        f: Either a file-like object (anything exposing ``read``) or a
            file-system path accepted by ``open``.

    Returns:
        The full text. Binary payloads (``bytes``/``bytearray``) returned by
        a file-like object's ``read`` are decoded as UTF-8; paths are opened
        in text mode with UTF-8 encoding.

    Raises:
        OSError: If a path cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if not hasattr(f, "read"):
        # Not file-like: treat it as a path.
        with open(f, "r", encoding="utf-8") as fh:
            return fh.read()

    # Rewind so a handle that was already consumed is read from the start,
    # but only when the stream supports it: pipes, stdin and minimal
    # file-like wrappers either lack ``seek`` or raise when it is called.
    seek = getattr(f, "seek", None)
    seekable = getattr(f, "seekable", None)
    if callable(seek) and (not callable(seekable) or seekable()):
        seek(0)

    content = f.read()
    if isinstance(content, (bytes, bytearray)):
        content = content.decode("utf-8")
    return content
|
| 22 |
+
|
| 23 |
+
word_json = read_any(word_json_file)
|
| 24 |
+
pdf_txt = read_any(pdf_txt_file)
|
| 25 |
|
| 26 |
# --- Build prompt ---
|
| 27 |
user_prompt = f"""
|
| 28 |
Here is a JSON template. It contains only the fields that need updating:
|
| 29 |
{word_json}
|
| 30 |
+
|
| 31 |
Here is the extracted text from a PDF:
|
| 32 |
{pdf_txt}
|
| 33 |
+
|
| 34 |
Instructions:
|
| 35 |
- ONLY update the fields present in the JSON template, using information from the PDF text.
|
| 36 |
- DO NOT add any extra fields, and do not change the JSON structure.
|
|
|
|
| 42 |
api_key = os.environ.get("OPENAI_API_KEY")
|
| 43 |
if not api_key:
|
| 44 |
raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
|
| 45 |
+
client = OpenAI(api_key=api_key)
|
| 46 |
+
|
| 47 |
response = client.chat.completions.create(
|
| 48 |
model="gpt-4o",
|
| 49 |
messages=[
|
|
|
|
| 59 |
# --- Try to parse as JSON ---
|
| 60 |
try:
|
| 61 |
parsed = json.loads(updated_json_str)
|
| 62 |
+
if hasattr(output_file, "write"):
|
| 63 |
+
json.dump(parsed, output_file, indent=2, ensure_ascii=False)
|
| 64 |
+
output_file.flush()
|
| 65 |
+
else:
|
| 66 |
+
with open(output_file, "w", encoding="utf-8") as f:
|
| 67 |
+
json.dump(parsed, f, indent=2, ensure_ascii=False)
|
| 68 |
+
print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
|
| 69 |
except Exception as e:
|
| 70 |
print("⚠️ Model did not return valid JSON. Raw output below:\n")
|
| 71 |
print(updated_json_str)
|