Shami96 committed on
Commit
25603c9
·
verified ·
1 Parent(s): 1055fe1

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +30 -9
update_docx_with_pdf.py CHANGED
@@ -1,20 +1,36 @@
1
- import openai
2
  import json
3
  import os
4
 
5
  def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
 
 
 
 
 
6
  # --- Load files ---
7
- with open(word_json_file, "r", encoding="utf-8") as f:
8
- word_json = f.read()
9
- with open(pdf_txt_file, "r", encoding="utf-8") as f:
10
- pdf_txt = f.read()
 
 
 
 
 
 
 
 
 
11
 
12
  # --- Build prompt ---
13
  user_prompt = f"""
14
  Here is a JSON template. It contains only the fields that need updating:
15
  {word_json}
 
16
  Here is the extracted text from a PDF:
17
  {pdf_txt}
 
18
  Instructions:
19
  - ONLY update the fields present in the JSON template, using information from the PDF text.
20
  - DO NOT add any extra fields, and do not change the JSON structure.
@@ -26,7 +42,8 @@ Instructions:
26
  api_key = os.environ.get("OPENAI_API_KEY")
27
  if not api_key:
28
  raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
29
- client = openai.OpenAI(api_key=api_key)
 
30
  response = client.chat.completions.create(
31
  model="gpt-4o",
32
  messages=[
@@ -42,9 +59,13 @@ Instructions:
42
  # --- Try to parse as JSON ---
43
  try:
44
  parsed = json.loads(updated_json_str)
45
- with open(output_file, "w", encoding="utf-8") as f:
46
- json.dump(parsed, f, indent=2, ensure_ascii=False)
47
- print("✅ JSON updated and saved to", output_file)
 
 
 
 
48
  except Exception as e:
49
  print("⚠️ Model did not return valid JSON. Raw output below:\n")
50
  print(updated_json_str)
 
1
+ from openai import OpenAI
2
  import json
3
  import os
4
 
5
  def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
6
+ """
7
+ word_json_file: file-like object or file path (docx extraction JSON)
8
+ pdf_txt_file: file-like object or file path (PDF plain text)
9
+ output_file: file-like object (opened for writing) or file path
10
+ """
11
  # --- Load files ---
12
+ def read_any(f):
13
+ if hasattr(f, "read"):
14
+ f.seek(0)
15
+ content = f.read()
16
+ if isinstance(content, bytes):
17
+ content = content.decode("utf-8")
18
+ return content
19
+ else:
20
+ with open(f, "r", encoding="utf-8") as fh:
21
+ return fh.read()
22
+
23
+ word_json = read_any(word_json_file)
24
+ pdf_txt = read_any(pdf_txt_file)
25
 
26
  # --- Build prompt ---
27
  user_prompt = f"""
28
  Here is a JSON template. It contains only the fields that need updating:
29
  {word_json}
30
+
31
  Here is the extracted text from a PDF:
32
  {pdf_txt}
33
+
34
  Instructions:
35
  - ONLY update the fields present in the JSON template, using information from the PDF text.
36
  - DO NOT add any extra fields, and do not change the JSON structure.
 
42
  api_key = os.environ.get("OPENAI_API_KEY")
43
  if not api_key:
44
  raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
45
+ client = OpenAI(api_key=api_key)
46
+
47
  response = client.chat.completions.create(
48
  model="gpt-4o",
49
  messages=[
 
59
  # --- Try to parse as JSON ---
60
  try:
61
  parsed = json.loads(updated_json_str)
62
+ if hasattr(output_file, "write"):
63
+ json.dump(parsed, output_file, indent=2, ensure_ascii=False)
64
+ output_file.flush()
65
+ else:
66
+ with open(output_file, "w", encoding="utf-8") as f:
67
+ json.dump(parsed, f, indent=2, ensure_ascii=False)
68
+ print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
69
  except Exception as e:
70
  print("⚠️ Model did not return valid JSON. Raw output below:\n")
71
  print(updated_json_str)