Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 22

Commit

5244c54

verified ·

1 Parent(s): 0322667

Update update_docx_with_pdf.py

Browse files

Files changed (1) hide show

update_docx_with_pdf.py +58 -336

update_docx_with_pdf.py CHANGED Viewed

@@ -1,359 +1,81 @@
-#!/usr/bin/env python3
-"""
-Enhanced update_docx_with_pdf.py with better JSON structure handling
-"""
-import os
-import sys
 import json
-import time
-import re
-from typing import Optional
-try:
-    from openai import OpenAI
-except Exception:
-    OpenAI = None
-# Config
-RETRIES = 3
-RETRY_DELAY = 1.0
-DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
-MAX_TOKENS = 4096
-TEMPERATURE = 0.0
def read_any(path_or_file):
    """Return the full text content of a path or an open file-like object.

    Args:
        path_or_file: Either a filesystem path accepted by ``open`` or an
            object exposing ``read()``.

    Returns:
        The content as ``str``. Bytes returned by ``read()`` are decoded
        as UTF-8.

    Raises:
        OSError: If a path cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if not hasattr(path_or_file, "read"):
        with open(path_or_file, "r", encoding="utf-8") as fh:
            return fh.read()

    # Rewind so a stream that was already consumed is read from the start.
    # Pipes, sockets and some upload wrappers cannot seek (or have no seek
    # method at all); for those we simply read from the current position
    # instead of failing before any data is read.
    rewind = getattr(path_or_file, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except (OSError, ValueError):
            # io.UnsupportedOperation derives from both of these.
            pass

    content = path_or_file.read()
    if isinstance(content, (bytes, bytearray)):
        content = bytes(content).decode("utf-8")
    return content
def find_first_balanced_json(s: str) -> Optional[str]:
    """Return the first substring of *s* that parses as a JSON object.

    Every ``{`` in the text is tried, left to right, as the potential start
    of an object. The standard-library decoder does the brace/string/escape
    bookkeeping, so text surrounding the object (prose, Markdown fences) is
    ignored.

    Args:
        s: Arbitrary text that may contain a JSON object.

    Returns:
        The exact substring holding the first valid JSON object, or ``None``
        when *s* is empty or contains no valid object.
    """
    if not s:
        return None

    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at ``start`` and
            # reports where it ended, tolerating trailing text.
            _, end = decoder.raw_decode(s, start)
            return s[start:end]
        except (ValueError, RecursionError):
            # Not a valid object at this brace; try the next one.
            start = s.find("{", start + 1)
    return None
def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
    """Send one system+user exchange to the chat model and return its text.

    The reply text is looked up first as ``choices[0].message.content``,
    then as ``choices[0].text``; if neither attribute path is available the
    string form of the whole response is used. Bytes are decoded as UTF-8
    (with replacement) and the result is whitespace-stripped; a missing or
    empty reply yields ``""``.
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
    )

    # Accessors in priority order; the first one that does not raise wins.
    accessors = (
        lambda r: r.choices[0].message.content,
        lambda r: r.choices[0].text,
    )
    for accessor in accessors:
        try:
            text = accessor(completion)
            break
        except Exception:
            continue
    else:
        # Neither response shape matched; fall back to the raw repr.
        text = str(completion)

    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="replace")
    if not text:
        return ""
    return text.strip()
def create_enhanced_prompt(word_json, pdf_text):
    """Build the (system, user) prompt pair for the template-filling request.

    Args:
        word_json: The parsed JSON template (normally a dict) to be filled.
        pdf_text: Plain text extracted from the PDF.

    Returns:
        A ``(system_msg, user_prompt)`` tuple of strings. The user prompt
        embeds the template, a per-key summary of which fields are empty
        (at most 50 lines), and at most the first 100000 characters of
        *pdf_text*.
    """
    structure_analysis = []

    def analyze_structure(obj, path=""):
        # Only dicts are walked; lists are summarised by length, not entered.
        if not isinstance(obj, dict):
            return
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key
            if isinstance(value, dict):
                structure_analysis.append(f"  {current_path} (nested object)")
                analyze_structure(value, current_path)
            elif isinstance(value, list):
                structure_analysis.append(f"  {current_path} (list with {len(value)} items)")
            elif value is None or str(value).strip() == "":
                structure_analysis.append(f"  {current_path} (EMPTY - needs data)")
            else:
                structure_analysis.append(f"  {current_path} (has data: {str(value)[:50]}...)")

    analyze_structure(word_json)

    system_msg = """You are a precise JSON data extraction assistant.
CRITICAL RULES:
1. Output ONLY valid JSON - no markdown, no explanations, no extra text
2. Maintain the EXACT structure provided in the template
3. Only UPDATE fields that are empty or null - do not change existing data
4. Extract data accurately from the PDF text provided
5. If you cannot find data for a field, leave it as null or empty string
6. Ensure all nested objects and arrays maintain their structure"""

    # Computed outside the f-string: the limits used to be annotated with
    # "# ..." notes written *inside* the string literal, so those notes were
    # sent to the model as part of the prompt.
    structure_summary = "\n".join(structure_analysis[:50])  # first 50 lines only, for brevity
    pdf_excerpt = pdf_text[:100000]  # truncate very long text

    user_prompt = f"""TASK: Update this JSON template with data from the PDF text.
JSON TEMPLATE TO UPDATE:
{json.dumps(word_json, indent=2, ensure_ascii=False)}
STRUCTURE ANALYSIS:
{structure_summary}
PDF SOURCE TEXT:
{pdf_excerpt}
EXTRACTION GUIDELINES:
- For "Operator name (Legal entity)" or similar: Extract the company name
- For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)
- For "Auditor name": Extract auditor's name
- For "Attendance List": Extract names and positions, format as list
- For vehicle data: Extract registration numbers, maintenance info, etc.
- For management summaries: Extract compliance details and findings
CRITICAL: Return ONLY the updated JSON object. No other text whatsoever."""
    return system_msg, user_prompt
def _write_output(output_file, payload, *, raw=False):
    """Write *payload* to a path or writable file-like object.

    With ``raw=True`` the payload is a string written verbatim; otherwise it
    is serialised as indented, non-ASCII-escaped JSON.
    """
    def emit(handle):
        if raw:
            handle.write(payload)
        else:
            json.dump(payload, handle, indent=2, ensure_ascii=False)

    if hasattr(output_file, "write"):
        emit(output_file)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            emit(f)


def _save_raw_model_output(output_file, attempt, raw_text):
    """Best-effort dump of the raw model reply next to the output, for debugging."""
    if isinstance(output_file, (str, os.PathLike)):
        out_base = os.fspath(output_file)
    else:
        out_base = getattr(output_file, "name", "output")
    raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
    try:
        with open(raw_save_path, "w", encoding="utf-8") as f:
            f.write(raw_text)
    except OSError as save_error:
        # Debug artefact only; never let it abort the update.
        print(f"⚠️ Could not save raw model output to {raw_save_path}: {save_error}")


def _parse_model_json(client, model_name, raw_text):
    """Turn a raw model reply into a Python object, escalating as needed.

    Strategies, in order: parse the reply as-is; parse the first balanced
    JSON object found inside it; ask the model to repair the reply and parse
    that. Raises if the repair call fails or its reply is still not JSON.
    """
    try:
        parsed = json.loads(raw_text)
        print("✅ Model returned valid JSON directly.")
        return parsed
    except ValueError as parse_error:
        print(f"⚠️ Direct parsing failed: {parse_error}")

    candidate = find_first_balanced_json(raw_text)
    if candidate:
        try:
            parsed = json.loads(candidate)
            print("✅ Successfully extracted JSON substring from model output.")
            return parsed
        except ValueError as sub_parse_error:
            print(f"⚠️ Substring parsing also failed: {sub_parse_error}")

    print("🔧 Attempting JSON repair...")
    repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."
    repair_user = f"Fix this JSON:\n\n{raw_text}"
    repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
    try:
        parsed = json.loads(repair_raw)
    except ValueError:
        # The repair reply can itself be wrapped in prose or code fences.
        candidate = find_first_balanced_json(repair_raw)
        if not candidate:
            raise
        parsed = json.loads(candidate)
    print("✅ Repair pass succeeded.")
    return parsed


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with data from PDF text using an LLM.

    Args:
        word_json_file: Path or file-like object holding the JSON template.
        pdf_txt_file: Path or file-like object holding the PDF plain text.
        output_file: Path or writable file-like object receiving the result.

    Returns:
        The parsed, updated JSON object when the model produced usable
        output. ``None`` in every fallback case (invalid template, missing
        API key or SDK, all attempts failed); in those cases the original
        template is written to *output_file* instead.
    """
    # Load inputs
    word_json_text = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)
    try:
        word_json = json.loads(word_json_text)
    except Exception as e:
        print(f"⚠️ Input word_json is not valid JSON: {e}")
        print("Writing original to output and exiting.")
        _write_output(output_file, word_json_text, raw=True)
        return None

    # Check API key
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
        _write_output(output_file, word_json)
        return None
    if OpenAI is None:
        print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
        _write_output(output_file, word_json)
        return None

    # Create enhanced prompts
    system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)
    print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
    print(f"📊 PDF text has {len(pdf_txt)} characters")
    client = OpenAI(api_key=api_key)
    model_name = DEFAULT_MODEL

    for attempt in range(1, RETRIES + 1):
        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
        # Later attempts nudge the model towards validity over completeness.
        current_user_prompt = user_prompt
        if attempt == 2:
            current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
        elif attempt >= 3:
            current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."

        raw_text = None
        try:
            raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)
        except Exception as call_error:
            print(f"⚠️ Exception while calling model: {call_error}")

        if raw_text is not None:
            _save_raw_model_output(output_file, attempt, raw_text)
            try:
                parsed = _parse_model_json(client, model_name, raw_text)
                # Advisory check, applied no matter which strategy produced
                # the JSON; a mismatch is reported but does not block output.
                if validate_json_structure(parsed, word_json):
                    print("✅ JSON structure validation passed.")
                else:
                    print("⚠️ JSON structure differs from template, but proceeding...")
                _write_output(output_file, parsed)
                return parsed
            except Exception as attempt_error:
                # Retry boundary: report and move on to the next attempt.
                print(f"⚠️ Attempt {attempt} did not produce usable output: {attempt_error}")

        # Wait before next attempt
        if attempt < RETRIES:
            time.sleep(RETRY_DELAY)

    # All attempts failed
    print("❗ All LLM attempts failed. Writing original JSON to output.")
    try:
        _write_output(output_file, word_json)
        print("✅ Original JSON template written to output.")
    except Exception as e:
        print(f"⚠️ Failed to write original JSON: {e}")
    return None
def validate_json_structure(parsed_json, original_json):
    """Check that *parsed_json* keeps the shape of the *original_json* template.

    Every key present in a template dict must exist in the parsed result,
    recursively, and non-null template values must keep their type. Extra
    keys in the parsed result are allowed. List contents are not compared.
    A diagnostic line is printed for the first mismatch found.

    Args:
        parsed_json: The object produced from the model reply.
        original_json: The template the model was asked to fill.

    Returns:
        ``True`` when the structure matches, ``False`` otherwise (including
        when the comparison itself raises).
    """
    def compare_structure(parsed, original, path=""):
        # A null in the template is a placeholder the model is expected to
        # replace with real data, so any value (of any type) is acceptable.
        if original is None:
            return True
        if type(parsed) is not type(original):
            print(f"⚠️ Type mismatch at {path}: {type(parsed)} vs {type(original)}")
            return False
        if isinstance(original, dict):
            for key, template_value in original.items():
                if key not in parsed:
                    print(f"⚠️ Missing key at {path}.{key}")
                    return False
                if not compare_structure(parsed[key], template_value, f"{path}.{key}"):
                    return False
        return True

    try:
        return compare_structure(parsed_json, original_json)
    except Exception:
        # Validation is advisory; an unexpected failure counts as "no match".
        return False
 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
-        sys.exit(0)
-    try:
-        update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
-    except Exception as e:
-        print(f"Unexpected exception: {e}")
-        try:
-            with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
-                outf.write(inf.read())
-                print("Wrote original input JSON to output due to exception.")
-        except Exception:
-            pass
-        sys.exit(0)
-        # ADD THIS LINE:
-        with open(sys.argv[3], 'r') as f: print(f"\n📄 LLM_UPDATE OUTPUT:\n{f.read()}")
-    except Exception as e:
-        print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
-        try:
-            with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
-                outf.write(inf.read())
-                print("Wrote original input JSON to output due to exception.")
-        except Exception:
-            pass
-        sys.exit(0)

+from openai import OpenAI
 import json
+import os
def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """
    Fill a JSON template with data from PDF text via the OpenAI chat API.

    word_json_file: file-like object or file path (docx extraction JSON)
    pdf_txt_file: file-like object or file path (PDF plain text)
    output_file: file-like object (opened for writing) or file path

    Returns the parsed, updated JSON object on success. Returns None when
    the model reply is not valid JSON or the result cannot be written; a
    diagnostic is printed in both cases.

    Raises RuntimeError when OPENAI_API_KEY is not set.
    """
    # --- Load files ---
    def read_any(f):
        if hasattr(f, "read"):
            f.seek(0)
            content = f.read()
            if isinstance(content, bytes):
                content = content.decode("utf-8")
            return content
        else:
            with open(f, "r", encoding="utf-8") as fh:
                return fh.read()

    def strip_code_fence(text):
        # Chat models often wrap JSON in a Markdown ``` fence despite being
        # told not to; json.loads would reject the reply because of it.
        if not text.startswith("```"):
            return text
        body = text[3:]
        if body.endswith("```"):
            body = body[:-3]
        # The opening fence may carry a language tag ("```json") that ends
        # at the first newline; keep the line if it is already JSON.
        first_line, newline, rest = body.partition("\n")
        if newline and not first_line.strip().startswith(("{", "[")):
            body = rest
        return body.strip()

    word_json = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)
    # --- Build prompt ---
    user_prompt = f"""Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Update ALL nested sections properly (like "Operator Declaration" with its "Print Name" and "Position Title")
- Make sure to update both the main sections AND the flattened keys (like "Operator Declaration.Print Name")
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
- Pay special attention to updating operator names, auditor names, and all personal details consistently throughout all sections."""
    # --- Call OpenAI API ---
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON. Update ALL sections consistently with the same data."},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=4096,
        temperature=0
    )
    # message.content can be None (for example on a refusal), so guard
    # before calling .strip().
    raw_reply = (response.choices[0].message.content or "").strip()
    updated_json_str = strip_code_fence(raw_reply)
    # --- Try to parse as JSON ---
    try:
        parsed = json.loads(updated_json_str)
    except ValueError as e:
        print("⚠️ Model did not return valid JSON. Raw output below:\n")
        print(raw_reply)
        print("\n❌ Failed to parse updated JSON:", e)
        return None
    # --- Save ---
    # Kept separate from parsing so an I/O problem is not misreported as
    # "invalid JSON".
    try:
        if hasattr(output_file, "write"):
            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)
    except (OSError, TypeError, ValueError) as e:
        print("❌ Failed to write updated JSON:", e)
        return None
    print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
    return parsed
 if __name__ == "__main__":
+    import sys
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
+        exit(1)
+    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])