Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 22

Commit

89ec944

verified ·

1 Parent(s): f486b52

Update update_docx_with_pdf.py

Browse files

Files changed (1) hide show

update_docx_with_pdf.py +172 -184

update_docx_with_pdf.py CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-update_docx_with_pdf.py
 """
 import os
@@ -10,7 +10,6 @@ import time
 import re
 from typing import Optional
-# Try to import OpenAI client in the style used previously
 try:
     from openai import OpenAI
 except Exception:
@@ -18,12 +17,11 @@ except Exception:
 # Config
 RETRIES = 3
-RETRY_DELAY = 1.0  # seconds between retries
 DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
 MAX_TOKENS = 4096
 TEMPERATURE = 0.0
 def read_any(path_or_file):
     """Read content from file path or file-like object."""
     if hasattr(path_or_file, "read"):
@@ -36,26 +34,22 @@ def read_any(path_or_file):
         with open(path_or_file, "r", encoding="utf-8") as fh:
             return fh.read()
 def find_first_balanced_json(s: str) -> Optional[str]:
-    """
-    Scan the input string and return the first substring that is a balanced JSON object
-    starting with '{' and ending with the matching '}' that parses successfully.
-    """
     if not s:
         return None
-    # Find all possible '{' positions
     for m in re.finditer(r"\{", s):
         start = m.start()
         depth = 0
         in_str = False
         escape = False
         for i in range(start, len(s)):
             ch = s[i]
             if ch == '"' and not escape:
                 in_str = not in_str
             if in_str:
-                # handle escape toggling but don't treat braces inside strings
                 if ch == "\\" and not escape:
                     escape = True
                 else:
@@ -71,41 +65,11 @@ def find_first_balanced_json(s: str) -> Optional[str]:
                         json.loads(candidate)
                         return candidate
                     except Exception:
-                        # candidate not valid JSON (maybe trailing commas etc.) -> continue searching
                         break
     return None
-def extract_json_substring(s: str) -> Optional[str]:
-    """
-    Wrapper for find_first_balanced_json kept for compatibility with existing naming.
-    """
-    return find_first_balanced_json(s)
-def try_parse_json_str(s: str):
-    """Attempt to parse JSON string, raising the same exceptions as json.loads."""
-    return json.loads(s)
-def safe_write(path: str, data):
-    with open(path, "w", encoding="utf-8") as f:
-        json.dump(data, f, indent=2, ensure_ascii=False)
-def save_raw(path: str, text: str):
-    try:
-        with open(path, "w", encoding="utf-8") as f:
-            f.write(text)
-    except Exception:
-        # best-effort; don't crash
-        pass
 def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
-    """
-    Call the model and return raw text content. Support variations in SDK response shape.
-    """
     resp = client.chat.completions.create(
         model=model_name,
         messages=[
@@ -116,32 +80,83 @@ def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: s
         temperature=TEMPERATURE,
     )
-    # Try to extract raw text in a few shapes
-    raw_text = ""
     try:
-        # New-style: resp.choices[0].message.content
         raw_text = resp.choices[0].message.content
     except Exception:
         try:
-            # Older shape: resp.choices[0].text
             raw_text = resp.choices[0].text
         except Exception:
             raw_text = str(resp)
     if isinstance(raw_text, bytes):
         raw_text = raw_text.decode("utf-8", errors="replace")
     return (raw_text or "").strip()
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
-    # --- load inputs ---
     word_json_text = read_any(word_json_file)
     pdf_txt = read_any(pdf_txt_file)
     try:
         word_json = json.loads(word_json_text)
-    except Exception:
-        # If the input word_json isn't valid JSON, abort early but write original to output
-        print("⚠️ Input word_json is not valid JSON. Writing raw input to output and exiting.")
         if hasattr(output_file, "write"):
             output_file.write(word_json_text)
             output_file.flush()
@@ -150,194 +165,168 @@ def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
                 f.write(word_json_text)
         return
-    # --- build base prompts ---
-    system_msg = (
-        "You are a strict JSON extraction assistant. Only output valid JSON with no surrounding text, "
-        "no markdown, no explanation. The JSON must be parseable by json.loads()."
-    )
-    user_prompt_template = (
-        "Here is a JSON template that must be updated (DO NOT change structure or keys):\n\n"
-        "{word_json}\n\n"
-        "Here is the extracted text from a PDF (use this to fill/update fields):\n\n"
-        "{pdf_text}\n\n"
-        "Instructions:\n"
-        "- ONLY update fields that already exist in the JSON template using evidence from the PDF text.\n"
-        "- DO NOT add new top-level keys or alter the structure.\n"
-        "- If you cannot find a value for an existing field, leave it unchanged.\n"
-        "- OUTPUT EXACTLY one JSON object and NOTHING else.\n"
-    )
-    user_prompt = user_prompt_template.format(
-        word_json=json.dumps(word_json, ensure_ascii=False),
-        pdf_text=(pdf_txt or "")[:120000],  # cap size to avoid truncation/hitting token limits
-    )
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
-        print("⚠️ OPENAI_API_KEY not found in environment variables! Writing original JSON to output and exiting.")
         if hasattr(output_file, "write"):
             json.dump(word_json, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
         else:
-            safe_write(output_file, word_json)
         return
     if OpenAI is None:
-        print("⚠️ OpenAI SDK not available (could not import OpenAI). Writing original JSON to output and exiting.")
         if hasattr(output_file, "write"):
             json.dump(word_json, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
         else:
-            safe_write(output_file, word_json)
         return
-    # Create client (constructor signature can be adapted if your OpenAI wrapper differs)
-    client = OpenAI(api_key=api_key)
     model_name = DEFAULT_MODEL
-    raw_outputs = []
-    parsed = None
-    # Try multiple attempts (progressive instructions)
     for attempt in range(1, RETRIES + 1):
-        variant_user_prompt = user_prompt
-        # On later attempts, append stricter instruction
         if attempt == 2:
-            variant_user_prompt += "\nIMPORTANT: Return ONLY valid JSON. If you cannot find new values, keep existing template values."
         elif attempt >= 3:
-            variant_user_prompt += "\nLAST ATTEMPT: Output exactly one JSON object and nothing else. If uncertain, keep fields unchanged."
-        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
         try:
-            raw_text = call_model_and_get_raw(client, model_name, system_msg, variant_user_prompt)
-            raw_outputs.append(raw_text)
-            # Save raw model output for diagnostics
             out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
             raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
-            save_raw(raw_save_path, raw_text)
-            # Try parse as JSON directly
             try:
-                parsed = try_parse_json_str(raw_text)
-                print("✅ Model returned valid JSON.")
-                # write and return
                 if hasattr(output_file, "write"):
                     json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                     output_file.flush()
                 else:
-                    safe_write(output_file, parsed)
                 return parsed
-            except Exception:
-                # try extracting a balanced JSON substring
-                candidate = extract_json_substring(raw_text)
                 if candidate:
                     try:
-                        parsed = try_parse_json_str(candidate)
-                        print("✅ Successfully extracted and parsed JSON substring from model output.")
                         if hasattr(output_file, "write"):
                             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                             output_file.flush()
                         else:
-                            safe_write(output_file, parsed)
                         return parsed
-                    except Exception:
-                        print("⚠️ Extracted substring was not valid JSON after parsing attempt.")
-                else:
-                    print("⚠️ Could not find a balanced JSON substring in the model output.")
-            # If we get here, the model output is not parseable - attempt a repair pass once per attempt
-            print("🔧 Attempting repair pass: sending model its raw output and asking for VALID JSON only...")
-            repair_system = "You are a JSON repair assistant. The previous model output (possibly with commentary) is provided. Extract and return a single VALID JSON object and NOTHING else. If you cannot produce valid JSON, return {}."
-            # Provide the model its own raw output for repair
-            repair_user = f"Raw model output:\n\n{raw_text}\n\nReturn only valid JSON object."
-            repair_raw = ""
-            try:
-                repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
-                # Save repair output
-                repair_save_path = f"{out_base}.model_raw_attempt{attempt}_repair.txt"
-                save_raw(repair_save_path, repair_raw)
-                # Try parse repair output
                 try:
-                    parsed = try_parse_json_str(repair_raw)
-                    print("✅ Repair pass succeeded with valid JSON.")
                     if hasattr(output_file, "write"):
-                        json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                         output_file.flush()
                     else:
-                        safe_write(output_file, parsed)
-                    return parsed
-                except Exception:
-                    candidate = extract_json_substring(repair_raw)
-                    if candidate:
-                        try:
-                            parsed = try_parse_json_str(candidate)
-                            print("✅ Successfully extracted JSON substring from repair output.")
-                            if hasattr(output_file, "write"):
-                                json.dump(parsed, output_file, indent=2, ensure_ascii=False)
-                                output_file.flush()
-                            else:
-                                safe_write(output_file, parsed)
-                            return parsed
-                        except Exception:
-                            print("⚠️ Repair output contained JSON-like substring but could not be parsed.")
-                    else:
-                        print("⚠️ Repair pass did not produce a parseable JSON substring.")
-            except Exception as rep_err:
-                print(f"⚠️ Exception during repair pass: {rep_err}")
-        except Exception as call_err:
-            print(f"⚠️ Exception while calling model: {call_err}")
         # Wait before next attempt
-        time.sleep(RETRY_DELAY)
-    # If we've reached here, all attempts failed
-    print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
-    try:
-        out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
-        raw_path = f"{out_base}.model_raw.txt"
-        with open(raw_path, "w", encoding="utf-8") as rf:
-            rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
-            for i, out in enumerate(raw_outputs, start=1):
-                rf.write(f"--- ATTEMPT {i} ---\n")
-                rf.write((out or "") + "\n\n")
-            rf.write("\n=== END ===\n\n")
-            rf.write("\n\n=== PDF TEXT USED (truncated) ===\n\n")
-            rf.write((pdf_txt or "")[:20000])
-        print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
-    except Exception as e:
-        print(f"⚠️ Failed to save raw model outputs: {e}")
-    # Also create a salvage bundle for manual inspection
-    try:
-        salvage_path = f"{out_base}.salvage.json"
-        salvage_bundle = {
-            "original_word_json": word_json,
-            "pdf_text_sample": (pdf_txt or "")[:2000],
-            "raw_outputs_path": raw_path,
-        }
-        with open(salvage_path, "w", encoding="utf-8") as sf:
-            json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
-        print(f"ℹ️ Salvage bundle saved to: {salvage_path}")
-    except Exception as e:
-        print(f"⚠️ Failed to save salvage bundle: {e}")
-    # Write original JSON to output to avoid failing the calling process
     try:
         if hasattr(output_file, "write"):
             json.dump(word_json, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
         else:
-            safe_write(output_file, word_json)
-        print("✅ Original JSON template written to output (no updates applied).")
     except Exception as e:
-        print(f"⚠️ Failed to write original JSON to output: {e}")
     return None
 if __name__ == "__main__":
     if len(sys.argv) != 4:
@@ -347,8 +336,7 @@ if __name__ == "__main__":
     try:
         update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
     except Exception as e:
-        # Top-level catch to avoid crashing the pipeline; write original input as fallback.
-        print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
         try:
             with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                 outf.write(inf.read())

 #!/usr/bin/env python3
 """
+Enhanced update_docx_with_pdf.py with better JSON structure handling
 """
 import os
 import re
 from typing import Optional
 try:
     from openai import OpenAI
 except Exception:
 # Config
 RETRIES = 3
+RETRY_DELAY = 1.0
 DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
 MAX_TOKENS = 4096
 TEMPERATURE = 0.0
 def read_any(path_or_file):
     """Read content from file path or file-like object."""
     if hasattr(path_or_file, "read"):
         with open(path_or_file, "r", encoding="utf-8") as fh:
             return fh.read()
 def find_first_balanced_json(s: str) -> Optional[str]:
+    """Find the first valid JSON object in the string"""
     if not s:
         return None
     for m in re.finditer(r"\{", s):
         start = m.start()
         depth = 0
         in_str = False
         escape = False
         for i in range(start, len(s)):
             ch = s[i]
             if ch == '"' and not escape:
                 in_str = not in_str
             if in_str:
                 if ch == "\\" and not escape:
                     escape = True
                 else:
                         json.loads(candidate)
                         return candidate
                     except Exception:
                         break
     return None
 def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
+    """Call the model and return raw text content"""
     resp = client.chat.completions.create(
         model=model_name,
         messages=[
         temperature=TEMPERATURE,
     )
     try:
         raw_text = resp.choices[0].message.content
     except Exception:
         try:
             raw_text = resp.choices[0].text
         except Exception:
             raw_text = str(resp)
     if isinstance(raw_text, bytes):
         raw_text = raw_text.decode("utf-8", errors="replace")
     return (raw_text or "").strip()
def create_enhanced_prompt(word_json, pdf_text):
    """Build the (system, user) prompt pair for the PDF-to-JSON extraction call.

    The user prompt embeds the pretty-printed JSON template, a short outline
    of which template fields are empty versus already populated, and the PDF
    text.

    Args:
        word_json: Parsed JSON template. Only dict nesting is walked for the
            outline; lists are summarised by length and not descended into.
        pdf_text: Text extracted from the PDF. ``None`` is treated as empty.

    Returns:
        A ``(system_msg, user_prompt)`` tuple of strings.
    """
    max_outline_lines = 50   # keep the outline short so it does not crowd the prompt
    max_pdf_chars = 100000   # cap the PDF text embedded in the prompt
    preview_chars = 50       # how much of an existing value to show in the outline

    structure_analysis = []

    def analyze_structure(obj, path=""):
        # Only dicts carry named fields; anything else has nothing to outline.
        if not isinstance(obj, dict):
            return
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key
            if isinstance(value, dict):
                structure_analysis.append(f"  {current_path} (nested object)")
                analyze_structure(value, current_path)
            elif isinstance(value, list):
                structure_analysis.append(f"  {current_path} (list with {len(value)} items)")
            elif value is None or str(value).strip() == "":
                structure_analysis.append(f"  {current_path} (EMPTY - needs data)")
            else:
                text = str(value)
                # Only mark the preview with an ellipsis when it was really cut.
                preview = text if len(text) <= preview_chars else text[:preview_chars] + "..."
                structure_analysis.append(f"  {current_path} (has data: {preview})")

    analyze_structure(word_json)

    system_msg = """You are a precise JSON data extraction assistant.
CRITICAL RULES:
1. Output ONLY valid JSON - no markdown, no explanations, no extra text
2. Maintain the EXACT structure provided in the template
3. Only UPDATE fields that are empty or null - do not change existing data
4. Extract data accurately from the PDF text provided
5. If you cannot find data for a field, leave it as null or empty string
6. Ensure all nested objects and arrays maintain their structure"""

    # Truncation happens here, outside the template: a "# ..." note written
    # inside the triple-quoted f-string is literal text and would be sent to
    # the model as part of the prompt.
    template_text = json.dumps(word_json, indent=2, ensure_ascii=False)
    outline_text = "\n".join(structure_analysis[:max_outline_lines])
    pdf_excerpt = (pdf_text or "")[:max_pdf_chars]

    user_prompt = f"""TASK: Update this JSON template with data from the PDF text.
JSON TEMPLATE TO UPDATE:
{template_text}
STRUCTURE ANALYSIS:
{outline_text}
PDF SOURCE TEXT:
{pdf_excerpt}
EXTRACTION GUIDELINES:
- For "Operator name (Legal entity)" or similar: Extract the company name
- For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)
- For "Auditor name": Extract auditor's name
- For "Attendance List": Extract names and positions, format as list
- For vehicle data: Extract registration numbers, maintenance info, etc.
- For management summaries: Extract compliance details and findings
CRITICAL: Return ONLY the updated JSON object. No other text whatsoever."""
    return system_msg, user_prompt
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
+    # Load inputs
     word_json_text = read_any(word_json_file)
     pdf_txt = read_any(pdf_txt_file)
     try:
         word_json = json.loads(word_json_text)
+    except Exception as e:
+        print(f"⚠️ Input word_json is not valid JSON: {e}")
+        print("Writing original to output and exiting.")
         if hasattr(output_file, "write"):
             output_file.write(word_json_text)
             output_file.flush()
                 f.write(word_json_text)
         return
+    # Check API key
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
+        print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
         if hasattr(output_file, "write"):
             json.dump(word_json, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
         else:
+            with open(output_file, "w", encoding="utf-8") as f:
+                json.dump(word_json, f, indent=2, ensure_ascii=False)
         return
     if OpenAI is None:
+        print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
         if hasattr(output_file, "write"):
             json.dump(word_json, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
         else:
+            with open(output_file, "w", encoding="utf-8") as f:
+                json.dump(word_json, f, indent=2, ensure_ascii=False)
         return
+    # Create enhanced prompts
+    system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)
+    print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
+    print(f"📊 PDF text has {len(pdf_txt)} characters")
+    client = OpenAI(api_key=api_key)
     model_name = DEFAULT_MODEL
+    # Try multiple attempts with different strategies
     for attempt in range(1, RETRIES + 1):
+        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")
+        # Modify prompt for different attempts
+        current_user_prompt = user_prompt
         if attempt == 2:
+            current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
         elif attempt >= 3:
+            current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."
         try:
+            raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)
+            # Save raw output for debugging
             out_base = output_file if isinstance(output_file, str) else getattr(output_file, "name", "output")
             raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
+            try:
+                with open(raw_save_path, "w", encoding="utf-8") as f:
+                    f.write(raw_text)
+            except:
+                pass
+            # Try to parse directly
             try:
+                parsed = json.loads(raw_text)
+                print("✅ Model returned valid JSON directly.")
+                # Validate structure matches original
+                if validate_json_structure(parsed, word_json):
+                    print("✅ JSON structure validation passed.")
+                else:
+                    print("⚠️ JSON structure differs from template, but proceeding...")
+                # Write output
                 if hasattr(output_file, "write"):
                     json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                     output_file.flush()
                 else:
+                    with open(output_file, "w", encoding="utf-8") as f:
+                        json.dump(parsed, f, indent=2, ensure_ascii=False)
                 return parsed
+            except Exception as parse_error:
+                print(f"⚠️ Direct parsing failed: {parse_error}")
+                # Try to extract JSON substring
+                candidate = find_first_balanced_json(raw_text)
                 if candidate:
                     try:
+                        parsed = json.loads(candidate)
+                        print("✅ Successfully extracted JSON substring from model output.")
+                        # Write output
                         if hasattr(output_file, "write"):
                             json.dump(parsed, output_file, indent=2, ensure_ascii=False)
                             output_file.flush()
                         else:
+                            with open(output_file, "w", encoding="utf-8") as f:
+                                json.dump(parsed, f, indent=2, ensure_ascii=False)
                         return parsed
+                    except Exception as sub_parse_error:
+                        print(f"⚠️ Substring parsing also failed: {sub_parse_error}")
+                # Try repair pass
+                print("🔧 Attempting JSON repair...")
+                repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."
+                repair_user = f"Fix this JSON:\n\n{raw_text}"
                 try:
+                    repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
+                    repair_parsed = json.loads(repair_raw)
+                    print("✅ Repair pass succeeded.")
+                    # Write output
                     if hasattr(output_file, "write"):
+                        json.dump(repair_parsed, output_file, indent=2, ensure_ascii=False)
                         output_file.flush()
                     else:
+                        with open(output_file, "w", encoding="utf-8") as f:
+                            json.dump(repair_parsed, f, indent=2, ensure_ascii=False)
+                    return repair_parsed
+                except Exception as repair_error:
+                    print(f"⚠️ Repair pass failed: {repair_error}")
+        except Exception as call_error:
+            print(f"⚠️ Exception while calling model: {call_error}")
         # Wait before next attempt
+        if attempt < RETRIES:
+            time.sleep(RETRY_DELAY)
+    # All attempts failed
+    print("❗ All LLM attempts failed. Writing original JSON to output.")
     try:
         if hasattr(output_file, "write"):
             json.dump(word_json, output_file, indent=2, ensure_ascii=False)
             output_file.flush()
         else:
+            with open(output_file, "w", encoding="utf-8") as f:
+                json.dump(word_json, f, indent=2, ensure_ascii=False)
+        print("✅ Original JSON template written to output.")
     except Exception as e:
+        print(f"⚠️ Failed to write original JSON: {e}")
     return None
def validate_json_structure(parsed_json, original_json):
    """Check that ``parsed_json`` keeps the container layout of ``original_json``.

    Rules applied at each position of the template:
      * a dict in the template requires a dict containing at least the same
        keys (extra keys are tolerated);
      * a list in the template requires a list; items are compared pairwise
        for as many positions as both lists have;
      * an empty placeholder (``None`` or a blank string) accepts any value,
        because filling such fields is the purpose of the update;
      * any other scalar requires a value of the same type, with ``int`` and
        ``float`` treated as interchangeable.

    A warning is printed for the first problem found.

    Args:
        parsed_json: Value decoded from the model output.
        original_json: The template value it is compared against.

    Returns:
        ``True`` when the layout matches, ``False`` otherwise (including when
        the data is nested too deeply to walk).
    """

    def is_placeholder(value):
        return value is None or (isinstance(value, str) and not value.strip())

    def same_scalar_kind(parsed, original):
        if type(parsed) is type(original):
            return True
        # JSON does not distinguish 1 from 1.0; bool is deliberately excluded.
        numeric = (int, float)
        return type(parsed) in numeric and type(original) in numeric

    def report_type_mismatch(parsed, original, path):
        print(f"⚠️ Type mismatch at {path or '<root>'}: {type(parsed)} vs {type(original)}")
        return False

    def compare_structure(parsed, original, path=""):
        if isinstance(original, dict):
            if not isinstance(parsed, dict):
                return report_type_mismatch(parsed, original, path)
            for key, expected in original.items():
                child_path = f"{path}.{key}" if path else str(key)
                if key not in parsed:
                    print(f"⚠️ Missing key at {child_path}")
                    return False
                if not compare_structure(parsed[key], expected, child_path):
                    return False
            return True

        if isinstance(original, list):
            if not isinstance(parsed, list):
                return report_type_mismatch(parsed, original, path)
            # The model may legitimately add or drop rows, so only positions
            # present on both sides are compared.
            for index, (got, expected) in enumerate(zip(parsed, original)):
                if not compare_structure(got, expected, f"{path}[{index}]"):
                    return False
            return True

        if is_placeholder(original):
            return True

        if not same_scalar_kind(parsed, original):
            return report_type_mismatch(parsed, original, path)
        return True

    try:
        return compare_structure(parsed_json, original_json)
    except RecursionError:
        # Pathologically deep nesting: treat as "could not validate".
        return False
 if __name__ == "__main__":
     if len(sys.argv) != 4:
     try:
         update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
     except Exception as e:
+        print(f"Unexpected exception: {e}")
         try:
             with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                 outf.write(inf.read())