Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 13,490 Bytes

#!/usr/bin/env python3
"""
Enhanced update_docx_with_pdf.py with better JSON structure handling
"""

import os
import sys
import json
import time
import re
from typing import Optional

try:
    from openai import OpenAI
except Exception:
    OpenAI = None

# Config
# Number of model attempts update_json_with_pdf makes before falling back
# to writing the original JSON.
RETRIES = 3
# Seconds passed to time.sleep between two consecutive attempts.
RETRY_DELAY = 1.0
# Model name sent with each request; the OPENAI_MODEL environment variable
# overrides the "gpt-4o" default.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
# Passed as max_tokens on every chat completion request.
MAX_TOKENS = 4096
# Passed as temperature on every chat completion request.
TEMPERATURE = 0.0

def read_any(path_or_file):
    """Return the full content of a file path or a file-like object.

    Args:
        path_or_file: Either a filesystem path (opened as UTF-8 text) or any
            object exposing ``read()``.

    Returns:
        The content as returned by ``read()``; ``bytes``/``bytearray``
        payloads are decoded as UTF-8 first.

    Raises:
        OSError: If a path cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if not hasattr(path_or_file, "read"):
        with open(path_or_file, "r", encoding="utf-8") as fh:
            return fh.read()

    # Rewind so a stream that was already consumed is read from the start.
    # Streams that cannot seek (pipes, stdin, minimal read-only wrappers)
    # are read from their current position instead of failing here.
    try:
        path_or_file.seek(0)
    except (AttributeError, OSError, ValueError):
        pass

    content = path_or_file.read()
    if isinstance(content, (bytes, bytearray)):
        content = content.decode("utf-8")
    return content

def find_first_balanced_json(s: str) -> Optional[str]:
    """Return the first substring of ``s`` that is a valid JSON object.

    Every ``{`` in the text is tried, left to right, as the start of an
    object. The standard-library decoder does the brace/string/escape
    bookkeeping, so braces inside string literals are handled correctly and
    a failed start position is abandoned as soon as the decoder rejects it.

    Args:
        s: Arbitrary text, e.g. a model reply that wraps JSON in prose or
            markdown fences.

    Returns:
        The exact substring holding the first parseable JSON object, or
        ``None`` when ``s`` is empty or contains no such object.
    """
    if not s:
        return None

    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at ``start`` and
            # reports where it ends, ignoring whatever follows it.
            _, end = decoder.raw_decode(s, start)
        except (ValueError, RecursionError):
            start = s.find("{", start + 1)
            continue
        return s[start:end]
    return None

def call_model_and_get_raw(client, model_name: str, system_msg: str, user_msg: str):
    """Send one system+user exchange to the chat API and return the reply text.

    The reply is read from ``choices[0].message.content``; if that access
    raises, ``choices[0].text`` is tried, and if that raises as well the
    string form of the whole response is used. The result is stripped of
    surrounding whitespace; a missing/empty reply yields ``""``.
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    response = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
    )

    def _chat_content():
        return response.choices[0].message.content

    def _completion_text():
        return response.choices[0].text

    # Try each accessor in order; the for/else falls back to str(response)
    # only when every accessor raised.
    for accessor in (_chat_content, _completion_text):
        try:
            reply = accessor()
        except Exception:
            continue
        break
    else:
        reply = str(response)

    if isinstance(reply, bytes):
        reply = reply.decode("utf-8", errors="replace")
    if not reply:
        reply = ""
    return reply.strip()

def create_enhanced_prompt(word_json, pdf_text):
    """Build the system and user messages for the extraction request.

    The user message embeds the JSON template, a per-field summary of which
    template entries are empty or already populated, and the PDF text.

    Args:
        word_json: Parsed JSON template. Only dict entries are summarised;
            lists are reported by length and not descended into.
        pdf_text: Text extracted from the PDF; only the first 100000
            characters are included in the prompt.

    Returns:
        A ``(system_msg, user_prompt)`` tuple of strings.
    """
    # Caps that keep the prompt bounded for large templates / documents.
    max_analysis_lines = 50
    max_pdf_chars = 100000

    structure_analysis = []

    def analyze_structure(obj, path=""):
        if not isinstance(obj, dict):
            return
        for key, value in obj.items():
            current_path = f"{path}.{key}" if path else key
            if isinstance(value, dict):
                structure_analysis.append(f"  {current_path} (nested object)")
                analyze_structure(value, current_path)
            elif isinstance(value, list):
                structure_analysis.append(f"  {current_path} (list with {len(value)} items)")
            elif value is None or str(value).strip() == "":
                structure_analysis.append(f"  {current_path} (EMPTY - needs data)")
            else:
                structure_analysis.append(f"  {current_path} (has data: {str(value)[:50]}...)")

    analyze_structure(word_json)

    system_msg = (
        "You are a precise JSON data extraction assistant. \n"
        "\n"
        "CRITICAL RULES:\n"
        "1. Output ONLY valid JSON - no markdown, no explanations, no extra text\n"
        "2. Maintain the EXACT structure provided in the template\n"
        "3. Only UPDATE fields that are empty or null - do not change existing data\n"
        "4. Extract data accurately from the PDF text provided\n"
        "5. If you cannot find data for a field, leave it as null or empty string\n"
        "6. Ensure all nested objects and arrays maintain their structure"
    )

    # The interpolated pieces are computed up front: anything written after a
    # placeholder inside the f-string (including a '#' remark) is literal
    # prompt text that would be sent to the model.
    template_text = json.dumps(word_json, indent=2, ensure_ascii=False)
    analysis_text = "\n".join(structure_analysis[:max_analysis_lines])
    pdf_excerpt = pdf_text[:max_pdf_chars]

    user_prompt = f"""TASK: Update this JSON template with data from the PDF text.

JSON TEMPLATE TO UPDATE:
{template_text}

STRUCTURE ANALYSIS:
{analysis_text}

PDF SOURCE TEXT:
{pdf_excerpt}

EXTRACTION GUIDELINES:
- For "Operator name (Legal entity)" or similar: Extract the company name
- For "Date of Audit": Look for audit dates (format: DD Month YYYY or DD/MM/YYYY)
- For "Auditor name": Extract auditor's name
- For "Attendance List": Extract names and positions, format as list
- For vehicle data: Extract registration numbers, maintenance info, etc.
- For management summaries: Extract compliance details and findings

CRITICAL: Return ONLY the updated JSON object. No other text whatsoever."""

    return system_msg, user_prompt

def _write_output(output_file, payload, as_json=True):
    """Write ``payload`` to ``output_file`` (a path or a writable stream).

    With ``as_json`` true the payload is serialised via ``json.dump``;
    otherwise it is written verbatim as text.
    """
    def _emit(handle):
        if as_json:
            json.dump(payload, handle, indent=2, ensure_ascii=False)
        else:
            handle.write(payload)

    if hasattr(output_file, "write"):
        _emit(output_file)
        output_file.flush()
    else:
        with open(output_file, "w", encoding="utf-8") as fh:
            _emit(fh)


def _save_raw_attempt(output_file, attempt, raw_text):
    """Best-effort dump of the raw model reply next to the output, for debugging."""
    if isinstance(output_file, (str, os.PathLike)):
        out_base = os.fspath(output_file)
    else:
        out_base = getattr(output_file, "name", "output")
    raw_save_path = f"{out_base}.model_raw_attempt{attempt}.txt"
    try:
        with open(raw_save_path, "w", encoding="utf-8") as fh:
            fh.write(raw_text)
    except (OSError, ValueError) as save_error:
        # The dump is a debugging aid only; never let it fail the attempt.
        print(f"⚠️ Could not save raw model output to {raw_save_path}: {save_error}")


def _finalize_result(parsed, word_json, output_file):
    """Report whether ``parsed`` keeps the template's shape, write it, return it."""
    if validate_json_structure(parsed, word_json):
        print("✅ JSON structure validation passed.")
    else:
        print("⚠️ JSON structure differs from template, but proceeding...")
    _write_output(output_file, parsed)
    return parsed


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template with data extracted from PDF text via the LLM.

    The template and PDF text are read, a prompt is built, and the model is
    called up to ``RETRIES`` times. Each reply is parsed directly, then by
    extracting the first embedded JSON object, then through a model "repair"
    pass. The first successfully parsed value is written to ``output_file``.
    If the template is not valid JSON, the API key or SDK is missing, or
    every attempt fails, the original template is written instead.

    Side effect: the raw reply of each attempt is saved (best effort) to
    ``<output>.model_raw_attempt<N>.txt``.

    Args:
        word_json_file: Path or readable stream holding the JSON template.
        pdf_txt_file: Path or readable stream holding the PDF text.
        output_file: Path or writable stream receiving the result.

    Returns:
        The parsed JSON value produced by the model, or ``None`` when the
        original template was written instead.
    """
    # Load inputs
    word_json_text = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    try:
        word_json = json.loads(word_json_text)
    except (ValueError, RecursionError) as e:
        print(f"⚠️ Input word_json is not valid JSON: {e}")
        print("Writing original to output and exiting.")
        _write_output(output_file, word_json_text, as_json=False)
        return None

    # Without credentials or the SDK there is nothing to call: pass the
    # template through unchanged.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("⚠️ OPENAI_API_KEY not found! Writing original JSON to output.")
        _write_output(output_file, word_json)
        return None

    if OpenAI is None:
        print("⚠️ OpenAI SDK not available. Writing original JSON to output.")
        _write_output(output_file, word_json)
        return None

    system_msg, user_prompt = create_enhanced_prompt(word_json, pdf_txt)

    print(f"📊 Original JSON has {len(json.dumps(word_json))} characters")
    print(f"📊 PDF text has {len(pdf_txt)} characters")

    client = OpenAI(api_key=api_key)
    model_name = DEFAULT_MODEL
    repair_system = "You are a JSON repair specialist. Fix the JSON below to be valid. Return ONLY valid JSON, nothing else."

    for attempt in range(1, RETRIES + 1):
        print(f"🛰️ Calling LLM (attempt {attempt}/{RETRIES}) with model {model_name}...")

        # Later attempts nudge the model towards validity over completeness.
        current_user_prompt = user_prompt
        if attempt == 2:
            current_user_prompt += "\n\nIMPORTANT: Focus on extracting the most obvious data first. Ensure valid JSON format."
        elif attempt >= 3:
            current_user_prompt += "\n\nFINAL ATTEMPT: Return a valid JSON object even if some fields remain empty. Prioritize JSON validity over completeness."

        try:
            raw_text = call_model_and_get_raw(client, model_name, system_msg, current_user_prompt)
            _save_raw_attempt(output_file, attempt, raw_text)

            # 1) The reply is already pure JSON.
            try:
                parsed = json.loads(raw_text)
            except (ValueError, RecursionError) as parse_error:
                print(f"⚠️ Direct parsing failed: {parse_error}")
            else:
                print("✅ Model returned valid JSON directly.")
                return _finalize_result(parsed, word_json, output_file)

            # 2) The reply wraps JSON in prose or markdown fences.
            candidate = find_first_balanced_json(raw_text)
            if candidate:
                parsed = json.loads(candidate)
                print("✅ Successfully extracted JSON substring from model output.")
                return _finalize_result(parsed, word_json, output_file)

            # 3) Ask the model to repair its own output; the repaired reply
            #    gets the same direct-then-embedded parsing as the first one.
            print("🔧 Attempting JSON repair...")
            repair_user = f"Fix this JSON:\n\n{raw_text}"
            try:
                repair_raw = call_model_and_get_raw(client, model_name, repair_system, repair_user)
                try:
                    repair_parsed = json.loads(repair_raw)
                except ValueError:
                    repair_candidate = find_first_balanced_json(repair_raw)
                    if repair_candidate is None:
                        raise
                    repair_parsed = json.loads(repair_candidate)
            except Exception as repair_error:
                print(f"⚠️ Repair pass failed: {repair_error}")
            else:
                print("✅ Repair pass succeeded.")
                return _finalize_result(repair_parsed, word_json, output_file)

        except Exception as call_error:
            # Boundary around one whole attempt: SDK/network errors and output
            # write failures all lead to the next attempt.
            print(f"⚠️ Exception while calling model: {call_error}")

        # Wait before next attempt
        if attempt < RETRIES:
            time.sleep(RETRY_DELAY)

    # All attempts failed
    print("❗ All LLM attempts failed. Writing original JSON to output.")

    try:
        _write_output(output_file, word_json)
        print("✅ Original JSON template written to output.")
    except (OSError, ValueError) as e:
        print(f"⚠️ Failed to write original JSON: {e}")

    return None

def validate_json_structure(parsed_json, original_json):
    """Check that ``parsed_json`` keeps the container shape of the template.

    Rules, applied recursively from the root:
      * where the template has a dict, the parsed value must be a dict that
        contains every template key (extra keys are allowed);
      * where the template has a list, the parsed value must be a list
        (length and items are not compared, since extraction adds entries);
      * where the template has ``None`` or a blank string, any value is
        accepted, because filling those fields is the point of the update;
      * where the template has any other scalar, the parsed value must not
        be a dict or list.

    A diagnostic line is printed for the first violation found.

    Args:
        parsed_json: JSON value produced by the model.
        original_json: JSON template the model was asked to fill.

    Returns:
        ``True`` when the shape is preserved, ``False`` otherwise (including
        when the values are nested too deeply to compare).
    """
    def report_mismatch(parsed, original, path):
        where = path or "<root>"
        print(f"⚠️ Type mismatch at {where}: {type(parsed)} vs {type(original)}")
        return False

    def compare_structure(parsed, original, path=""):
        if isinstance(original, dict):
            if not isinstance(parsed, dict):
                return report_mismatch(parsed, original, path)
            for key, template_value in original.items():
                child_path = f"{path}.{key}" if path else str(key)
                if key not in parsed:
                    print(f"⚠️ Missing key at {child_path}")
                    return False
                if not compare_structure(parsed[key], template_value, child_path):
                    return False
            return True

        if isinstance(original, list):
            if not isinstance(parsed, list):
                return report_mismatch(parsed, original, path)
            return True

        # Template leaf. Empty placeholders may be filled with anything.
        if original is None or (isinstance(original, str) and not original.strip()):
            return True
        # A populated scalar must not turn into a container.
        if isinstance(parsed, (dict, list)):
            return report_mismatch(parsed, original, path)
        return True

    try:
        return compare_structure(parsed_json, original_json)
    except RecursionError:
        return False

if __name__ == "__main__":
    # Command-line entry: <word_json_file> <pdf_txt_file> <output_json_file>.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        sys.exit(0)

    word_json_path, pdf_txt_path, output_json_path = cli_args

    try:
        update_json_with_pdf(word_json_path, pdf_txt_path, output_json_path)
    except Exception as e:
        print(f"Unexpected exception: {e}")
        # Last resort: copy the input template to the output unchanged so a
        # result file exists; any failure while doing so is ignored.
        try:
            with open(word_json_path, "r", encoding="utf-8") as inf:
                with open(output_json_path, "w", encoding="utf-8") as outf:
                    outf.write(inf.read())
                    print("Wrote original input JSON to output due to exception.")
        except Exception:
            pass
        sys.exit(0)