Upload 4 files
- extract_pdf_data.py +33 -0
- extract_red_text.py +335 -0
- update_docx_with_pdf.py +56 -0
- updated_word.py +605 -0
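
Taken together, the four scripts appear to form a pipeline: extract_red_text.py pulls the red placeholder text out of a Word template into word_red_data.json; extract_pdf_data.py dumps the full text of the source PDF (OCR-ing pages with no text layer) into pdf_all_text_full.txt; update_docx_with_pdf.py asks GPT-4o to fill the JSON template from the PDF text; and updated_word.py writes the filled values back into the .docx. A minimal driver sketch under those assumptions is below (run_pipeline.py is hypothetical, not part of this commit). Note one hand-off wrinkle: update_docx_with_pdf.py writes updated_word_data1.json while updated_word.py reads updated_word_data.json, so one of the two names has to be adjusted (or the file renamed) between those steps.

# run_pipeline.py - hypothetical driver, assuming the filenames the four scripts use
import subprocess

steps = [
    ["python", "extract_red_text.py", "test.docx"],  # -> word_red_data.json
    ["python", "extract_pdf_data.py"],               # -> pdf_all_text_full.txt
    ["python", "update_docx_with_pdf.py"],           # -> updated_word_data1.json
    ["python", "updated_word.py"],                   # reads updated_word_data.json -> updated_reportv1.docx
]
for cmd in steps:
    subprocess.run(cmd, check=True)  # abort the pipeline if any step fails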
extract_pdf_data.py
ADDED
@@ -0,0 +1,33 @@
import pdfplumber
from pdf2image import convert_from_path
import pytesseract

def extract_pdf_full_text(pdf_path, txt_path):
    raw_texts = []
    need_ocr = []
    # Step 1: Try to extract RAW text, record which pages need OCR
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            print(f"Extracting text from page {i+1}...")
            text = page.extract_text() or ""
            if text.strip():
                raw_texts.append(f"\n--- PAGE {i+1} RAW TEXT ---\n{text.strip()}")
            else:
                raw_texts.append(None)  # Mark that we need OCR for this page
                need_ocr.append(i)

    # Step 2: OCR only those pages with no RAW text
    print("Running OCR where RAW text is missing...")
    images = convert_from_path(pdf_path, dpi=300)
    for idx in need_ocr:
        ocr_text = pytesseract.image_to_string(images[idx])
        raw_texts[idx] = f"\n--- PAGE {idx+1} OCR TEXT ---\n{ocr_text.strip()}"

    # Step 3: Save to file (skip any leftover Nones, but there shouldn't be any)
    result = [txt for txt in raw_texts if txt]
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))
    print(f"✅ Saved full text to {txt_path}")

if __name__ == "__main__":
    extract_pdf_full_text("test1.pdf", "pdf_all_text_full.txt")
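
Both libraries above shell out to external binaries: pdf2image needs Poppler's pdftoppm, and pytesseract needs the tesseract executable. A small environment-check sketch (the install path is an assumption, adjust for your system):

# Check the two external binaries this script relies on.
import shutil

import pytesseract
from pdf2image import convert_from_path

if shutil.which("tesseract") is None:
    # pytesseract shells out to the tesseract binary; point it at a custom install if needed
    pytesseract.pytesseract.tesseract_cmd = r"/usr/local/bin/tesseract"  # assumed path

# pdf2image shells out to Poppler; pass poppler_path if pdftoppm isn't on PATH
images = convert_from_path("test1.pdf", dpi=300, poppler_path=None)  # None = use PATH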
extract_red_text.py
ADDED
@@ -0,0 +1,335 @@
#!/usr/bin/env python3
import re, json, sys
from docx import Document
from docx.oxml.ns import qn

def is_red_font(run) -> bool:
    """Return True if this run is coloured red-ish."""
    col = run.font.color
    if col and col.rgb:
        r, g, b = col.rgb[0], col.rgb[1], col.rgb[2]
        if r > 150 and g < 100 and b < 100 and (r - g) > 30 and (r - b) > 30:
            return True
    # fallback: raw <w:color w:val="XXXXXX"/>
    rPr = getattr(run._element, "rPr", None)
    if rPr is not None:
        clr = rPr.find(qn('w:color'))
        if clr is not None:
            val = clr.get(qn('w:val'))
            if val and re.fullmatch(r"[0-9A-Fa-f]{6}", val):  # val may be None or "auto"
                rr, gg, bb = int(val[:2], 16), int(val[2:4], 16), int(val[4:], 16)
                if rr > 150 and gg < 100 and bb < 100 and (rr - gg) > 30 and (rr - bb) > 30:
                    return True
    return False

# ─────────────────────────────────────────────────────────────────────────────
# Your template, mapped 1:1 to doc.tables[0..19]
MASTER_TABLES = [
    # Table 0: Tick as appropriate (Mass, Maintenance, etc.)
    {
        "name": "Tick as appropriate",
        "labels_on_row1": True,
        "labels": [
            "Mass", "Maintenance", "Basic Fatigue", "Advanced Fatigue",
            "Entry Audit", "Initial Compliance Audit", "Compliance Audit",
            "Spot Check", "Triggered Audit"
        ]
    },

    # Table 1: Audit Information
    {
        "name": "Audit Information",
        "labels_on_left": True,
        "labels": [
            "Date of Audit",
            "Location of audit",
            "Auditor name",
            "Audit Matrix Identifier (Name or Number)",  # Corrected full label
            "Auditor Exemplar Global Reg No.",
            "expiry Date:",
            "NHVR Auditor Registration Number",
            "expiry Date:"  # Note: duplicate label, might need special handling
        ]
    },

    # Table 2: Operator Information (including contact details)
    {
        "name": "Operator Information",
        "labels_on_left": True,
        "skip_rows": ["Operator contact details", ""],  # Skip subheading and blank rows
        "labels": [
            "Operator name (Legal entity)",
            "NHVAS Accreditation No. (If applicable)",
            "Registered trading name/s",
            "Australian Company Number",
            "NHVAS Manual (Policies and Procedures) developed by",
            "Operator business address",
            "Operator Postal address",
            "Email address",
            "Operator Telephone Number"
        ]
    },

    # Table 3: Attendance List
    {
        "name": "Attendance List (Names and Position Titles)",
        "labels": ["Attendance List (Names and Position Titles)"]
    },

    # Table 4: Nature of the Operators Business
    {
        "name": "Nature of the Operators Business (Summary)",
        "labels": [
            "Nature of the Operators Business (Summary)",
            "Accreditation Number:",
            "Expiry Date:"
        ]
    },

    # Table 5: Accreditation Vehicle Summary
    {
        "name": "Accreditation Vehicle Summary",
        "labels_on_left": True,
        "labels": [
            "Number of powered vehicles",
            "Number of trailing vehicles"
        ]
    },

    # Table 6: Accreditation Driver Summary
    {
        "name": "Accreditation Driver Summary",
        "labels_on_left": True,
        "labels": [
            "Number of drivers in BFM",
            "Number of drivers in AFM"
        ]
    },

    # Table 7: Compliance Codes
    {
        "name": "Compliance Codes",
        "labels_on_row1": True,
        "labels": ["V", "SFI", "NA", "NC", "NAP"]
    },

    # Table 8: Corrective Action Request Identification
    {
        "name": "Corrective Action Request Identification",
        "labels_on_row1": True,
        "labels": ["Title", "Abbreviation", "Description"]
    },

    # Table 9: MASS MANAGEMENT (Standards 1-8)
    {
        "name": "MASS MANAGEMENT",
        "labels_on_left": True,
        "labels": [
            "Std 1. Responsibilities",
            "Std 2. Vehicle Control",
            "Std 3. Vehicle Use",
            "Std 4. Records and Documentation",
            "Std 5. Verification",
            "Std 6. Internal Review",
            "Std 7. Training and Education",
            "Std 8. Maintenance of Suspension"
        ]
    },

    # Table 10: Mass Management Summary of Audit findings (Standards 1-8)
    {
        "name": "Mass Management Summary of Audit findings",
        "labels_on_left": True,
        "labels": [
            "Std 1. Responsibilities",
            "Std 2. Vehicle Control",
            "Std 3. Vehicle Use",
            "Std 4. Records and Documentation",
            "Std 5. Verification",
            "Std 6. Internal Review",
            "Std 7. Training and Education",
            "Std 8. Maintenance of Suspension"
        ]
    },

    # Table 11: Vehicle Registration Numbers of Records Examined
    {
        "name": "Vehicle Registration Numbers of Records Examined",
        "labels_on_row1": True,
        "labels": [
            "No.", "Registration Number",
            "Sub-contractor (Yes/No)",
            "Sub-contracted Vehicles Statement of Compliance (Yes/No)",
            "Weight Verification Records (Date Range)",
            "RFS Suspension Certification # (N/A if not applicable)",
            "Suspension System Maintenance (Date Range)",
            "Trip Records (Date Range)",
            "Fault Recording/ Reporting on Suspension System (Date Range)"
        ]
    },

    # Table 12: Operator's Name (legal entity) - Signature block
    {
        "name": "Operator’s Name (legal entity)",
        "labels": ["Operator’s Name (legal entity)"]
    },

    # Table 13: Non-conformance type
    {
        "name": "Non-conformance type (please tick)",
        "labels": ["Un-conditional", "Conditional"]
    },

    # Table 14: Non-conformance Information
    {
        "name": "Non-conformance Information",
        "labels_on_row1": True,
        "labels": [
            "Non-conformance agreed close out date",
            "Module and Standard",
            "Corrective Action Request (CAR) Number"
        ]
    },

    # Table 15: Non-conformance and action taken
    {
        "name": "Non-conformance and action taken",
        "labels_on_row1": True,
        "labels": [
            "Observed Non-conformance:",
            "Corrective Action taken or to be taken by operator:",
            "Operator or Representative Signature", "Position", "Date"
        ]
    },

    # Table 16: Print Name / Auditor Reg Number
    {
        "name": "Print Name / Auditor Reg Number",
        "labels_on_row1": True,
        "labels": [
            "Print Name",
            "NHVR or Exemplar Global Auditor Registration Number"
        ]
    },

    # Table 17: Audit Declaration
    {
        "name": "Audit Declaration",
        "labels_on_left": True,
        "labels": [
            "Audit was conducted on",
            "Unconditional CARs closed out on:",
            "Conditional CARs to be closed out by:"
        ]
    },

    # Table 18: print accreditation name
    {
        "name": "print accreditation name",
        "labels": ["print accreditation name"]
    },

    # Table 19: Operator Declaration
    {
        "name": "Operator Declaration",
        "labels_on_row1": True,
        "labels": ["Print Name", "Position Title"]
    }
]

def extract_red_text(path):
    doc = Document(path)

    # debug print
    print(f"Found {len(doc.tables)} tables:")
    for i, t in enumerate(doc.tables):
        print(f"  Table#{i}: '{t.rows[0].cells[0].text.strip()[:30]}…'")
    print()

    out = {}
    for ti, spec in enumerate(MASTER_TABLES):
        if ti >= len(doc.tables):
            break
        tbl = doc.tables[ti]
        name = spec["name"]

        # prepare container & dedupe sets
        collected = {lbl: [] for lbl in spec["labels"]}
        seen = {lbl: set() for lbl in spec["labels"]}

        # choose orientation
        if spec.get("labels_on_row1"):
            headers = spec["labels"]
            rows = tbl.rows[1:]
            col_mode = True
        elif spec.get("labels_on_left"):
            headers = spec["labels"]
            # skip any unwanted header/subheading rows
            rows = [
                row for row in tbl.rows[1:]
                if row.cells[0].text.strip() not in spec.get("skip_rows", [])
            ]
            col_mode = False
        else:
            headers = [name]
            rows = tbl.rows
            col_mode = None

        # scan each cell
        for ri, row in enumerate(rows):
            for ci, cell in enumerate(row.cells):
                red = "".join(
                    run.text for p in cell.paragraphs for run in p.runs
                    if is_red_font(run)
                ).strip()
                if not red:
                    continue

                # assign label
                if col_mode is True:
                    lbl = headers[ci] if ci < len(headers) else name
                elif col_mode is False:
                    lbl = headers[ri] if ri < len(headers) else name
                else:
                    lbl = name

                # dedupe & collect
                if red not in seen[lbl]:
                    seen[lbl].add(red)
                    collected[lbl].append(red)

        # only keep non-empty labels
        filtered = {l: collected[l] for l in collected if collected[l]}
        if filtered:
            out[name] = filtered

    # paragraphs
    paras = {}
    for i, para in enumerate(doc.paragraphs):
        red = "".join(r.text for r in para.runs if is_red_font(r)).strip()
        if not red:
            continue
        # find the nearest non-red paragraph above to use as the label
        lab = None
        for j in range(i - 1, -1, -1):
            if any(is_red_font(r) for r in doc.paragraphs[j].runs):
                continue
            txt = doc.paragraphs[j].text.strip()
            if txt:
                lab = txt
                break
        key = lab or "(para)"
        paras.setdefault(key, []).append(red)

    if paras:
        out["paragraphs"] = paras
    return out

if __name__ == "__main__":
    fn = sys.argv[1] if len(sys.argv) > 1 else "test.docx"
    word_data = extract_red_text(fn)

    # --- STORE TO JSON for later reuse ---
    with open('word_red_data.json', 'w', encoding='utf-8') as f:
        json.dump(word_data, f, indent=2, ensure_ascii=False)
    # ----------------------------------------

    # still print to console for immediate feedback
    print(json.dumps(word_data, indent=2, ensure_ascii=False))
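
The "red-ish" test in is_red_font is a threshold heuristic, not an exact match on FF0000, so it also catches shades like Word's standard dark red. A standalone sketch of the same thresholds applied to hex strings (illustrative only, mirroring the constants above):

# Restatement of the is_red_font thresholds for a 6-digit hex colour string
def looks_red(hex_rgb: str) -> bool:
    r, g, b = (int(hex_rgb[i:i + 2], 16) for i in (0, 2, 4))
    return r > 150 and g < 100 and b < 100 and (r - g) > 30 and (r - b) > 30

assert looks_red("FF0000")      # pure red
assert looks_red("C00000")      # Word's standard "dark red"
assert not looks_red("808080")  # grey fails the r > 150 test
assert not looks_red("FF9900")  # orange fails the g < 100 test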
update_docx_with_pdf.py
ADDED
|
@@ -0,0 +1,56 @@
import openai
import json
import os

# Read the OpenAI API key from the environment (never hard-code secrets in source)
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Input/output files
WORD_JSON_FILE = "word_red_data.json"
PDF_TEXT_FILE = "pdf_all_text_full.txt"
OUTPUT_FILE = "updated_word_data1.json"

# --- Load files ---
with open(WORD_JSON_FILE, "r", encoding="utf-8") as f:
    word_json = f.read()
with open(PDF_TEXT_FILE, "r", encoding="utf-8") as f:
    pdf_txt = f.read()

# --- Build prompt ---
user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}

Here is the extracted text from a PDF:
{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Make sure the JSON is valid and ready to use.
"""

# --- Call OpenAI API ---
client = openai.OpenAI(api_key=OPENAI_API_KEY)
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."},
        {"role": "user", "content": user_prompt}
    ],
    max_tokens=4096,
    temperature=0
)

updated_json_str = response.choices[0].message.content.strip()

# --- Try to parse as JSON ---
try:
    parsed = json.loads(updated_json_str)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2, ensure_ascii=False)
    print("✅ JSON updated and saved to", OUTPUT_FILE)
except Exception as e:
    print("⚠️ Model did not return valid JSON. Raw output below:\n")
    print(updated_json_str)
    print("\n❌ Failed to parse updated JSON:", e)
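
Even with temperature 0 and a "no markdown" system prompt, models occasionally wrap the output in ```json fences, which makes the json.loads above fail. A defensive parse helper one could drop in front of it (a sketch, not part of the commit):

# Strip an optional leading ```json / ``` fence and trailing ``` before parsing
import json, re

def parse_model_json(raw: str):
    raw = raw.strip()
    raw = re.sub(r"^```(?:json)?\s*", "", raw)  # opening fence, if any
    raw = re.sub(r"\s*```$", "", raw)           # closing fence, if any
    return json.loads(raw)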
updated_word.py
ADDED
|
@@ -0,0 +1,605 @@
import json
from docx import Document
from docx.shared import RGBColor
import re

def load_json(filepath):
    with open(filepath, 'r') as file:
        return json.load(file)

def flatten_json(y, prefix=''):
    out = {}
    for key, val in y.items():
        new_key = f"{prefix}.{key}" if prefix else key
        if isinstance(val, dict):
            out.update(flatten_json(val, new_key))
        else:
            out[new_key] = val
        out[key] = val  # also keep the bare key (and any nested dict) for direct lookups
    return out

def is_red(run):
    color = run.font.color
    return color and (color.rgb == RGBColor(255, 0, 0) or getattr(color, "theme_color", None) == 1)

def get_value_as_string(value, field_name=""):
    if isinstance(value, list):
        if len(value) == 0:
            return ""
        elif len(value) == 1:
            return str(value[0])
        else:
            if "australian company number" in field_name.lower() or "company number" in field_name.lower():
                return value  # keep the list so each digit can go in its own cell
            else:
                return " ".join(str(v) for v in value)
    else:
        return str(value)

def find_matching_json_value(field_name, flat_json):
    """Find matching JSON value based on field name (key)"""
    field_name = field_name.strip()

    # Manual mapping for specific sections that need special handling
    manual_mappings = {
        "attendance list name and position title": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
        "attendance list (names and position titles)": "Attendance List (Names and Position Titles).Attendance List (Names and Position Titles)",
        "nature of the operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        "nature of the operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        "nature of operators business (summary)": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        "nature of operators business (summary):": "Nature of the Operators Business (Summary).Nature of the Operators Business (Summary)",
        # Paragraph-level mappings
        "mass management": "paragraphs.MASS MANAGEMENT",
        "liam herbig": "paragraphs.MASS MANAGEMENT",  # Name should be replaced with company name
        "date": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
        # Date-related mappings
        "13.11.2024": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
        "auditor signature": "paragraphs.This management system I have audited when followed will ensure compliance with the relevant NHVAS Business Rules & Standards.",
        "operator signature": "paragraphs.I hereby consent to information relating to my Accreditation to be shared with other law enforcement agencies, including a service provider authorised under the Heavy Vehicle National Law.",
        # Specific data mappings
        "jodie jones": "Audit Information.Auditor name",
        "13th november 2024": "Audit Information.Date of Audit",
        "adelaide barossa transport & warehousing pty ltd": "Operator Information.Operator name (Legal entity)",
        "manager": "Operator Information.Operator name (Legal entity)",  # Replace manager title with company name
        "liam herbig –manager": "Operator Information.Operator name (Legal entity)",
        "liam herbig – manager": "Operator Information.Operator name (Legal entity)",
        "deborah herbig – manager": "Operator Information.Operator name (Legal entity)",
        # Contact information mappings (old data in red text -> new data from JSON)
        "141 sitz road callington sa 5254": "Operator Information.Operator business address",  # Replace old address with new
        "po box 743 mt barker sa": "Operator Information.Operator Postal address",  # Replace old postal with new
        "debherbig@bigpond.com": "Operator Information.Email address",  # Replace old email with new
        "0447 710 602": "Operator Information.Operator Telephone Number",  # Replace old phone with new
        # Manual/Version mappings (old version -> new version)
        "mahlo 092021v1": "Operator Information.NHVAS Manual (Policies and Procedures) developed by",  # Replace old manual with new
        # These should stay as they are (no replacement needed, just different format)
        "511840": "Operator Information.NHVAS Accreditation No. (If applicable)",  # Keep accreditation number
        "26th october 2023": "Audit Information.Date of Audit",  # Use audit date instead
        # Std 5 and Std 6 mappings
        "the latest verification was dated 23rdnovember 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
        "the latest verification was dated 23rd november 2022": "Mass Management Summary of Audit findings.Std 5. Verification",
        "internal review was dated 23rd august 2023 with 0 ncr": "Mass Management Summary of Audit findings.Std 6. Internal Review",
        "23rd august2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
        "23rd august 2023 with 0 trips, 0 trips using mass, 0 overloads and 0 ncr's": "Mass Management Summary of Audit findings.Std 6. Internal Review",
    }

    # Check manual mappings first
    normalized_field = field_name.lower().strip()
    if normalized_field in manual_mappings:
        mapped_key = manual_mappings[normalized_field]
        if mapped_key in flat_json:
            print(f"    ✅ Manual mapping found for '{field_name}' -> '{mapped_key}'")
            return flat_json[mapped_key]

    # Try exact match first
    if field_name in flat_json:
        print(f"    Direct match found for key '{field_name}'")
        return flat_json[field_name]

    # Try case-insensitive exact match
    for key, value in flat_json.items():
        if key.lower() == field_name.lower():
            print(f"    Case-insensitive match found for key '{field_name}' with JSON key '{key}'")
            return value

    # Try to find a key that ends with this field name
    for key, value in flat_json.items():
        if key.endswith('.' + field_name):
            print(f"    Suffix match found for key '{field_name}' with JSON key '{key}'")
            return value

    # Try partial matching for fields with parentheses or additional text
    clean_field = re.sub(r'\s*\([^)]*\)', '', field_name).strip()  # Remove parentheses content
    for key, value in flat_json.items():
        clean_key = re.sub(r'\s*\([^)]*\)', '', key).strip()
        if clean_field.lower() == clean_key.lower():
            print(f"    Clean match found for key '{field_name}' with JSON key '{key}'")
            return value

    # Try word-based matching - more flexible approach
    field_words = set(word.lower() for word in re.findall(r'\b\w+\b', field_name) if len(word) > 2)
    best_match = None
    best_score = 0

    for key, value in flat_json.items():
        key_words = set(word.lower() for word in re.findall(r'\b\w+\b', key) if len(word) > 2)
        # Calculate how many words match
        common_words = field_words.intersection(key_words)
        if common_words:
            score = len(common_words) / max(len(field_words), len(key_words))  # Normalized score
            if score > best_score:
                best_score = score
                best_match = (key, value)

    if best_match and best_score >= 0.5:  # At least 50% word overlap
        print(f"    Word-based match found for key '{field_name}' with JSON key '{best_match[0]}' (score: {best_score:.2f})")
        return best_match[1]

    # No match found
    print(f"    ❌ No match found for '{field_name}'")
    return None

def get_clean_text(cell):
    text = ""
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            text += run.text
    return text.strip()

def has_red_text(cell):
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            if is_red(run) and run.text.strip():
                return True
    return False

def replace_red_text_in_cell(cell, replacement_text):
    replacements_made = 0

    # First, collect all red text to show what we're replacing
    all_red_text = ""
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            if is_red(run):
                all_red_text += run.text

    if all_red_text.strip():
        print(f"    ✅ Replacing red text: '{all_red_text[:50]}...' → '{replacement_text[:50]}...'")

    # Now replace all red text in the cell with the replacement text
    first_replacement_done = False
    for paragraph in cell.paragraphs:
        red_runs = [run for run in paragraph.runs if is_red(run)]
        if red_runs:
            if not first_replacement_done:
                # Replace the first red run with our text
                red_runs[0].text = replacement_text
                red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
                first_replacement_done = True
                replacements_made = 1
            else:
                # Clear the first red run since we already replaced content
                red_runs[0].text = ''

            # Clear all other red runs in this paragraph
            for run in red_runs[1:]:
                run.text = ''

    return replacements_made

def handle_australian_company_number(row, company_numbers):
    replacements_made = 0
    for i, digit in enumerate(company_numbers):
        cell_idx = i + 1
        if cell_idx < len(row.cells):
            cell = row.cells[cell_idx]
            if has_red_text(cell):
                cell_replacements = replace_red_text_in_cell(cell, str(digit))
                replacements_made += cell_replacements
                print(f"      -> Placed digit '{digit}' in cell {cell_idx + 1}")
    return replacements_made

def handle_vehicle_registration_table(table, flat_json):
    """Handle the Vehicle Registration Numbers table with column-based data"""
    replacements_made = 0

    # Look for the vehicle registration data in the flattened JSON
    vehicle_section = None

    # Try to find the vehicle registration section
    for key, value in flat_json.items():
        if "vehicle registration numbers of records examined" in key.lower():
            if isinstance(value, dict):  # This should be the nested structure
                vehicle_section = value
                print(f"    ✅ Found vehicle data in key: '{key}'")
                break

    if not vehicle_section:
        # Try alternative approach - look for individual column keys
        potential_columns = {}
        for key, value in flat_json.items():
            if any(col_name in key.lower() for col_name in ["registration number", "sub-contractor", "weight verification", "rfs suspension"]):
                # Extract the column name from the flattened key
                if "." in key:
                    column_name = key.split(".")[-1]
                else:
                    column_name = key
                potential_columns[column_name] = value

        if potential_columns:
            vehicle_section = potential_columns
            print(f"    ✅ Found vehicle data from flattened keys: {list(vehicle_section.keys())}")
        else:
            print(f"    ❌ Vehicle registration data not found in JSON")
            return 0

    print(f"    ✅ Found vehicle registration data with {len(vehicle_section)} columns")

    # Find header row (usually row 0 or 1)
    header_row_idx = -1
    header_row = None

    for row_idx, row in enumerate(table.rows):
        row_text = "".join(get_clean_text(cell).lower() for cell in row.cells)
        if "registration" in row_text and "number" in row_text:
            header_row_idx = row_idx
            header_row = row
            break

    if header_row_idx == -1:
        print(f"    ❌ Could not find header row in vehicle table")
        return 0

    print(f"    ✅ Found header row at index {header_row_idx}")

    # Create mapping between column indices and JSON keys
    column_mapping = {}
    for col_idx, cell in enumerate(header_row.cells):
        header_text = get_clean_text(cell).strip()
        if not header_text or header_text.lower() == "no.":
            continue

        # Try to match header text with JSON keys
        best_match = None
        best_score = 0

        # Normalize header text for better matching
        normalized_header = header_text.lower().replace("(", " (").replace(")", ") ").strip()

        for json_key in vehicle_section.keys():
            normalized_json = json_key.lower().strip()

            # Try exact match first (after normalization)
            if normalized_header == normalized_json:
                best_match = json_key
                best_score = 1.0
                break

            # Try word-based matching
            header_words = set(word.lower() for word in normalized_header.split() if len(word) > 2)
            json_words = set(word.lower() for word in normalized_json.split() if len(word) > 2)

            if header_words and json_words:
                common_words = header_words.intersection(json_words)
                score = len(common_words) / max(len(header_words), len(json_words))

                if score > best_score and score >= 0.3:  # At least 30% match
                    best_score = score
                    best_match = json_key

            # Try substring matching for cases like "RegistrationNumber" vs "Registration Number"
            header_clean = normalized_header.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")
            json_clean = normalized_json.replace(" ", "").replace("-", "").replace("(", "").replace(")", "")

            if header_clean in json_clean or json_clean in header_clean:
                if len(header_clean) > 5 and len(json_clean) > 5:  # Only for meaningful matches
                    substring_score = min(len(header_clean), len(json_clean)) / max(len(header_clean), len(json_clean))
                    if substring_score > best_score and substring_score >= 0.6:
                        best_score = substring_score
                        best_match = json_key

        if best_match:
            column_mapping[col_idx] = best_match
            print(f"    Column {col_idx + 1} ('{header_text}') -> '{best_match}' (score: {best_score:.2f})")

    if not column_mapping:
        print(f"    ❌ No column mappings found")
        return 0

    # Determine how many data rows we need based on the JSON arrays
    max_data_rows = 0
    for json_key, data in vehicle_section.items():
        if isinstance(data, list):
            max_data_rows = max(max_data_rows, len(data))

    print(f"    Need to populate {max_data_rows} data rows")

    # Process all required data rows
    for data_row_index in range(max_data_rows):
        table_row_idx = header_row_idx + 1 + data_row_index

        # Check if this table row exists; if not, add it
        if table_row_idx >= len(table.rows):
            print(f"    ⚠️ Row {table_row_idx + 1} doesn't exist - table only has {len(table.rows)} rows")
            print(f"    ➕ Adding new row for vehicle {data_row_index + 1}")

            # Add a new row to the table
            table.add_row()
            print(f"    ✅ Successfully added row {len(table.rows)} to the table")

        row = table.rows[table_row_idx]
        print(f"    Processing data row {table_row_idx + 1} (vehicle {data_row_index + 1})")

        # Fill in data for each mapped column
        for col_idx, json_key in column_mapping.items():
            if col_idx < len(row.cells):
                cell = row.cells[col_idx]

                # Get the data for this column and row
                column_data = vehicle_section.get(json_key, [])
                if isinstance(column_data, list) and data_row_index < len(column_data):
                    replacement_value = str(column_data[data_row_index])

                    # Check if cell has red text or is empty (needs data)
                    cell_text = get_clean_text(cell)
                    if has_red_text(cell) or not cell_text.strip():
                        # If cell is empty, add the text directly
                        if not cell_text.strip():
                            cell.text = replacement_value
                            replacements_made += 1
                            print(f"      -> Added '{replacement_value}' to empty cell (column '{json_key}')")
                        else:
                            # If cell has red text, replace it
                            cell_replacements = replace_red_text_in_cell(cell, replacement_value)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f"      -> Replaced red text with '{replacement_value}' (column '{json_key}')")

    return replacements_made

def handle_print_accreditation_section(table, flat_json):
    """Handle the special case of print accreditation name with 2 values"""
    replacements_made = 0

    # Look for the print accreditation name data
    print_data = flat_json.get("print accreditation name.print accreditation name", [])
    if not isinstance(print_data, list) or len(print_data) < 2:
        return 0

    name_value = print_data[0]      # e.g. "Simon Anderson"
    position_value = print_data[1]  # e.g. "Director"

    print(f"    Print accreditation data: Name='{name_value}', Position='{position_value}'")

    # Find rows with "Print Name" and "Position Title"
    for row_idx, row in enumerate(table.rows):
        if len(row.cells) >= 2:
            # Check if this row has the headers
            cell1_text = get_clean_text(row.cells[0]).lower()
            cell2_text = get_clean_text(row.cells[1]).lower()

            if "print name" in cell1_text and "position title" in cell2_text:
                print(f"    Found header row {row_idx + 1}: '{cell1_text}' | '{cell2_text}'")

                # Check the next row for red text to replace
                if row_idx + 1 < len(table.rows):
                    data_row = table.rows[row_idx + 1]
                    if len(data_row.cells) >= 2:
                        # Replace Print Name (first cell)
                        if has_red_text(data_row.cells[0]):
                            cell_replacements = replace_red_text_in_cell(data_row.cells[0], name_value)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f"      ✅ Replaced Print Name: '{name_value}'")

                        # Replace Position Title (second cell)
                        if has_red_text(data_row.cells[1]):
                            cell_replacements = replace_red_text_in_cell(data_row.cells[1], position_value)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f"      ✅ Replaced Position Title: '{position_value}'")

                break  # Found the section, no need to continue

    return replacements_made

def process_single_column_sections(cell, field_name, flat_json):
    json_value = find_matching_json_value(field_name, flat_json)
    if json_value is not None:
        replacement_text = get_value_as_string(json_value, field_name)
        if isinstance(json_value, list) and len(json_value) > 1:
            replacement_text = "\n".join(str(item) for item in json_value)
        if has_red_text(cell):
            print(f"    ✅ Replacing red text in single-column section: '{field_name}'")
            print(f"    ✅ Replacement text:\n{replacement_text}")
            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
            if cell_replacements > 0:
                print(f"      -> Replaced with: '{replacement_text[:100]}...'")
            return cell_replacements
    return 0

def process_tables(document, flat_json):
    """Process tables to find key-value pairs and replace red values"""
    replacements_made = 0

    for table_idx, table in enumerate(document.tables):
        print(f"\nProcessing table {table_idx + 1}:")

        # Check if this is the vehicle registration table
        table_text = ""
        for row in table.rows[:3]:  # Check first 3 rows
            for cell in row.cells:
                table_text += get_clean_text(cell).lower() + " "

        # Look for vehicle registration indicators (need multiple indicators to avoid false positives)
        vehicle_indicators = ["registration number", "sub-contractor", "weight verification", "rfs suspension"]
        indicator_count = sum(1 for indicator in vehicle_indicators if indicator in table_text)
        if indicator_count >= 3:  # Require at least 3 indicators to be sure it's a vehicle table
            print(f"  Detected Vehicle Registration table")
            vehicle_replacements = handle_vehicle_registration_table(table, flat_json)
            replacements_made += vehicle_replacements
            continue  # Skip normal processing for this table

        # Check if this is the print accreditation table
        print_accreditation_indicators = ["print name", "position title"]
        indicator_count = sum(1 for indicator in print_accreditation_indicators if indicator in table_text)
        if indicator_count >= 2:  # Require at least 2 indicators to be sure it's a print accreditation table
            print(f"  Detected Print Accreditation table")
            print_accreditation_replacements = handle_print_accreditation_section(table, flat_json)
            replacements_made += print_accreditation_replacements
            continue  # Skip normal processing for this table

        for row_idx, row in enumerate(table.rows):
            if len(row.cells) < 1:  # Skip empty rows
                continue

            # Get the key from the first column
            key_cell = row.cells[0]
            key_text = get_clean_text(key_cell)

            if not key_text:
                continue

            print(f"  Row {row_idx + 1}: Key = '{key_text}'")

            # Check if this key exists in our JSON
            json_value = find_matching_json_value(key_text, flat_json)

            if json_value is not None:
                replacement_text = get_value_as_string(json_value, key_text)

                # Special handling for Australian Company Number
                if ("australian company number" in key_text.lower() or "company number" in key_text.lower()) and isinstance(json_value, list):
                    cell_replacements = handle_australian_company_number(row, json_value)
                    replacements_made += cell_replacements

                # Handle section headers (like Attendance List, Nature of Business) where content is in next row
                elif ("attendance list" in key_text.lower() or "nature of" in key_text.lower()) and row_idx + 1 < len(table.rows):
                    print(f"    ✅ Section header detected, checking next row for content...")
                    next_row = table.rows[row_idx + 1]

                    # Check all cells in the next row for red text
                    for cell_idx, cell in enumerate(next_row.cells):
                        if has_red_text(cell):
                            print(f"    ✅ Found red text in next row, cell {cell_idx + 1}")
                            # For list values, join with line breaks
                            if isinstance(json_value, list):
                                replacement_text = "\n".join(str(item) for item in json_value)
                            cell_replacements = replace_red_text_in_cell(cell, replacement_text)
                            replacements_made += cell_replacements
                            if cell_replacements > 0:
                                print(f"      -> Replaced section content with: '{replacement_text[:100]}...'")

                elif len(row.cells) == 1 or (len(row.cells) > 1 and not any(has_red_text(row.cells[i]) for i in range(1, len(row.cells)))):
                    if has_red_text(key_cell):
                        cell_replacements = process_single_column_sections(key_cell, key_text, flat_json)
                        replacements_made += cell_replacements
                else:
                    for cell_idx in range(1, len(row.cells)):
                        value_cell = row.cells[cell_idx]
                        if has_red_text(value_cell):
                            print(f"    ✅ Found red text in column {cell_idx + 1}")
                            cell_replacements = replace_red_text_in_cell(value_cell, replacement_text)
                            replacements_made += cell_replacements
            else:
                if len(row.cells) == 1 and has_red_text(key_cell):
                    red_text = ""
                    for paragraph in key_cell.paragraphs:
                        for run in paragraph.runs:
                            if is_red(run):
                                red_text += run.text
                    if red_text.strip():
                        section_value = find_matching_json_value(red_text.strip(), flat_json)
                        if section_value is not None:
                            section_replacement = get_value_as_string(section_value, red_text.strip())
                            cell_replacements = replace_red_text_in_cell(key_cell, section_replacement)
                            replacements_made += cell_replacements

                # Handle tables where red text appears in multiple columns (like contact info tables)
                for cell_idx in range(len(row.cells)):
                    cell = row.cells[cell_idx]
                    if has_red_text(cell):
                        # Get the red text from this cell
                        red_text = ""
                        for paragraph in cell.paragraphs:
                            for run in paragraph.runs:
                                if is_red(run):
                                    red_text += run.text

                        if red_text.strip():
                            # Try to find a direct mapping for this red text
                            section_value = find_matching_json_value(red_text.strip(), flat_json)
                            if section_value is not None:
                                section_replacement = get_value_as_string(section_value, red_text.strip())
                                cell_replacements = replace_red_text_in_cell(cell, section_replacement)
                                replacements_made += cell_replacements
                                if cell_replacements > 0:
                                    print(f"    ✅ Replaced red text '{red_text.strip()[:30]}...' with '{section_replacement[:30]}...' in cell {cell_idx + 1}")

    return replacements_made

def process_paragraphs(document, flat_json):
    replacements_made = 0
    print(f"\nProcessing paragraphs:")
    for para_idx, paragraph in enumerate(document.paragraphs):
        red_runs = [run for run in paragraph.runs if is_red(run) and run.text.strip()]
        if red_runs:
            red_text_only = "".join(run.text for run in red_runs).strip()
            print(f"  Paragraph {para_idx + 1}: Found red text: '{red_text_only}'")

            # Try to match the red text specifically first
            json_value = find_matching_json_value(red_text_only, flat_json)

            # If no match, try some common patterns
            if json_value is None:
                # Check for signature patterns
                if "AUDITOR SIGNATURE" in red_text_only.upper() or "DATE" in red_text_only.upper():
                    json_value = find_matching_json_value("auditor signature", flat_json)
                elif "OPERATOR SIGNATURE" in red_text_only.upper():
                    json_value = find_matching_json_value("operator signature", flat_json)

            if json_value is not None:
                replacement_text = get_value_as_string(json_value)
                print(f"    ✅ Replacing red text with: '{replacement_text}'")
                red_runs[0].text = replacement_text
                red_runs[0].font.color.rgb = RGBColor(0, 0, 0)
                for run in red_runs[1:]:
                    run.text = ''
                replacements_made += 1
    return replacements_made

def main():
    json_path = 'updated_word_data.json'
    docx_path = 'test.docx'
    output_path = 'updated_reportv1.docx'

    try:
        json_data = load_json(json_path)
        flat_json = flatten_json(json_data)
        print("Available JSON keys (sample):")
        count = 0
        for key, value in sorted(flat_json.items()):
            if count < 10:
                print(f"  - {key}: {value}")
                count += 1
        print(f"  ... and {len(flat_json) - count} more keys\n")

        doc = Document(docx_path)

        table_replacements = process_tables(doc, flat_json)
        paragraph_replacements = process_paragraphs(doc, flat_json)
        total_replacements = table_replacements + paragraph_replacements

        doc.save(output_path)
        print(f"\n✅ Document saved as: {output_path}")
        print(f"✅ Total replacements: {total_replacements} ({table_replacements} in tables, {paragraph_replacements} in paragraphs)")

    except FileNotFoundError as e:
        print(f"❌ File not found: {e}")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()
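
One non-obvious property of flatten_json above: it stores every value twice, once under its dotted path and once under the bare key, and the bare-key entry keeps raw nested dicts. That second entry is what lets handle_vehicle_registration_table find the nested vehicle section inside the "flat" dict. A small illustration, assuming the flatten_json defined above:

data = {"Audit Information": {"Auditor name": ["Jodie Jones"]}}
flat = flatten_json(data)
# flat now holds three entries:
#   "Audit Information.Auditor name" -> ["Jodie Jones"]  (dotted path)
#   "Auditor name"                   -> ["Jodie Jones"]  (bare key)
#   "Audit Information"              -> the raw nested dict itself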