""" Text Parser Module Parses free-form text input to extract structured patient data """ import re import logging from typing import Dict, Any, Optional from .config import get_llm logger = logging.getLogger(__name__) def parse_patient_text(text_input: str) -> Dict[str, Any]: """ Parse free-form text input to extract structured patient data using LLM-based extraction Args: text_input: Free-form text containing patient data Returns: Dictionary containing structured patient data matching HBVPatientInput schema """ try: # Create prompt for LLM to extract structured data extraction_prompt = f"""You are a medical data extraction system. Extract structured patient data from the following free-form text. PATIENT TEXT: {text_input} Extract the following information and return it as a JSON object. If a field is not mentioned, use reasonable defaults: Required fields: - sex: "Male" or "Female" - age: integer (years) - pregnancy_status: "Not pregnant" or "Pregnant" - hbsag_status: "Positive" or "Negative" - duration_hbsag_months: integer (months HBsAg has been positive) - hbv_dna_level: float (IU/mL) - hbeag_status: "Positive" or "Negative" - alt_level: float (U/L) - fibrosis_stage: "F0-F1", "F2-F3", or "F4" - necroinflammatory_activity: "A0", "A1", "A2", or "A3" - extrahepatic_manifestations: true or false - immunosuppression_status: "None", "Chemotherapy", or "Other" - coinfections: array of strings (e.g., ["HIV"], ["HCV"], ["HDV"], or []) - family_history_cirrhosis_hcc: true or false - other_comorbidities: array of strings or null IMPORTANT EXTRACTION RULES: 1. For sex: Look for "male", "female", "man", "woman", etc. 2. For age: Extract the number followed by "year" or "years old" 3. For pregnancy_status: Default to "Not pregnant" unless explicitly mentioned 4. For HBsAg status: Look for "HBsAg positive" or "HBsAg negative" 5. For duration_hbsag_months: Look for duration in months or years (convert years to months) 6. For HBV DNA: Look for numbers followed by "IU/mL" or "IU/ml" 7. For HBeAg: Look for "HBeAg positive" or "HBeAg negative" 8. For ALT: Look for "ALT" followed by number and "U/L" 9. For fibrosis: Look for "F0", "F1", "F2", "F3", "F4" or descriptions like "significant fibrosis", "cirrhosis" 10. For necroinflammatory: Look for "A0", "A1", "A2", "A3" 11. For extrahepatic manifestations: Look for mentions of extrahepatic conditions 12. For immunosuppression: Look for "immunosuppression", "chemotherapy", etc. 13. For coinfections: Look for "HIV", "HCV", "HDV" 14. For family history: Look for "family history" of "cirrhosis" or "HCC" DEFAULT VALUES (use if not mentioned): - pregnancy_status: "Not pregnant" - immunosuppression_status: "None" - coinfections: [] - extrahepatic_manifestations: false - family_history_cirrhosis_hcc: false - other_comorbidities: null Return ONLY a valid JSON object with the extracted data. Do not include any explanatory text. Example format: {{ "sex": "Male", "age": 45, "pregnancy_status": "Not pregnant", "hbsag_status": "Positive", "duration_hbsag_months": 12, "hbv_dna_level": 5000.0, "hbeag_status": "Positive", "alt_level": 80.0, "fibrosis_stage": "F2-F3", "necroinflammatory_activity": "A2", "extrahepatic_manifestations": false, "immunosuppression_status": "None", "coinfections": [], "family_history_cirrhosis_hcc": false, "other_comorbidities": null }} """ # Get LLM response llm = get_llm() logger.info("Sending text extraction prompt to LLM...") response = llm.invoke(extraction_prompt) logger.info("LLM response received for text extraction") # Extract JSON from response response_text = response.content if hasattr(response, 'content') else str(response) # Log the response logger.info(f"Text extraction response: {response_text}") # Try to parse JSON from response import json try: # Find JSON in response json_start = response_text.find('{') json_end = response_text.rfind('}') + 1 if json_start >= 0 and json_end > json_start: json_str = response_text[json_start:json_end] patient_data = json.loads(json_str) logger.info(f"✅ Successfully extracted patient data from text") return patient_data else: raise ValueError("No JSON found in response") except (json.JSONDecodeError, ValueError) as e: logger.error(f"Failed to parse LLM response as JSON: {e}") logger.error(f"Response text: {response_text}") raise ValueError(f"Failed to extract structured data from text: {str(e)}") except Exception as e: logger.error(f"Error in parse_patient_text: {str(e)}") raise def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]: """ Validate and clean extracted patient data Args: data: Extracted patient data dictionary Returns: Validated and cleaned patient data """ # Ensure required fields are present required_fields = [ 'sex', 'age', 'pregnancy_status', 'hbsag_status', 'duration_hbsag_months', 'hbv_dna_level', 'hbeag_status', 'alt_level', 'fibrosis_stage', 'necroinflammatory_activity', 'extrahepatic_manifestations', 'family_history_cirrhosis_hcc' ] for field in required_fields: if field not in data: raise ValueError(f"Missing required field: {field}") # Set defaults for optional fields if 'immunosuppression_status' not in data or not data['immunosuppression_status']: data['immunosuppression_status'] = 'None' if 'coinfections' not in data: data['coinfections'] = [] if 'other_comorbidities' not in data: data['other_comorbidities'] = None # Validate data types and values try: data['age'] = int(data['age']) data['duration_hbsag_months'] = int(data['duration_hbsag_months']) data['hbv_dna_level'] = float(data['hbv_dna_level']) data['alt_level'] = float(data['alt_level']) data['extrahepatic_manifestations'] = bool(data['extrahepatic_manifestations']) data['family_history_cirrhosis_hcc'] = bool(data['family_history_cirrhosis_hcc']) except (ValueError, TypeError) as e: raise ValueError(f"Invalid data type in extracted data: {str(e)}") # Validate enum values if data['sex'] not in ['Male', 'Female']: raise ValueError(f"Invalid sex value: {data['sex']}") if data['pregnancy_status'] not in ['Not pregnant', 'Pregnant']: raise ValueError(f"Invalid pregnancy_status value: {data['pregnancy_status']}") if data['hbsag_status'] not in ['Positive', 'Negative']: raise ValueError(f"Invalid hbsag_status value: {data['hbsag_status']}") if data['hbeag_status'] not in ['Positive', 'Negative']: raise ValueError(f"Invalid hbeag_status value: {data['hbeag_status']}") if data['fibrosis_stage'] not in ['F0-F1', 'F2-F3', 'F4']: raise ValueError(f"Invalid fibrosis_stage value: {data['fibrosis_stage']}") if data['necroinflammatory_activity'] not in ['A0', 'A1', 'A2', 'A3']: raise ValueError(f"Invalid necroinflammatory_activity value: {data['necroinflammatory_activity']}") return data