Spaces:
Running
Running
| """ | |
| Text Parser Module | |
| Parses free-form text input to extract structured patient data | |
| """ | |
| import re | |
| import logging | |
| from typing import Dict, Any, Optional | |
| from .config import get_llm | |
| logger = logging.getLogger(__name__) | |
def parse_patient_text(text_input: str) -> Dict[str, Any]:
    """
    Parse free-form text input to extract structured patient data
    using LLM-based extraction.

    The text is embedded in an extraction prompt, sent to the LLM returned by
    ``get_llm()``, and the first JSON object found in the reply is decoded.
    The decoded dictionary is returned as-is; this function does not check
    field presence, types or allowed values (see ``validate_extracted_data``).

    Args:
        text_input: Free-form text containing patient data

    Returns:
        Dictionary decoded from the LLM's JSON reply, using the field names
        requested in the prompt

    Raises:
        ValueError: If ``text_input`` is empty/blank, or if no JSON object can
            be decoded from the LLM reply.
        Exception: Any error raised by ``get_llm()`` or the LLM call is logged
            and re-raised unchanged.
    """
    # Local import: the module's top-level imports do not include json.
    import json

    try:
        # Refuse blank input up front: the prompt tells the model to fall back
        # to defaults, so an empty text would yield an invented patient record.
        if text_input is None or not str(text_input).strip():
            raise ValueError("No patient text provided for extraction")

        # Create prompt for LLM to extract structured data.
        # NOTE(review): the prompt asks for "reasonable defaults" even for
        # clinical measurements (e.g. hbv_dna_level) - confirm this is intended.
        extraction_prompt = f"""You are a medical data extraction system. Extract structured patient data from the following free-form text.
PATIENT TEXT:
{text_input}
Extract the following information and return it as a JSON object. If a field is not mentioned, use reasonable defaults:
Required fields:
- sex: "Male" or "Female"
- age: integer (years)
- pregnancy_status: "Not pregnant" or "Pregnant"
- hbsag_status: "Positive" or "Negative"
- duration_hbsag_months: integer (months HBsAg has been positive)
- hbv_dna_level: float (IU/mL)
- hbeag_status: "Positive" or "Negative"
- alt_level: float (U/L)
- fibrosis_stage: "F0-F1", "F2-F3", or "F4"
- necroinflammatory_activity: "A0", "A1", "A2", or "A3"
- extrahepatic_manifestations: true or false
- immunosuppression_status: "None", "Chemotherapy", or "Other"
- coinfections: array of strings (e.g., ["HIV"], ["HCV"], ["HDV"], or [])
- family_history_cirrhosis_hcc: true or false
- other_comorbidities: array of strings or null
IMPORTANT EXTRACTION RULES:
1. For sex: Look for "male", "female", "man", "woman", etc.
2. For age: Extract the number followed by "year" or "years old"
3. For pregnancy_status: Default to "Not pregnant" unless explicitly mentioned
4. For HBsAg status: Look for "HBsAg positive" or "HBsAg negative"
5. For duration_hbsag_months: Look for duration in months or years (convert years to months)
6. For HBV DNA: Look for numbers followed by "IU/mL" or "IU/ml"
7. For HBeAg: Look for "HBeAg positive" or "HBeAg negative"
8. For ALT: Look for "ALT" followed by number and "U/L"
9. For fibrosis: Look for "F0", "F1", "F2", "F3", "F4" or descriptions like "significant fibrosis", "cirrhosis"
10. For necroinflammatory: Look for "A0", "A1", "A2", "A3"
11. For extrahepatic manifestations: Look for mentions of extrahepatic conditions
12. For immunosuppression: Look for "immunosuppression", "chemotherapy", etc.
13. For coinfections: Look for "HIV", "HCV", "HDV"
14. For family history: Look for "family history" of "cirrhosis" or "HCC"
DEFAULT VALUES (use if not mentioned):
- pregnancy_status: "Not pregnant"
- immunosuppression_status: "None"
- coinfections: []
- extrahepatic_manifestations: false
- family_history_cirrhosis_hcc: false
- other_comorbidities: null
Return ONLY a valid JSON object with the extracted data. Do not include any explanatory text.
Example format:
{{
"sex": "Male",
"age": 45,
"pregnancy_status": "Not pregnant",
"hbsag_status": "Positive",
"duration_hbsag_months": 12,
"hbv_dna_level": 5000.0,
"hbeag_status": "Positive",
"alt_level": 80.0,
"fibrosis_stage": "F2-F3",
"necroinflammatory_activity": "A2",
"extrahepatic_manifestations": false,
"immunosuppression_status": "None",
"coinfections": [],
"family_history_cirrhosis_hcc": false,
"other_comorbidities": null
}}
"""

        # Get LLM response
        llm = get_llm()
        logger.info("Sending text extraction prompt to LLM...")
        response = llm.invoke(extraction_prompt)
        logger.info("LLM response received for text extraction")

        # Chat-style responses expose the text on `.content`; anything else is
        # stringified.
        response_text = response.content if hasattr(response, 'content') else str(response)

        # NOTE(review): this writes the full reply, which contains patient
        # data, to the log at INFO level - confirm that is acceptable.
        logger.info("Text extraction response: %s", response_text)

        try:
            json_start = response_text.find('{')
            if json_start < 0:
                raise ValueError("No JSON found in response")
            # raw_decode parses exactly one JSON value starting at the first
            # '{' and ignores whatever follows it, so trailing commentary or a
            # closing code fence (even one containing braces) cannot break
            # the parse the way slicing up to the last '}' did.
            patient_data, _ = json.JSONDecoder().raw_decode(response_text, json_start)
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            logger.error("Failed to parse LLM response as JSON: %s", e)
            logger.error("Response text: %s", response_text)
            raise ValueError(f"Failed to extract structured data from text: {str(e)}") from e

        logger.info("✅ Successfully extracted patient data from text")
        return patient_data
    except Exception as e:
        # Boundary logging only: the error is always propagated to the caller.
        logger.error("Error in parse_patient_text: %s", e)
        raise
def _coerce_bool(value: Any) -> bool:
    """Interpret *value* as a boolean, understanding textual forms like "false"."""
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ('true', 'yes', 'y', '1'):
            return True
        if token in ('false', 'no', 'n', '0', '', 'none', 'null'):
            return False
        raise ValueError(f"cannot interpret {value!r} as a boolean")
    # Real booleans, numbers and None keep plain truthiness semantics.
    return bool(value)


def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean extracted patient data.

    The dictionary is updated in place (defaults filled in, numeric and
    boolean fields coerced) and the same object is returned.

    Args:
        data: Extracted patient data dictionary

    Returns:
        Validated and cleaned patient data (the same ``data`` object)

    Raises:
        ValueError: If a required field is missing, a numeric or boolean
            field cannot be converted, or an enumerated field holds a value
            outside its allowed set.
    """
    # Ensure required fields are present
    required_fields = [
        'sex', 'age', 'pregnancy_status', 'hbsag_status',
        'duration_hbsag_months', 'hbv_dna_level', 'hbeag_status',
        'alt_level', 'fibrosis_stage', 'necroinflammatory_activity',
        'extrahepatic_manifestations', 'family_history_cirrhosis_hcc'
    ]
    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing required field: {field}")

    # Set defaults for optional fields
    if not data.get('immunosuppression_status'):
        data['immunosuppression_status'] = 'None'
    data.setdefault('coinfections', [])
    data.setdefault('other_comorbidities', None)

    # Coerce field types. Booleans go through _coerce_bool because a plain
    # bool("false") is True, which would silently flip a clinical flag when
    # the value arrives as text instead of a JSON boolean.
    converters = (
        ('age', int),
        ('duration_hbsag_months', int),
        ('hbv_dna_level', float),
        ('alt_level', float),
        ('extrahepatic_manifestations', _coerce_bool),
        ('family_history_cirrhosis_hcc', _coerce_bool),
    )
    try:
        for field, convert in converters:
            data[field] = convert(data[field])
    except (ValueError, TypeError) as e:
        raise ValueError(f"Invalid data type in extracted data: {str(e)}") from e

    # Validate enum values. Tuples (not sets) are used so that an unhashable
    # value is reported as invalid rather than raising TypeError.
    allowed_values = (
        ('sex', ('Male', 'Female')),
        ('pregnancy_status', ('Not pregnant', 'Pregnant')),
        ('hbsag_status', ('Positive', 'Negative')),
        ('hbeag_status', ('Positive', 'Negative')),
        ('fibrosis_stage', ('F0-F1', 'F2-F3', 'F4')),
        ('necroinflammatory_activity', ('A0', 'A1', 'A2', 'A3')),
    )
    for field, choices in allowed_values:
        if data[field] not in choices:
            raise ValueError(f"Invalid {field} value: {data[field]}")

    return data