# HBV_AI_Assistant/core/text_parser.py
# Commit 73c6377 (moazx): Initial commit with all files including LFS
"""
Text Parser Module
Parses free-form text input to extract structured patient data
"""
import re
import logging
from typing import Dict, Any, Optional
from .config import get_llm
logger = logging.getLogger(__name__)
def _extract_json_object(response_text: str) -> Dict[str, Any]:
    """Decode the first JSON object embedded in *response_text*.

    Decoding starts at the first ``{`` and stops at the end of that object,
    so any text the model emits before or after the object (markdown fences,
    trailing commentary, even commentary that itself contains braces) is
    ignored.

    Args:
        response_text: Raw text returned by the LLM.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{`` or the text starting there
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    import json  # function-scope import: json is not imported at file level

    json_start = response_text.find('{')
    if json_start < 0:
        raise ValueError("No JSON found in response")
    # raw_decode parses exactly one JSON value and tolerates trailing data,
    # unlike slicing up to the last '}' which breaks on any later brace.
    patient_data, _ = json.JSONDecoder().raw_decode(response_text, json_start)
    return patient_data


def parse_patient_text(text_input: str) -> Dict[str, Any]:
    """
    Parse free-form text input to extract structured patient data
    using LLM-based extraction.

    The text is embedded in an extraction prompt, sent to the LLM returned by
    ``get_llm()``, and the first JSON object found in the reply is returned.
    The result is returned as decoded; no schema validation is applied here.

    Args:
        text_input: Free-form text containing patient data

    Returns:
        Dictionary decoded from the LLM reply, intended to match the
        HBVPatientInput schema described in the prompt

    Raises:
        ValueError: If no JSON object can be decoded from the LLM reply.
        Exception: Any error raised while obtaining or invoking the LLM is
            logged and re-raised unchanged.
    """
    try:
        # Create prompt for LLM to extract structured data
        extraction_prompt = f"""You are a medical data extraction system. Extract structured patient data from the following free-form text.
PATIENT TEXT:
{text_input}
Extract the following information and return it as a JSON object. If a field is not mentioned, use reasonable defaults:
Required fields:
- sex: "Male" or "Female"
- age: integer (years)
- pregnancy_status: "Not pregnant" or "Pregnant"
- hbsag_status: "Positive" or "Negative"
- duration_hbsag_months: integer (months HBsAg has been positive)
- hbv_dna_level: float (IU/mL)
- hbeag_status: "Positive" or "Negative"
- alt_level: float (U/L)
- fibrosis_stage: "F0-F1", "F2-F3", or "F4"
- necroinflammatory_activity: "A0", "A1", "A2", or "A3"
- extrahepatic_manifestations: true or false
- immunosuppression_status: "None", "Chemotherapy", or "Other"
- coinfections: array of strings (e.g., ["HIV"], ["HCV"], ["HDV"], or [])
- family_history_cirrhosis_hcc: true or false
- other_comorbidities: array of strings or null
IMPORTANT EXTRACTION RULES:
1. For sex: Look for "male", "female", "man", "woman", etc.
2. For age: Extract the number followed by "year" or "years old"
3. For pregnancy_status: Default to "Not pregnant" unless explicitly mentioned
4. For HBsAg status: Look for "HBsAg positive" or "HBsAg negative"
5. For duration_hbsag_months: Look for duration in months or years (convert years to months)
6. For HBV DNA: Look for numbers followed by "IU/mL" or "IU/ml"
7. For HBeAg: Look for "HBeAg positive" or "HBeAg negative"
8. For ALT: Look for "ALT" followed by number and "U/L"
9. For fibrosis: Look for "F0", "F1", "F2", "F3", "F4" or descriptions like "significant fibrosis", "cirrhosis"
10. For necroinflammatory: Look for "A0", "A1", "A2", "A3"
11. For extrahepatic manifestations: Look for mentions of extrahepatic conditions
12. For immunosuppression: Look for "immunosuppression", "chemotherapy", etc.
13. For coinfections: Look for "HIV", "HCV", "HDV"
14. For family history: Look for "family history" of "cirrhosis" or "HCC"
DEFAULT VALUES (use if not mentioned):
- pregnancy_status: "Not pregnant"
- immunosuppression_status: "None"
- coinfections: []
- extrahepatic_manifestations: false
- family_history_cirrhosis_hcc: false
- other_comorbidities: null
Return ONLY a valid JSON object with the extracted data. Do not include any explanatory text.
Example format:
{{
"sex": "Male",
"age": 45,
"pregnancy_status": "Not pregnant",
"hbsag_status": "Positive",
"duration_hbsag_months": 12,
"hbv_dna_level": 5000.0,
"hbeag_status": "Positive",
"alt_level": 80.0,
"fibrosis_stage": "F2-F3",
"necroinflammatory_activity": "A2",
"extrahepatic_manifestations": false,
"immunosuppression_status": "None",
"coinfections": [],
"family_history_cirrhosis_hcc": false,
"other_comorbidities": null
}}
"""
        # Get LLM response
        llm = get_llm()
        logger.info("Sending text extraction prompt to LLM...")
        response = llm.invoke(extraction_prompt)
        logger.info("LLM response received for text extraction")

        # Some LLM wrappers return a message object exposing `.content`,
        # others a plain value; fall back to str() for the latter.
        response_text = response.content if hasattr(response, 'content') else str(response)

        # Log the response
        logger.info(f"Text extraction response: {response_text}")

        # Try to parse JSON from response
        try:
            patient_data = _extract_json_object(response_text)
        except ValueError as e:
            logger.error(f"Failed to parse LLM response as JSON: {e}")
            logger.error(f"Response text: {response_text}")
            # Chain the cause so the underlying decode error stays visible.
            raise ValueError(f"Failed to extract structured data from text: {str(e)}") from e

        logger.info("✅ Successfully extracted patient data from text")
        return patient_data
    except Exception as e:
        # Boundary log for any failure in this function, then propagate.
        logger.error(f"Error in parse_patient_text: {str(e)}")
        raise
def _coerce_bool(value: Any) -> bool:
    """Interpret *value* as a boolean without treating "false" as truthy.

    Strings are matched case-insensitively against common true/false
    spellings; an unrecognised string raises ``ValueError``. Every other
    value falls back to ``bool(value)``.
    """
    if isinstance(value, str):
        normalized = value.strip().lower()
        if normalized in ('true', 'yes', 'y', '1'):
            return True
        if normalized in ('false', 'no', 'n', '0', ''):
            return False
        raise ValueError(f"cannot interpret {value!r} as a boolean")
    return bool(value)


def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean extracted patient data.

    The dictionary is modified in place (defaults filled in, numeric and
    boolean fields coerced) and the same object is returned.

    Args:
        data: Extracted patient data dictionary

    Returns:
        Validated and cleaned patient data (the same dict object as *data*)

    Raises:
        ValueError: If a required field is missing, a numeric or boolean
            field cannot be converted, or an enumerated field holds a value
            outside its allowed set.
    """
    # Ensure required fields are present
    required_fields = [
        'sex', 'age', 'pregnancy_status', 'hbsag_status',
        'duration_hbsag_months', 'hbv_dna_level', 'hbeag_status',
        'alt_level', 'fibrosis_stage', 'necroinflammatory_activity',
        'extrahepatic_manifestations', 'family_history_cirrhosis_hcc'
    ]
    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing required field: {field}")

    # Set defaults for optional fields
    if not data.get('immunosuppression_status'):
        data['immunosuppression_status'] = 'None'
    # An explicit null is treated like a missing key so the field is
    # always a list.
    if data.get('coinfections') is None:
        data['coinfections'] = []
    if 'other_comorbidities' not in data:
        data['other_comorbidities'] = None

    # Validate data types and values
    try:
        data['age'] = int(data['age'])
        data['duration_hbsag_months'] = int(data['duration_hbsag_months'])
        data['hbv_dna_level'] = float(data['hbv_dna_level'])
        data['alt_level'] = float(data['alt_level'])
        # bool("false") is True, so string flags are parsed explicitly.
        data['extrahepatic_manifestations'] = _coerce_bool(data['extrahepatic_manifestations'])
        data['family_history_cirrhosis_hcc'] = _coerce_bool(data['family_history_cirrhosis_hcc'])
    except (ValueError, TypeError) as e:
        raise ValueError(f"Invalid data type in extracted data: {str(e)}") from e

    # Validate enum values. Tuples (not sets) keep the membership test safe
    # for unhashable values such as lists.
    allowed_values = (
        ('sex', ('Male', 'Female')),
        ('pregnancy_status', ('Not pregnant', 'Pregnant')),
        ('hbsag_status', ('Positive', 'Negative')),
        ('hbeag_status', ('Positive', 'Negative')),
        ('fibrosis_stage', ('F0-F1', 'F2-F3', 'F4')),
        ('necroinflammatory_activity', ('A0', 'A1', 'A2', 'A3')),
    )
    for field, allowed in allowed_values:
        if data[field] not in allowed:
            raise ValueError(f"Invalid {field} value: {data[field]}")

    return data