Spaces:

moazx
/

HBV_AI_Assistant

Running

File size: 7,522 Bytes

73c6377

"""
Text Parser Module
Parses free-form text input to extract structured patient data
"""
import re
import logging
from typing import Dict, Any, Optional
from .config import get_llm

logger = logging.getLogger(__name__)


def _extract_json_object(response_text: str) -> Dict[str, Any]:
    """
    Pull the JSON object out of a raw LLM reply

    Args:
        response_text: Text returned by the LLM, possibly with prose or
            code fences around the JSON object

    Returns:
        The decoded JSON object

    Raises:
        ValueError: If the reply contains no braces or the braced text is
            not valid JSON (json.JSONDecodeError is a ValueError subclass)
    """
    import json

    json_start = response_text.find('{')
    json_end = response_text.rfind('}') + 1
    if json_start < 0 or json_end <= json_start:
        raise ValueError("No JSON found in response")

    try:
        # Common case: everything between the first '{' and the last '}'
        # is the object itself.
        return json.loads(response_text[json_start:json_end])
    except json.JSONDecodeError as first_error:
        # The reply may carry brace-containing text after the object, which
        # makes the slice above invalid. raw_decode stops at the end of the
        # first complete JSON document and ignores whatever follows it.
        try:
            parsed, _ = json.JSONDecoder().raw_decode(response_text[json_start:])
        except json.JSONDecodeError:
            # Report the error from the primary strategy; it describes the
            # text the caller is most likely to inspect.
            raise first_error
        return parsed


def parse_patient_text(text_input: str) -> Dict[str, Any]:
    """
    Parse free-form text input to extract structured patient data
    using LLM-based extraction

    The text is embedded in an extraction prompt, sent to the LLM returned by
    get_llm(), and the JSON object found in the reply is decoded and returned.
    The result is NOT validated here.

    Args:
        text_input: Free-form text containing patient data

    Returns:
        Dictionary decoded from the LLM's JSON reply (the prompt asks for the
        fields of the HBVPatientInput schema)

    Raises:
        ValueError: If text_input is empty/blank or not a string, or if no
            valid JSON object can be extracted from the LLM reply
        Exception: Any error raised by get_llm() or the LLM call is logged
            and re-raised unchanged
    """
    # Reject blank input before calling the LLM: the prompt tells the model to
    # fall back to defaults, so an empty text would yield invented patient data.
    if not isinstance(text_input, str) or not text_input.strip():
        raise ValueError("Patient text input is empty")

    try:
        # Create prompt for LLM to extract structured data
        extraction_prompt = f"""You are a medical data extraction system. Extract structured patient data from the following free-form text.

PATIENT TEXT:
{text_input}

Extract the following information and return it as a JSON object. If a field is not mentioned, use reasonable defaults:

Required fields:
- sex: "Male" or "Female"
- age: integer (years)
- pregnancy_status: "Not pregnant" or "Pregnant"
- hbsag_status: "Positive" or "Negative"
- duration_hbsag_months: integer (months HBsAg has been positive)
- hbv_dna_level: float (IU/mL)
- hbeag_status: "Positive" or "Negative"
- alt_level: float (U/L)
- fibrosis_stage: "F0-F1", "F2-F3", or "F4"
- necroinflammatory_activity: "A0", "A1", "A2", or "A3"
- extrahepatic_manifestations: true or false
- immunosuppression_status: "None", "Chemotherapy", or "Other"
- coinfections: array of strings (e.g., ["HIV"], ["HCV"], ["HDV"], or [])
- family_history_cirrhosis_hcc: true or false
- other_comorbidities: array of strings or null

IMPORTANT EXTRACTION RULES:
1. For sex: Look for "male", "female", "man", "woman", etc.
2. For age: Extract the number followed by "year" or "years old"
3. For pregnancy_status: Default to "Not pregnant" unless explicitly mentioned
4. For HBsAg status: Look for "HBsAg positive" or "HBsAg negative"
5. For duration_hbsag_months: Look for duration in months or years (convert years to months)
6. For HBV DNA: Look for numbers followed by "IU/mL" or "IU/ml"
7. For HBeAg: Look for "HBeAg positive" or "HBeAg negative"
8. For ALT: Look for "ALT" followed by number and "U/L"
9. For fibrosis: Look for "F0", "F1", "F2", "F3", "F4" or descriptions like "significant fibrosis", "cirrhosis"
10. For necroinflammatory: Look for "A0", "A1", "A2", "A3"
11. For extrahepatic manifestations: Look for mentions of extrahepatic conditions
12. For immunosuppression: Look for "immunosuppression", "chemotherapy", etc.
13. For coinfections: Look for "HIV", "HCV", "HDV"
14. For family history: Look for "family history" of "cirrhosis" or "HCC"

DEFAULT VALUES (use if not mentioned):
- pregnancy_status: "Not pregnant"
- immunosuppression_status: "None"
- coinfections: []
- extrahepatic_manifestations: false
- family_history_cirrhosis_hcc: false
- other_comorbidities: null

Return ONLY a valid JSON object with the extracted data. Do not include any explanatory text.

Example format:
{{
  "sex": "Male",
  "age": 45,
  "pregnancy_status": "Not pregnant",
  "hbsag_status": "Positive",
  "duration_hbsag_months": 12,
  "hbv_dna_level": 5000.0,
  "hbeag_status": "Positive",
  "alt_level": 80.0,
  "fibrosis_stage": "F2-F3",
  "necroinflammatory_activity": "A2",
  "extrahepatic_manifestations": false,
  "immunosuppression_status": "None",
  "coinfections": [],
  "family_history_cirrhosis_hcc": false,
  "other_comorbidities": null
}}
"""

        # Get LLM response
        llm = get_llm()
        logger.info("Sending text extraction prompt to LLM...")
        response = llm.invoke(extraction_prompt)
        logger.info("LLM response received for text extraction")

        # Some LLM wrappers return a message object with .content, others a
        # plain value; fall back to str() for the latter.
        response_text = response.content if hasattr(response, 'content') else str(response)

        # NOTE(review): this logs the full reply, which contains patient data;
        # confirm that INFO-level logging of it is acceptable in deployment.
        logger.info("Text extraction response: %s", response_text)

        # Try to parse JSON from response
        try:
            patient_data = _extract_json_object(response_text)
        except ValueError as e:
            logger.error("Failed to parse LLM response as JSON: %s", e)
            logger.error("Response text: %s", response_text)
            # Chain the cause so the original decode error stays in the traceback.
            raise ValueError(f"Failed to extract structured data from text: {str(e)}") from e

        logger.info("✅ Successfully extracted patient data from text")
        return patient_data

    except Exception as e:
        # Boundary log for any failure (LLM construction, invocation, parsing);
        # the exception itself is propagated unchanged.
        logger.error("Error in parse_patient_text: %s", e)
        raise


def _coerce_flag(value: Any) -> bool:
    """
    Interpret a JSON-style truth value as a bool

    Strings are matched case-insensitively against common true/false
    spellings because bool("false") is True in Python; any other type
    falls back to ordinary truthiness.

    Raises:
        ValueError: If a string is not a recognised true/false spelling
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ('true', 'yes', 'y', '1'):
            return True
        if token in ('false', 'no', 'n', '0', ''):
            return False
        raise ValueError(f"cannot interpret {value!r} as a boolean")
    return bool(value)


def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean extracted patient data

    The dictionary is normalised IN PLACE (defaults filled in, numeric and
    boolean fields coerced) and the same object is returned.

    Args:
        data: Extracted patient data dictionary

    Returns:
        Validated and cleaned patient data (the same dict that was passed in)

    Raises:
        ValueError: If a required field is missing, a numeric/boolean field
            cannot be converted, a numeric field is negative or not finite,
            or an enumerated field holds a value outside its allowed set
    """
    import math

    # Ensure required fields are present
    required_fields = (
        'sex', 'age', 'pregnancy_status', 'hbsag_status',
        'duration_hbsag_months', 'hbv_dna_level', 'hbeag_status',
        'alt_level', 'fibrosis_stage', 'necroinflammatory_activity',
        'extrahepatic_manifestations', 'family_history_cirrhosis_hcc',
    )
    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing required field: {field}")

    # Set defaults for optional fields. A missing key and an explicit
    # null/empty value are treated alike for the first two.
    if not data.get('immunosuppression_status'):
        data['immunosuppression_status'] = 'None'
    if data.get('coinfections') is None:
        data['coinfections'] = []
    data.setdefault('other_comorbidities', None)

    # Validate data types and values
    try:
        data['age'] = int(data['age'])
        data['duration_hbsag_months'] = int(data['duration_hbsag_months'])
        data['hbv_dna_level'] = float(data['hbv_dna_level'])
        data['alt_level'] = float(data['alt_level'])
        data['extrahepatic_manifestations'] = _coerce_flag(data['extrahepatic_manifestations'])
        data['family_history_cirrhosis_hcc'] = _coerce_flag(data['family_history_cirrhosis_hcc'])
    except (ValueError, TypeError, OverflowError) as e:
        # OverflowError: int() of an infinite float (json accepts "Infinity").
        raise ValueError(f"Invalid data type in extracted data: {str(e)}") from e

    # Ages, durations and lab values cannot be negative, NaN or infinite.
    for field in ('age', 'duration_hbsag_months', 'hbv_dna_level', 'alt_level'):
        value = data[field]
        if not math.isfinite(value) or value < 0:
            raise ValueError(f"Invalid {field} value: {value}")

    # Validate enum values
    allowed_values = (
        ('sex', ('Male', 'Female')),
        ('pregnancy_status', ('Not pregnant', 'Pregnant')),
        ('hbsag_status', ('Positive', 'Negative')),
        ('hbeag_status', ('Positive', 'Negative')),
        ('fibrosis_stage', ('F0-F1', 'F2-F3', 'F4')),
        ('necroinflammatory_activity', ('A0', 'A1', 'A2', 'A3')),
    )
    for field, choices in allowed_values:
        if data[field] not in choices:
            raise ValueError(f"Invalid {field} value: {data[field]}")

    return data