File size: 7,522 Bytes
73c6377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
Text Parser Module
Parses free-form text input to extract structured patient data
"""
import re
import logging
from typing import Dict, Any, Optional
from .config import get_llm

logger = logging.getLogger(__name__)


def _extract_json_object(response_text: str) -> Dict[str, Any]:
    """
    Decode the first JSON object embedded in an LLM response

    Decoding starts at the first '{' and stops at the end of that object, so
    prose (or stray braces) following the JSON does not break parsing.

    Args:
        response_text: Raw text returned by the LLM

    Returns:
        The decoded JSON object

    Raises:
        ValueError: If the text contains no '{' or the object starting there
            is not valid JSON (json.JSONDecodeError is a ValueError subclass)
    """
    import json

    json_start = response_text.find('{')
    if json_start < 0:
        raise ValueError("No JSON found in response")

    # raw_decode ignores whatever follows the object, unlike json.loads on a
    # first-'{'-to-last-'}' slice, which fails when trailing text has braces.
    patient_data, _ = json.JSONDecoder().raw_decode(response_text, json_start)
    return patient_data


def parse_patient_text(text_input: str) -> Dict[str, Any]:
    """
    Parse free-form text input to extract structured patient data
    using LLM-based extraction

    Args:
        text_input: Free-form text containing patient data

    Returns:
        Dictionary decoded from the LLM's JSON reply. It is intended to match
        the HBVPatientInput schema but is NOT validated here.

    Raises:
        ValueError: If text_input is empty/blank, or no JSON object could be
            decoded from the LLM response
        Exception: Any error raised while obtaining or invoking the LLM is
            logged and re-raised unchanged
    """
    try:
        # An empty prompt can only yield fabricated values, so refuse early
        # instead of spending an LLM call on it.
        if not isinstance(text_input, str) or not text_input.strip():
            raise ValueError("No patient text provided for extraction")

        # Create prompt for LLM to extract structured data
        extraction_prompt = f"""You are a medical data extraction system. Extract structured patient data from the following free-form text.

PATIENT TEXT:
{text_input}

Extract the following information and return it as a JSON object. If a field is not mentioned, use reasonable defaults:

Required fields:
- sex: "Male" or "Female"
- age: integer (years)
- pregnancy_status: "Not pregnant" or "Pregnant"
- hbsag_status: "Positive" or "Negative"
- duration_hbsag_months: integer (months HBsAg has been positive)
- hbv_dna_level: float (IU/mL)
- hbeag_status: "Positive" or "Negative"
- alt_level: float (U/L)
- fibrosis_stage: "F0-F1", "F2-F3", or "F4"
- necroinflammatory_activity: "A0", "A1", "A2", or "A3"
- extrahepatic_manifestations: true or false
- immunosuppression_status: "None", "Chemotherapy", or "Other"
- coinfections: array of strings (e.g., ["HIV"], ["HCV"], ["HDV"], or [])
- family_history_cirrhosis_hcc: true or false
- other_comorbidities: array of strings or null

IMPORTANT EXTRACTION RULES:
1. For sex: Look for "male", "female", "man", "woman", etc.
2. For age: Extract the number followed by "year" or "years old"
3. For pregnancy_status: Default to "Not pregnant" unless explicitly mentioned
4. For HBsAg status: Look for "HBsAg positive" or "HBsAg negative"
5. For duration_hbsag_months: Look for duration in months or years (convert years to months)
6. For HBV DNA: Look for numbers followed by "IU/mL" or "IU/ml"
7. For HBeAg: Look for "HBeAg positive" or "HBeAg negative"
8. For ALT: Look for "ALT" followed by number and "U/L"
9. For fibrosis: Look for "F0", "F1", "F2", "F3", "F4" or descriptions like "significant fibrosis", "cirrhosis"
10. For necroinflammatory: Look for "A0", "A1", "A2", "A3"
11. For extrahepatic manifestations: Look for mentions of extrahepatic conditions
12. For immunosuppression: Look for "immunosuppression", "chemotherapy", etc.
13. For coinfections: Look for "HIV", "HCV", "HDV"
14. For family history: Look for "family history" of "cirrhosis" or "HCC"

DEFAULT VALUES (use if not mentioned):
- pregnancy_status: "Not pregnant"
- immunosuppression_status: "None"
- coinfections: []
- extrahepatic_manifestations: false
- family_history_cirrhosis_hcc: false
- other_comorbidities: null

Return ONLY a valid JSON object with the extracted data. Do not include any explanatory text.

Example format:
{{
  "sex": "Male",
  "age": 45,
  "pregnancy_status": "Not pregnant",
  "hbsag_status": "Positive",
  "duration_hbsag_months": 12,
  "hbv_dna_level": 5000.0,
  "hbeag_status": "Positive",
  "alt_level": 80.0,
  "fibrosis_stage": "F2-F3",
  "necroinflammatory_activity": "A2",
  "extrahepatic_manifestations": false,
  "immunosuppression_status": "None",
  "coinfections": [],
  "family_history_cirrhosis_hcc": false,
  "other_comorbidities": null
}}
"""

        # Get LLM response
        llm = get_llm()
        logger.info("Sending text extraction prompt to LLM...")
        response = llm.invoke(extraction_prompt)
        logger.info("LLM response received for text extraction")

        # Some LLM wrappers return a message object with .content, others a
        # plain value; fall back to str() for the latter.
        response_text = response.content if hasattr(response, 'content') else str(response)

        # NOTE(review): this logs the full model output, which may contain
        # patient data - confirm that is acceptable for this deployment.
        logger.info(f"Text extraction response: {response_text}")

        # Try to parse JSON from response
        try:
            patient_data = _extract_json_object(response_text)
        except ValueError as e:
            logger.error(f"Failed to parse LLM response as JSON: {e}")
            logger.error(f"Response text: {response_text}")
            # Chain the cause so the decode position stays in the traceback.
            raise ValueError(f"Failed to extract structured data from text: {str(e)}") from e

        logger.info("✅ Successfully extracted patient data from text")
        return patient_data

    except Exception as e:
        # Top-level boundary: record the failure, then let the caller decide.
        logger.error(f"Error in parse_patient_text: {str(e)}")
        raise


# String spellings accepted for boolean fields (compared case-insensitively).
_TRUE_STRINGS = frozenset({'true', 'yes', 'y', '1'})
_FALSE_STRINGS = frozenset({'false', 'no', 'n', '0', ''})


def _coerce_bool(value: Any) -> bool:
    """
    Convert a loosely-typed flag into a real bool

    Plain bool() treats every non-empty string as True, so the string "false"
    would become True. This helper interprets common string spellings instead.

    Args:
        value: A bool, number, None, or a string such as "true" / "no"

    Returns:
        The boolean the value represents (None and "" count as False)

    Raises:
        ValueError: If a string is not a recognised boolean spelling
        TypeError: If the value is of an unsupported type
    """
    if isinstance(value, bool):
        return value
    if value is None:
        return False
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _TRUE_STRINGS:
            return True
        if token in _FALSE_STRINGS:
            return False
        raise ValueError(f"cannot interpret {value!r} as a boolean")
    if isinstance(value, (int, float)):
        return bool(value)
    raise TypeError(f"cannot interpret {type(value).__name__} as a boolean")


def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean extracted patient data

    The dictionary is normalised in place (defaults filled in, numeric and
    boolean fields coerced) and the same object is returned.

    Args:
        data: Extracted patient data dictionary

    Returns:
        Validated and cleaned patient data

    Raises:
        ValueError: If a required field is missing, a value cannot be coerced
            to its expected type, or an enumerated field holds an unknown value
    """
    # Ensure required fields are present
    required_fields = [
        'sex', 'age', 'pregnancy_status', 'hbsag_status',
        'duration_hbsag_months', 'hbv_dna_level', 'hbeag_status',
        'alt_level', 'fibrosis_stage', 'necroinflammatory_activity',
        'extrahepatic_manifestations', 'family_history_cirrhosis_hcc'
    ]

    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing required field: {field}")

    # Set defaults for optional fields
    if 'immunosuppression_status' not in data or not data['immunosuppression_status']:
        data['immunosuppression_status'] = 'None'

    # A JSON null for coinfections is treated like a missing key.
    if data.get('coinfections') is None:
        data['coinfections'] = []

    if 'other_comorbidities' not in data:
        data['other_comorbidities'] = None

    # Validate data types and values
    try:
        data['age'] = int(data['age'])
        data['duration_hbsag_months'] = int(data['duration_hbsag_months'])
        data['hbv_dna_level'] = float(data['hbv_dna_level'])
        data['alt_level'] = float(data['alt_level'])
        data['extrahepatic_manifestations'] = _coerce_bool(data['extrahepatic_manifestations'])
        data['family_history_cirrhosis_hcc'] = _coerce_bool(data['family_history_cirrhosis_hcc'])
    except (ValueError, TypeError) as e:
        raise ValueError(f"Invalid data type in extracted data: {str(e)}") from e

    # Validate enum values (checked in this order; first failure is reported)
    allowed_values = (
        ('sex', ('Male', 'Female')),
        ('pregnancy_status', ('Not pregnant', 'Pregnant')),
        ('hbsag_status', ('Positive', 'Negative')),
        ('hbeag_status', ('Positive', 'Negative')),
        ('fibrosis_stage', ('F0-F1', 'F2-F3', 'F4')),
        ('necroinflammatory_activity', ('A0', 'A1', 'A2', 'A3')),
    )
    for field, choices in allowed_values:
        if data[field] not in choices:
            raise ValueError(f"Invalid {field} value: {data[field]}")

    return data