Spaces:
Running
Running
File size: 7,522 Bytes
73c6377 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
"""
Text Parser Module
Parses free-form text input to extract structured patient data
"""
import re
import logging
from typing import Dict, Any, Optional
from .config import get_llm
logger = logging.getLogger(__name__)
def parse_patient_text(text_input: str) -> Dict[str, Any]:
    """
    Parse free-form text input to extract structured patient data
    using LLM-based extraction.

    The text is embedded in an extraction prompt, the prompt is sent to the
    LLM returned by ``get_llm()``, and the first JSON object found in the
    reply is decoded and returned. The decoded data is returned as-is; no
    field validation happens in this function.

    Args:
        text_input: Free-form text containing patient data

    Returns:
        Dictionary decoded from the JSON object in the LLM reply. The prompt
        asks for the HBVPatientInput fields.

    Raises:
        ValueError: If the reply contains no ``{`` or the text starting at
            the first ``{`` is not a valid JSON object.
        Exception: Any other error (e.g. from the LLM call) is logged and
            re-raised unchanged.
    """
    # Imported locally, as the original block did, so the block does not
    # depend on a module-level ``json`` import.
    import json

    try:
        # Create prompt for LLM to extract structured data.
        # The prompt lines sit at column 0 on purpose: they are part of the
        # string sent to the model. ``{{`` / ``}}`` are literal braces.
        extraction_prompt = f"""You are a medical data extraction system. Extract structured patient data from the following free-form text.
PATIENT TEXT:
{text_input}
Extract the following information and return it as a JSON object. If a field is not mentioned, use reasonable defaults:
Required fields:
- sex: "Male" or "Female"
- age: integer (years)
- pregnancy_status: "Not pregnant" or "Pregnant"
- hbsag_status: "Positive" or "Negative"
- duration_hbsag_months: integer (months HBsAg has been positive)
- hbv_dna_level: float (IU/mL)
- hbeag_status: "Positive" or "Negative"
- alt_level: float (U/L)
- fibrosis_stage: "F0-F1", "F2-F3", or "F4"
- necroinflammatory_activity: "A0", "A1", "A2", or "A3"
- extrahepatic_manifestations: true or false
- immunosuppression_status: "None", "Chemotherapy", or "Other"
- coinfections: array of strings (e.g., ["HIV"], ["HCV"], ["HDV"], or [])
- family_history_cirrhosis_hcc: true or false
- other_comorbidities: array of strings or null
IMPORTANT EXTRACTION RULES:
1. For sex: Look for "male", "female", "man", "woman", etc.
2. For age: Extract the number followed by "year" or "years old"
3. For pregnancy_status: Default to "Not pregnant" unless explicitly mentioned
4. For HBsAg status: Look for "HBsAg positive" or "HBsAg negative"
5. For duration_hbsag_months: Look for duration in months or years (convert years to months)
6. For HBV DNA: Look for numbers followed by "IU/mL" or "IU/ml"
7. For HBeAg: Look for "HBeAg positive" or "HBeAg negative"
8. For ALT: Look for "ALT" followed by number and "U/L"
9. For fibrosis: Look for "F0", "F1", "F2", "F3", "F4" or descriptions like "significant fibrosis", "cirrhosis"
10. For necroinflammatory: Look for "A0", "A1", "A2", "A3"
11. For extrahepatic manifestations: Look for mentions of extrahepatic conditions
12. For immunosuppression: Look for "immunosuppression", "chemotherapy", etc.
13. For coinfections: Look for "HIV", "HCV", "HDV"
14. For family history: Look for "family history" of "cirrhosis" or "HCC"
DEFAULT VALUES (use if not mentioned):
- pregnancy_status: "Not pregnant"
- immunosuppression_status: "None"
- coinfections: []
- extrahepatic_manifestations: false
- family_history_cirrhosis_hcc: false
- other_comorbidities: null
Return ONLY a valid JSON object with the extracted data. Do not include any explanatory text.
Example format:
{{
"sex": "Male",
"age": 45,
"pregnancy_status": "Not pregnant",
"hbsag_status": "Positive",
"duration_hbsag_months": 12,
"hbv_dna_level": 5000.0,
"hbeag_status": "Positive",
"alt_level": 80.0,
"fibrosis_stage": "F2-F3",
"necroinflammatory_activity": "A2",
"extrahepatic_manifestations": false,
"immunosuppression_status": "None",
"coinfections": [],
"family_history_cirrhosis_hcc": false,
"other_comorbidities": null
}}
"""
        # Get LLM response
        llm = get_llm()
        logger.info("Sending text extraction prompt to LLM...")
        response = llm.invoke(extraction_prompt)
        logger.info("LLM response received for text extraction")

        # Some LLM wrappers return a message object with ``.content``;
        # anything else is stringified.
        response_text = response.content if hasattr(response, 'content') else str(response)

        # NOTE(review): this writes the raw model reply to the log, and the
        # reply may echo patient details from the prompt -- confirm that is
        # acceptable for this deployment's log retention.
        logger.info(f"Text extraction response: {response_text}")

        try:
            json_start = response_text.find('{')
            if json_start < 0:
                raise ValueError("No JSON found in response")
            # raw_decode parses exactly one JSON value starting at
            # json_start and ignores whatever follows it, so trailing
            # commentary (even commentary containing '}') no longer breaks
            # extraction the way a first-'{'-to-last-'}' slice did.
            patient_data, _ = json.JSONDecoder().raw_decode(response_text, json_start)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass, so this covers
            # both "no JSON at all" and "malformed JSON".
            logger.error(f"Failed to parse LLM response as JSON: {e}")
            logger.error(f"Response text: {response_text}")
            # Chain the cause so the original decode error stays in the traceback.
            raise ValueError(f"Failed to extract structured data from text: {str(e)}") from e

        logger.info("✅ Successfully extracted patient data from text")
        return patient_data
    except Exception as e:
        # Boundary log for any failure in this function; the error itself
        # is propagated unchanged.
        logger.error(f"Error in parse_patient_text: {str(e)}")
        raise
def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate and clean extracted patient data

    The dictionary is normalised in place (defaults filled in, numeric and
    boolean fields coerced) and the same object is returned.

    Args:
        data: Extracted patient data dictionary

    Returns:
        Validated and cleaned patient data (the same ``data`` object)

    Raises:
        ValueError: If a required field is missing, a numeric or boolean
            field cannot be converted, or an enumerated field holds a value
            outside its allowed set.
    """
    # Ensure required fields are present
    required_fields = [
        'sex', 'age', 'pregnancy_status', 'hbsag_status',
        'duration_hbsag_months', 'hbv_dna_level', 'hbeag_status',
        'alt_level', 'fibrosis_stage', 'necroinflammatory_activity',
        'extrahepatic_manifestations', 'family_history_cirrhosis_hcc'
    ]
    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing required field: {field}")

    # Set defaults for optional fields
    if 'immunosuppression_status' not in data or not data['immunosuppression_status']:
        data['immunosuppression_status'] = 'None'
    # An explicit null for coinfections is treated like a missing key so the
    # field is always a list after validation.
    if data.get('coinfections') is None:
        data['coinfections'] = []
    if 'other_comorbidities' not in data:
        data['other_comorbidities'] = None

    # Validate data types and values
    try:
        data['age'] = int(data['age'])
        data['duration_hbsag_months'] = int(data['duration_hbsag_months'])
        data['hbv_dna_level'] = float(data['hbv_dna_level'])
        data['alt_level'] = float(data['alt_level'])
        # Plain bool() would turn the strings "false" / "no" into True, so
        # string values are interpreted by _coerce_bool instead.
        data['extrahepatic_manifestations'] = _coerce_bool(
            data['extrahepatic_manifestations'], 'extrahepatic_manifestations'
        )
        data['family_history_cirrhosis_hcc'] = _coerce_bool(
            data['family_history_cirrhosis_hcc'], 'family_history_cirrhosis_hcc'
        )
    except (ValueError, TypeError) as e:
        raise ValueError(f"Invalid data type in extracted data: {str(e)}") from e

    # Validate enum values
    if data['sex'] not in ['Male', 'Female']:
        raise ValueError(f"Invalid sex value: {data['sex']}")
    if data['pregnancy_status'] not in ['Not pregnant', 'Pregnant']:
        raise ValueError(f"Invalid pregnancy_status value: {data['pregnancy_status']}")
    if data['hbsag_status'] not in ['Positive', 'Negative']:
        raise ValueError(f"Invalid hbsag_status value: {data['hbsag_status']}")
    if data['hbeag_status'] not in ['Positive', 'Negative']:
        raise ValueError(f"Invalid hbeag_status value: {data['hbeag_status']}")
    if data['fibrosis_stage'] not in ['F0-F1', 'F2-F3', 'F4']:
        raise ValueError(f"Invalid fibrosis_stage value: {data['fibrosis_stage']}")
    if data['necroinflammatory_activity'] not in ['A0', 'A1', 'A2', 'A3']:
        raise ValueError(f"Invalid necroinflammatory_activity value: {data['necroinflammatory_activity']}")

    return data


def _coerce_bool(value: Any, field: str) -> bool:
    """Convert ``value`` to a bool, interpreting common true/false strings.

    Non-string values keep plain ``bool()`` semantics. Strings are compared
    case-insensitively after stripping whitespace; an unrecognised string
    raises ``ValueError`` naming ``field``.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ('true', 'yes', 'y', '1'):
            return True
        if token in ('false', 'no', 'n', '0', '', 'none', 'null'):
            return False
        raise ValueError(f"{field} is not a recognisable boolean: {value!r}")
    return bool(value)
|