Spaces:

moazx
/

HBV_AI_Assistant

Running

App Files Files Community

HBV_AI_Assistant / core /text_parser.py

moazx

Initial commit with all files including LFS

73c6377 12 days ago

raw

history blame contribute delete

7.52 kB

	"""
	Text Parser Module
	Parses free-form text input to extract structured patient data
	"""
	import re
	import logging
	from typing import Dict, Any, Optional
	from .config import get_llm

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	def parse_patient_text(text_input: str) -> Dict[str, Any]:
	    """
	    Parse free-form text input to extract structured patient data
	    using LLM-based extraction.

	    The text is embedded in an extraction prompt, sent to the LLM returned
	    by ``get_llm()``, and the JSON object found in the reply is decoded and
	    returned as-is. No schema validation happens here.

	    Args:
	        text_input: Free-form text containing patient data

	    Returns:
	        Dictionary decoded from the JSON object in the LLM reply. The prompt
	        asks for the HBVPatientInput fields, but the content is not checked
	        by this function.

	    Raises:
	        ValueError: If ``text_input`` is not a non-blank string, or if no
	            JSON object can be decoded from the LLM reply.
	        Exception: Any error raised while obtaining or invoking the LLM is
	            logged and re-raised unchanged.
	    """
	    # Reject blank input before calling the LLM: the prompt tells the model
	    # to fall back to defaults, so an empty text could come back as a
	    # plausible-looking but entirely invented patient record.
	    if not isinstance(text_input, str) or not text_input.strip():
	        raise ValueError("Patient text is empty; nothing to extract")

	    try:
	        # Create prompt for LLM to extract structured data
	        extraction_prompt = f"""You are a medical data extraction system. Extract structured patient data from the following free-form text.

	PATIENT TEXT:
	{text_input}

	Extract the following information and return it as a JSON object. If a field is not mentioned, use reasonable defaults:

	Required fields:
	- sex: "Male" or "Female"
	- age: integer (years)
	- pregnancy_status: "Not pregnant" or "Pregnant"
	- hbsag_status: "Positive" or "Negative"
	- duration_hbsag_months: integer (months HBsAg has been positive)
	- hbv_dna_level: float (IU/mL)
	- hbeag_status: "Positive" or "Negative"
	- alt_level: float (U/L)
	- fibrosis_stage: "F0-F1", "F2-F3", or "F4"
	- necroinflammatory_activity: "A0", "A1", "A2", or "A3"
	- extrahepatic_manifestations: true or false
	- immunosuppression_status: "None", "Chemotherapy", or "Other"
	- coinfections: array of strings (e.g., ["HIV"], ["HCV"], ["HDV"], or [])
	- family_history_cirrhosis_hcc: true or false
	- other_comorbidities: array of strings or null

	IMPORTANT EXTRACTION RULES:
	1. For sex: Look for "male", "female", "man", "woman", etc.
	2. For age: Extract the number followed by "year" or "years old"
	3. For pregnancy_status: Default to "Not pregnant" unless explicitly mentioned
	4. For HBsAg status: Look for "HBsAg positive" or "HBsAg negative"
	5. For duration_hbsag_months: Look for duration in months or years (convert years to months)
	6. For HBV DNA: Look for numbers followed by "IU/mL" or "IU/ml"
	7. For HBeAg: Look for "HBeAg positive" or "HBeAg negative"
	8. For ALT: Look for "ALT" followed by number and "U/L"
	9. For fibrosis: Look for "F0", "F1", "F2", "F3", "F4" or descriptions like "significant fibrosis", "cirrhosis"
	10. For necroinflammatory: Look for "A0", "A1", "A2", "A3"
	11. For extrahepatic manifestations: Look for mentions of extrahepatic conditions
	12. For immunosuppression: Look for "immunosuppression", "chemotherapy", etc.
	13. For coinfections: Look for "HIV", "HCV", "HDV"
	14. For family history: Look for "family history" of "cirrhosis" or "HCC"

	DEFAULT VALUES (use if not mentioned):
	- pregnancy_status: "Not pregnant"
	- immunosuppression_status: "None"
	- coinfections: []
	- extrahepatic_manifestations: false
	- family_history_cirrhosis_hcc: false
	- other_comorbidities: null

	Return ONLY a valid JSON object with the extracted data. Do not include any explanatory text.

	Example format:
	{{
	"sex": "Male",
	"age": 45,
	"pregnancy_status": "Not pregnant",
	"hbsag_status": "Positive",
	"duration_hbsag_months": 12,
	"hbv_dna_level": 5000.0,
	"hbeag_status": "Positive",
	"alt_level": 80.0,
	"fibrosis_stage": "F2-F3",
	"necroinflammatory_activity": "A2",
	"extrahepatic_manifestations": false,
	"immunosuppression_status": "None",
	"coinfections": [],
	"family_history_cirrhosis_hcc": false,
	"other_comorbidities": null
	}}
	"""

	        # Get LLM response
	        llm = get_llm()
	        logger.info("Sending text extraction prompt to LLM...")
	        response = llm.invoke(extraction_prompt)
	        logger.info("LLM response received for text extraction")

	        # Message-like responses expose the text on `.content`; anything
	        # else is stringified.
	        response_text = response.content if hasattr(response, 'content') else str(response)

	        # NOTE(review): the reply echoes extracted patient details, so this
	        # line writes patient data to the INFO log - confirm that is intended.
	        logger.info("Text extraction response: %s", response_text)

	        try:
	            patient_data = _extract_json_object(response_text)
	        except ValueError as e:
	            # json.JSONDecodeError is a ValueError subclass, so this also
	            # covers malformed JSON, not only the "no JSON" case.
	            logger.error("Failed to parse LLM response as JSON: %s", e)
	            logger.error("Response text: %s", response_text)
	            # Chain the cause so the decoder's position info stays in the
	            # traceback.
	            raise ValueError(f"Failed to extract structured data from text: {e}") from e

	        logger.info("✅ Successfully extracted patient data from text")
	        return patient_data

	    except Exception as e:
	        # Boundary log: record the failure, then let the caller handle it.
	        logger.error("Error in parse_patient_text: %s", e)
	        raise


	def _extract_json_object(response_text: str) -> Dict[str, Any]:
	    """Decode the span from the first '{' to the last '}' of *response_text*.

	    Raises:
	        ValueError: If no such span exists ("No JSON found in response") or
	            the span is not valid JSON (json.JSONDecodeError).
	    """
	    import json

	    json_start = response_text.find('{')
	    json_end = response_text.rfind('}') + 1
	    if json_start < 0 or json_end <= json_start:
	        raise ValueError("No JSON found in response")
	    return json.loads(response_text[json_start:json_end])


	def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]:
	    """
	    Validate and clean extracted patient data.

	    The dictionary is normalised in place (defaults filled in, numeric and
	    boolean fields coerced) and the same object is returned.

	    Args:
	        data: Extracted patient data dictionary

	    Returns:
	        The same dictionary, validated and cleaned

	    Raises:
	        ValueError: If a required field is missing, a numeric or boolean
	            field cannot be converted, or an enumerated field holds a value
	            outside its allowed set.
	    """
	    # Ensure required fields are present
	    required_fields = (
	        'sex', 'age', 'pregnancy_status', 'hbsag_status',
	        'duration_hbsag_months', 'hbv_dna_level', 'hbeag_status',
	        'alt_level', 'fibrosis_stage', 'necroinflammatory_activity',
	        'extrahepatic_manifestations', 'family_history_cirrhosis_hcc',
	    )
	    for field in required_fields:
	        if field not in data:
	            raise ValueError(f"Missing required field: {field}")

	    # Set defaults for optional fields. A missing, null or empty
	    # immunosuppression status becomes the literal string 'None'.
	    if not data.get('immunosuppression_status'):
	        data['immunosuppression_status'] = 'None'

	    # Treat an explicit null like a missing key so the field is always a list.
	    if data.get('coinfections') is None:
	        data['coinfections'] = []

	    data.setdefault('other_comorbidities', None)

	    # Validate data types and values
	    try:
	        data['age'] = int(data['age'])
	        data['duration_hbsag_months'] = int(data['duration_hbsag_months'])
	        data['hbv_dna_level'] = float(data['hbv_dna_level'])
	        data['alt_level'] = float(data['alt_level'])
	        # Plain bool() would turn the string "false" into True.
	        data['extrahepatic_manifestations'] = _coerce_bool(
	            data['extrahepatic_manifestations']
	        )
	        data['family_history_cirrhosis_hcc'] = _coerce_bool(
	            data['family_history_cirrhosis_hcc']
	        )
	    except (ValueError, TypeError) as e:
	        raise ValueError(f"Invalid data type in extracted data: {e}") from e

	    # Validate enum values. Tuples (not sets) so an unhashable value such as
	    # a list is reported as invalid instead of raising TypeError.
	    allowed_values = {
	        'sex': ('Male', 'Female'),
	        'pregnancy_status': ('Not pregnant', 'Pregnant'),
	        'hbsag_status': ('Positive', 'Negative'),
	        'hbeag_status': ('Positive', 'Negative'),
	        'fibrosis_stage': ('F0-F1', 'F2-F3', 'F4'),
	        'necroinflammatory_activity': ('A0', 'A1', 'A2', 'A3'),
	    }
	    for field, allowed in allowed_values.items():
	        if data[field] not in allowed:
	            raise ValueError(f"Invalid {field} value: {data[field]}")

	    return data


	def _coerce_bool(value: Any) -> bool:
	    """Interpret *value* as a boolean, parsing textual flags like "false".

	    Strings are matched case-insensitively against true/yes/1 and
	    false/no/0/empty; any other string raises ValueError. Non-string values
	    fall back to ordinary truthiness.
	    """
	    if isinstance(value, str):
	        token = value.strip().lower()
	        if token in ('true', 'yes', '1'):
	            return True
	        if token in ('false', 'no', '0', ''):
	            return False
	        raise ValueError(f"cannot interpret {value!r} as a boolean")
	    return bool(value)