# HBV_AI_Assistant/core/medical_terminology.py
# Uploaded by: moazx
# Commit: Initial commit with all files including LFS (73c6377)
"""
Medical Terminology Module for HBV (Hepatitis B Virus)
This module provides intelligent handling of HBV medical linguistic variability including:
- Synonyms and alternate terms
- Abbreviations and acronyms (with context awareness)
- Regional spelling variations (US/UK/International)
- Specialty-specific terminology
- Dynamic learning from corpus (placeholder hook; extraction is not yet implemented)
"""
import re
import json
from typing import List, Dict, Set, Tuple, Optional
from collections import defaultdict
from pathlib import Path
from .config import logger
# ============================================================================
# CORE HBV MEDICAL TERMINOLOGY MAPPINGS
# ============================================================================
# Common HBV medical abbreviations mapped to their expansions.
# Keys are lower-case abbreviations; each value lists the spelled-out forms,
# with the preferred (canonical) expansion first.
MEDICAL_ABBREVIATIONS = {
    # HBV Terminology
    "hbv": ["hepatitis b virus", "hepatitis b"],
    "hbsag": ["hepatitis b surface antigen", "hbs antigen"],
    "hbeag": ["hepatitis b e antigen", "hbe antigen"],
    "hbcag": ["hepatitis b core antigen"],
    "anti-hbs": ["antibody to hepatitis b surface antigen", "anti-hbs antibody"],
    "anti-hbe": ["antibody to hepatitis b e antigen"],
    "anti-hbc": ["antibody to hepatitis b core antigen"],
    "hbv dna": ["hepatitis b virus dna", "hbv viral load"],
    # Liver Disease Terms
    "alt": ["alanine aminotransferase", "alanine transaminase", "sgpt"],
    "ast": ["aspartate aminotransferase", "aspartate transaminase", "sgot"],
    "alp": ["alkaline phosphatase"],
    "ggt": ["gamma-glutamyl transferase", "gamma glutamyl transpeptidase"],
    "inr": ["international normalized ratio"],
    "pt": ["prothrombin time"],
    "apri": ["ast to platelet ratio index"],
    "fib-4": ["fibrosis-4 index"],
    # Fibrosis Staging
    "f0": ["no fibrosis"],
    "f1": ["mild fibrosis", "portal fibrosis"],
    "f2": ["moderate fibrosis"],
    "f3": ["severe fibrosis", "advanced fibrosis"],
    "f4": ["cirrhosis", "liver cirrhosis"],
    # Necroinflammatory Activity
    "a0": ["no activity"],
    "a1": ["mild activity"],
    "a2": ["moderate activity"],
    "a3": ["severe activity"],
    # Treatment Terms
    "etv": ["entecavir"],
    "tdf": ["tenofovir disoproxil fumarate", "tenofovir df"],
    "taf": ["tenofovir alafenamide"],
    "lam": ["lamivudine", "3tc"],
    "adv": ["adefovir", "adefovir dipivoxil"],
    # Telbivudine is abbreviated LdT (or TBV). "LDV" conventionally denotes
    # ledipasvir, an HCV drug, so it must not expand to telbivudine.
    "ldt": ["telbivudine"],
    "tbv": ["telbivudine"],
    "peg-ifn": ["pegylated interferon", "peginterferon"],
    "ifn": ["interferon"],
    # Complications
    "hcc": ["hepatocellular carcinoma", "liver cancer"],
    "dc": ["decompensated cirrhosis"],
    "cc": ["compensated cirrhosis"],
    "esld": ["end-stage liver disease"],
    "alf": ["acute liver failure"],
    "aclf": ["acute-on-chronic liver failure"],
    # Coinfections
    "hiv": ["human immunodeficiency virus"],
    "hcv": ["hepatitis c virus", "hepatitis c"],
    "hdv": ["hepatitis d virus", "hepatitis delta"],
    "hav": ["hepatitis a virus", "hepatitis a"],
    # Clinical Terms
    "uln": ["upper limit of normal"],
    "iu/ml": ["international units per milliliter"],
    "log": ["logarithm", "log10"],
    "svr": ["sustained virological response"],
    "vr": ["virological response"],
    "br": ["biochemical response"],
    "sr": ["serological response"],
}
# Synonym table: canonical lower-case term -> alternate phrasings that should
# be treated as equivalent when expanding or normalizing a term.
MEDICAL_SYNONYMS = {
    # --- HBV terminology ---
    "hepatitis b": ["hbv", "hepatitis b virus", "hep b", "hbv infection"],
    "chronic hepatitis b": ["chb", "chronic hbv", "chronic hbv infection"],
    "acute hepatitis b": ["ahb", "acute hbv"],
    "hbv dna": ["viral load", "hbv viral load", "serum hbv dna"],

    # --- Serological markers ---
    "hbsag positive": ["hbsag+", "hbs antigen positive"],
    "hbeag positive": ["hbeag+", "hbe antigen positive"],
    "hbsag negative": ["hbsag-", "hbs antigen negative"],
    "hbeag negative": ["hbeag-", "hbe antigen negative"],

    # --- Liver disease stages ---
    "cirrhosis": ["f4", "liver cirrhosis", "hepatic cirrhosis"],
    "fibrosis": ["liver fibrosis", "hepatic fibrosis"],
    # NOTE(review): Child-Pugh B is often classed as decompensated disease;
    # confirm with a clinician that it belongs under "compensated".
    "compensated cirrhosis": ["cc", "child-pugh a", "child-pugh b"],
    "decompensated cirrhosis": ["dc", "child-pugh c"],

    # --- Treatment terms ---
    "antiviral therapy": [
        "antiviral treatment",
        "nucleos(t)ide analogue",
        "na therapy",
    ],
    "entecavir": ["etv", "baraclude"],
    "tenofovir": ["tdf", "taf", "viread", "vemlidy"],
    "interferon": ["ifn", "pegylated interferon", "peg-ifn"],

    # --- Clinical outcomes ---
    "treatment response": ["virological response", "biochemical response"],
    "viral suppression": ["undetectable hbv dna", "hbv dna < lloq"],
    "alt normalization": ["alt normal", "alt within normal limits"],

    # --- Complications ---
    "hepatocellular carcinoma": ["hcc", "liver cancer", "primary liver cancer"],
    "liver failure": ["hepatic failure", "end-stage liver disease", "esld"],
    # NOTE(review): these look like manifestations of portal hypertension
    # rather than strict synonyms; presumably intentional for search recall.
    "portal hypertension": ["esophageal varices", "ascites", "splenomegaly"],

    # --- Special populations ---
    "pregnant women": ["pregnancy", "pregnant patients"],
    "immunosuppressed": ["immunocompromised", "on immunosuppression"],
    "coinfection": ["co-infection", "dual infection"],
}
# Regional spelling table (US/UK/International): canonical spelling -> list of
# alternate spellings that normalize to it. The keys here use the UK form.
SPELLING_VARIATIONS = {
    # Identity entries: these terms are spelled the same in every region.
    "fibrosis": ["fibrosis"],
    "cirrhosis": ["cirrhosis"],
    # UK canonical form -> US variant.
    "anaemia": ["anemia"],
    "haemorrhage": ["hemorrhage"],
    "oesophageal": ["esophageal"],
}
# Context hint -> extra terms appended to a query when that context is given
# and the term is not already present in the query text.
CONTEXT_PREFERENCES = {
    "treatment": [
        "antiviral",
        "therapy",
        "regimen",
        "medication",
    ],
    "diagnosis": [
        "hbsag",
        "hbeag",
        "hbv dna",
        "serology",
    ],
    "monitoring": [
        "alt",
        "hbv dna",
        "liver function",
        "fibrosis",
    ],
    "complications": [
        "hcc",
        "cirrhosis",
        "decompensation",
        "liver failure",
    ],
}
# ============================================================================
# DYNAMIC TERMINOLOGY LEARNING
# ============================================================================
class MedicalTerminologyExpander:
    """
    Expand and normalize HBV medical terminology.

    Holds per-instance copies of the module-level abbreviation, synonym and
    spelling tables and uses them to expand search queries, map terms to a
    canonical form, and list related terms. A corpus path may be supplied to
    trigger the (currently placeholder) learning hook.
    """

    def __init__(self, corpus_path: Optional[Path] = None):
        """
        Initialize with optional corpus for dynamic learning.

        Args:
            corpus_path: Optional path to a corpus. When given and it exists
                on disk, the learning hook is invoked with it.
        """
        # Copy the inner lists too, so that changes made to one instance's
        # tables never leak into the module-level constants or other instances.
        self.abbreviations = {k: list(v) for k, v in MEDICAL_ABBREVIATIONS.items()}
        self.synonyms = {k: list(v) for k, v in MEDICAL_SYNONYMS.items()}
        self.spelling_vars = {k: list(v) for k, v in SPELLING_VARIATIONS.items()}
        self.learned_terms = defaultdict(set)
        if corpus_path:
            # Accept plain strings as well as Path objects.
            corpus_path = Path(corpus_path)
            if corpus_path.exists():
                self._learn_from_corpus(corpus_path)

    @staticmethod
    def _term_pattern(term: str):
        """
        Compile a regex matching *term* only as a whole term.

        Lookarounds are used instead of ``\\b`` so that terms beginning or
        ending with punctuation (e.g. ``hbsag+``, ``iu/ml``) still match.
        """
        return re.compile(r"(?<!\w)" + re.escape(term) + r"(?!\w)")

    def _expand_with(self, query_lower: str, mapping: Dict[str, List[str]]) -> List[str]:
        """Return variants of *query_lower* with each matching key of *mapping* replaced."""
        variants = []
        for term, replacements in mapping.items():
            if not term:
                continue
            pattern = self._term_pattern(term)
            # Whole-term match: "pt" must not fire inside "symptoms", nor
            # "alt" inside "alternative".
            if not pattern.search(query_lower):
                continue
            for replacement in replacements:
                # Skip replacements already present in the query; substituting
                # them would only yield doubled phrases.
                if self._term_pattern(replacement).search(query_lower):
                    continue
                # A callable replacement keeps backslashes in *replacement*
                # from being read as regex group references.
                variants.append(pattern.sub(lambda _m, r=replacement: r, query_lower))
        return variants

    def expand_query(self, query: str, context: Optional[str] = None) -> List[str]:
        """
        Expand a query with medical synonyms and abbreviations.

        Matching is done on the lower-cased query and only on whole terms.
        Generated variants are lower-case; the original query is always the
        first element and keeps its casing.

        Args:
            query: Original query string
            context: Optional context hint (e.g., 'treatment', 'diagnosis')
        Returns:
            List of expanded query variations, de-duplicated, in the order
            they were generated
        """
        query_lower = query.lower()
        expansions = [query]
        # Abbreviation expansions first, then synonym substitutions.
        expansions.extend(self._expand_with(query_lower, self.abbreviations))
        expansions.extend(self._expand_with(query_lower, self.synonyms))
        # Add context-specific preferences
        if context and context in CONTEXT_PREFERENCES:
            for pref_term in CONTEXT_PREFERENCES[context]:
                if pref_term not in query_lower:
                    expansions.append(f"{query} {pref_term}")
        # Remove duplicates while preserving order
        return list(dict.fromkeys(expansions))

    def normalize_term(self, term: str) -> str:
        """
        Normalize a medical term to its canonical form.

        Lookup order: abbreviation (first expansion wins), synonym table,
        then spelling variations. Lookup is case-insensitive and ignores
        surrounding whitespace.

        Args:
            term: Medical term to normalize
        Returns:
            Normalized canonical form, or *term* unchanged when it is unknown
        """
        term_lower = term.lower().strip()
        # Check if it's an abbreviation
        expansions = self.abbreviations.get(term_lower)
        if expansions:
            return expansions[0]
        # Check if it's a synonym (or already the canonical term)
        for canonical, synonyms in self.synonyms.items():
            if term_lower == canonical or term_lower in synonyms:
                return canonical
        # Check spelling variations (the canonical spelling counts as a match)
        for canonical, variations in self.spelling_vars.items():
            if term_lower == canonical or term_lower in variations:
                return canonical
        return term

    def _learn_from_corpus(self, corpus_path: Path):
        """
        Learn new terminology patterns from corpus.

        Placeholder: currently only logs the corpus path; no terms are
        extracted. Failures are logged as warnings rather than raised.
        """
        try:
            # Implementation for dynamic learning from HBV guidelines
            logger.info(f"Learning terminology from corpus: {corpus_path}")
            # This would analyze the corpus and extract new term relationships
        except Exception as e:
            logger.warning(f"Could not learn from corpus: {e}")

    def get_related_terms(self, term: str, max_terms: int = 5) -> List[str]:
        """
        Get related medical terms for a given term.

        The result order is deterministic: for every synonym group the term
        belongs to, the canonical term then its synonyms (in table order),
        followed by the term's abbreviation expansions.

        Args:
            term: Medical term
            max_terms: Maximum number of related terms to return; values
                below zero are treated as zero
        Returns:
            List of related terms, excluding the term itself
        """
        term_lower = term.lower().strip()
        # A dict is used as an insertion-ordered set so the truncated result
        # is stable across runs (set iteration order is not).
        related: Dict[str, None] = {}
        # Find synonyms
        for canonical, synonyms in self.synonyms.items():
            if term_lower == canonical or term_lower in synonyms:
                related[canonical] = None
                related.update(dict.fromkeys(synonyms))
        # Find abbreviations
        related.update(dict.fromkeys(self.abbreviations.get(term_lower, ())))
        # Remove the original term
        related.pop(term_lower, None)
        return list(related)[:max(max_terms, 0)]
# Module-wide expander, created lazily on first request and then reused.
_global_expander = None


def get_terminology_expander() -> MedicalTerminologyExpander:
    """Return the shared terminology expander, building it on first use."""
    global _global_expander
    if _global_expander is not None:
        return _global_expander
    # First call: construct the expander with the default tables (no corpus).
    _global_expander = MedicalTerminologyExpander()
    return _global_expander