Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from typing import List, Dict, Optional | |
| import os | |
| import logging | |
| from functools import lru_cache | |
| logger = logging.getLogger(__name__) | |
class CameroonData:
    """Process-wide singleton holding the clinical summaries table.

    The first construction reads ``clinical_summaries.csv`` (two directory
    levels above this module's directory) into ``self.df``.  Every later
    construction returns the same object without reloading.  If the file is
    missing or cannot be read, ``self.df`` is an empty ``DataFrame``.
    """

    _instance = None

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Read by __init__ so the CSV is loaded only once even though
            # Python calls __init__ on every ``CameroonData()`` expression.
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Load the data on first construction; do nothing afterwards."""
        if self._initialized:
            return
        self._initialized = True
        self.df = None
        self._load_data()

    def _load_data(self):
        """Read the clinical CSV into ``self.df``.

        Never raises: a missing file or any read error is logged and leaves
        ``self.df`` as an empty ``DataFrame``.
        """
        try:
            csv_path = os.path.join(
                os.path.dirname(__file__), '../../clinical_summaries.csv'
            )
            if not os.path.exists(csv_path):
                logger.warning("Clinical data file not found at %s", csv_path)
                self.df = pd.DataFrame()
                return
            # Every column is read as text so the search can treat all
            # populated cells uniformly as strings.
            self.df = pd.read_csv(csv_path, dtype=str)
            logger.info("Loaded %s clinical cases", len(self.df))
        except Exception as e:
            logger.error("Error loading clinical data: %s", e)
            self.df = pd.DataFrame()

    def search_similar_cases(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return up to ``top_k`` rows containing any of the query terms.

        Matching is a case-insensitive substring test of each query term
        against the space-joined string cells of a row.  Query terms of one
        or two characters are ignored.  Rows are returned in table order.

        Args:
            query: Search query string; split on whitespace into terms.
            top_k: Maximum number of results to return.  Zero or a negative
                value yields an empty list.

        Returns:
            List of matching rows as dictionaries (column name -> value).
            Empty when no data is loaded, the query has no usable terms,
            nothing matches, or an error occurs while searching.
        """
        if self.df is None or self.df.empty:
            logger.warning("No clinical data available for search")
            return []
        if not query or not query.strip():
            return []
        try:
            # The limit is only tested after a row has been appended, so
            # without this guard a non-positive top_k would still yield one
            # result.  Kept inside the try so an invalid top_k is swallowed
            # like any other search error.
            if top_k <= 0:
                return []
            # Lowercase once up front; very short terms are ignored.
            query_terms = [
                term.lower() for term in query.split() if len(term) > 2
            ]
            if not query_terms:
                return []
            results = []
            for _, row in self.df.iterrows():
                # Only string cells take part; missing values are not str
                # instances and are therefore skipped.
                row_text = ' '.join(
                    value for value in row.tolist() if isinstance(value, str)
                ).lower()
                if any(term in row_text for term in query_terms):
                    results.append(row.to_dict())
                    if len(results) >= top_k:
                        break
            return results
        except Exception as e:
            logger.error("Error in search_similar_cases: %s", e)
            return []
# Module-level cache for the shared CameroonData object
_cameroon_data_instance = None


def get_cameroon_data() -> Optional[CameroonData]:
    """
    Return the cached CameroonData object, building it on first use.

    Yields None when construction raises or when the resulting table is
    missing or empty; in that case nothing is cached and the next call
    tries again.
    """
    global _cameroon_data_instance
    # Fast path: a usable instance was already cached.
    if _cameroon_data_instance is not None:
        return _cameroon_data_instance

    try:
        candidate = CameroonData()
        frame = candidate.df
        # An instance without rows counts as a failed load.
        if frame is None or frame.empty:
            logger.error("Failed to load clinical data")
            candidate = None
    except Exception as exc:
        logger.error(f"Error initializing CameroonData: {str(exc)}")
        candidate = None

    _cameroon_data_instance = candidate
    return _cameroon_data_instance