# MedDataSearchAgent
#
# Sleeping
#
# File size: 21,146 Bytes

# from smolagents import CodeAgent, DuckDuckGoSearchTool, InferenceClientModel, load_tool, tool



# import datetime
# import requests
# import pytz
# import yaml
# from tools.final_answer import FinalAnswerTool

# from Gradio_UI import GradioUI

# # Custom Tool to fetch datasets related to body parts or imaging types

# # @tool
# # def my_custom_tool(arg1: str, arg2: int) -> str:
# #     """
# #     Search and retrieve publicly available medical datasets from Hugging Face based on any medical-related keyword.

# #     Args:
# #         arg1: A keyword related to medical data (e.g., 'cancer', 'diabetes', 'CT scan', 'radiology', 'dermoscopy').
# #         arg2: The maximum number of datasets to retrieve.

# #     Returns:
# #         A list of dataset names matching the search query, or a message stating that no datasets were found.
# #     """
# #     try:
# #         keyword = arg1.strip().lower()
# #         limit = int(arg2)

# #         # Define a basic list of medically relevant terms
# #         medical_terms = [
# #             # Anatomy / Body Parts
# #             "skin", "brain", "lung", "chest", "abdomen", "spine", "bone", "heart", "liver", "kidney",
# #             "bladder", "stomach", "colon", "rectum", "esophagus", "pancreas", "breast", "ear", "eye", 
# #             "retina", "tooth", "teeth", "tongue", "jaw", "neck", "wrist", "hand", "leg", "arm", "shoulder", "pelvis",
        
# #             # Diseases / Conditions
# #             "cancer", "tumor", "stroke", "diabetes", "pneumonia", "covid", "asthma", "eczema", "melanoma",
# #             "hypertension", "alzheimer", "parkinson", "arthritis", "scoliosis", "epilepsy", "glaucoma",
# #             "ulcer", "hepatitis", "leukemia", "lymphoma", "tuberculosis", "anemia", "obesity", "depression",
# #             "anxiety", "bipolar", "autism", "adhd", "ptsd", "psychosis", "schizophrenia",
        
# #             # Imaging Modalities
# #             "mri", "ct", "xray", "x-ray", "ultrasound", "pet", "fmri", "mammo", "angiography", "radiography",
# #             "echocardiogram", "spect", "dermoscopy", "colonoscopy", "endoscopy", "biopsy", "histopathology",
        
# #             # Medical Specialties
# #             "radiology", "pathology", "oncology", "cardiology", "neurology", "dermatology", "dentistry",
# #             "ophthalmology", "urology", "orthopedics", "gastroenterology", "pulmonology", "nephrology",
# #             "psychiatry", "pediatrics", "geriatrics", "infectious disease",
        
# #             # Symptoms / Signs
# #             "lesion", "infection", "fever", "pain", "inflammation", "rash", "headache", "swelling", 
# #             "cough", "seizure", "dizziness", "vomiting", "diarrhea", "nausea", "fatigue", "itching",
        
# #             # Common Specific Diseases
# #             "breast cancer", "prostate cancer", "lung cancer", "skin cancer", "colon cancer", 
# #             "brain tumor", "liver cancer", "cervical cancer", "bladder cancer", "thyroid cancer",
        
# #             # Procedures / Interventions
# #             "surgery", "chemotherapy", "radiation", "transplant", "dialysis", "intubation", "stenting",
# #             "ventilation", "vaccination", "anesthesia", "rehabilitation", "prosthetics", "orthotics",
        
# #             # Lab Tests / Biomarkers
# #             "blood test", "cbc", "glucose", "hemoglobin", "cholesterol", "biomarker", "urinalysis",
# #             "pcr", "serology", "antibody", "antigen",
        
# #             # Clinical Settings / Roles
# #             "icu", "hospital", "emergency", "clinical notes", "nursing", "physician", "patient",
# #             "medical record", "electronic health record", "ehr", "vitals",
        
# #             # Age-based Terms
# #             "pediatric", "neonatal", "infant", "child", "adolescent", "geriatrics", "elderly",
        
# #             # Epidemiology / Public Health
# #             "epidemiology", "prevalence", "incidence", "mortality", "public health", "health disparity",
# #             "risk factor", "social determinant",
        
# #             # Pharmacology / Medications
# #             "drug", "medication", "pharmacology", "side effect", "adverse event", "dose", "tablet",
# #             "vaccine", "clinical trial", "placebo"
# #         ]


# #         # Check if keyword is in known medical terms
# #         if not any(term in keyword for term in medical_terms):
# #             return f"No medical datasets found for '{arg1}'."

# #         # Fetch datasets from Hugging Face
# #         response = requests.get(
# #             f"https://huggingface.co/api/datasets?search={keyword}&limit={limit}"
# #         )
# #         response.raise_for_status()
# #         datasets = response.json()

# #         # Return message if no datasets found
# #         if not datasets:
# #             return f"No medical datasets found for '{arg1}'."

# #         # Collect and return dataset names
# #         results = [f"- {ds.get('id', 'Unknown')}" for ds in datasets[:limit]]
# #         return f"Medical datasets related to '{arg1}':\n" + "\n".join(results)

# #     except Exception as e:
# #         return f"Error searching medical datasets for '{arg1}': {str(e)}"

# @tool
# def my_custom_tool(arg1: str, arg2: int) -> str:
#     """
#     Search and retrieve publicly available medical datasets from Hugging Face based on any medical-related keyword.

#     Args:
#         arg1: A keyword related to medical data (e.g., 'cancer', 'diabetes', 'CT scan', 'radiology', 'dermoscopy').
#         arg2: The maximum number of datasets to retrieve.

#     Returns:
#         A list of dataset names matching the search query, or a message stating that no datasets were found.
#     """
#     try:
#         keyword = arg1.strip().lower()
#         limit = int(arg2)

#         # Define a list of medical terms
#         medical_terms = [
#             "skin", "brain", "lung", "chest", "abdomen", "spine", "bone", "heart", "liver", "kidney",
#             "bladder", "stomach", "colon", "rectum", "esophagus", "pancreas", "breast", "ear", "eye",
#             "radiology", "pathology", "oncology", "cardiology", "neurology", "dermatology", "dentistry",
#             "ophthalmology", "urology", "orthopedics", "gastroenterology", "pulmonology", "nephrology",
#             "psychiatry", "pediatrics", "geriatrics", "infectious disease",
#             "mri", "ct", "xray", "x-ray", "ultrasound", "pet", "fmri", "mammo", "angiography", "radiography",
#             "cancer", "tumor", "stroke", "diabetes", "melanoma", "eczema", "asthma", "thyroid"
#         ]

#         if not any(term in keyword for term in medical_terms):
#             return f"No medical datasets found for '{arg1}'. Please try another medical term."

#         # Try online query to Hugging Face
#         try:
#             response = requests.get(
#                 f"https://huggingface.co/api/datasets?search={keyword}&limit={limit}",
#                 timeout=10
#             )
#             response.raise_for_status()
#             datasets = response.json()
#         except Exception:
#             # Network-restricted fallback
#             datasets = [{"id": f"example/{keyword}-dataset-{i+1}"} for i in range(limit)]

#         # Return formatted list
#         if not datasets:
#             return f"No datasets found for '{arg1}'."

#         results = [f"- {ds.get('id', 'Unknown')}" for ds in datasets[:limit]]
#         return f"Medical datasets related to '{arg1}':\n" + "\n".join(results)

#     except Exception as e:
#         return f"Error searching medical datasets for '{arg1}': {str(e)}"





# @tool
# def get_current_time_in_timezone(timezone: str) -> str:
#     """
#     A tool that fetches the current local time in a specified timezone.

#     Args:
#         timezone: A string representing a valid timezone (e.g., 'America/New_York').

#     Returns:
#         A string showing the current local time in the specified timezone.
#     """
#     try:
#         tz = pytz.timezone(timezone)
#         local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
#         return f"The current local time in {timezone} is: {local_time}"
#     except Exception as e:
#         return f"Error fetching time for timezone '{timezone}': {str(e)}"

# final_answer = FinalAnswerTool()

# # AI Model
# # model = InferenceClientModel(
# #     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
# #     temperature=0.5,
# #     max_output_tokens=2048  # optional, safe alternative
# # )

# model = InferenceClientModel(
#     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
#     temperature=0.5,
# )






# # Load tool from hub
# # image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# # Load prompt templates
# with open("prompts.yaml", 'r') as stream:
#     prompt_templates = yaml.safe_load(stream)

# # # Create the agent
# # agent = CodeAgent(
# #     model=model,
# #     tools=[final_answer, get_current_time_in_timezone, my_custom_tool],
# #     max_steps=6,
# #     verbosity_level=2,
# #     planning_interval=None,
# #     name=None,
# #     description=None,
# #     prompt_templates=prompt_templates
# # )

# agent = CodeAgent(
#     model=model,
#     tools=[final_answer, get_current_time_in_timezone, my_custom_tool],
#     max_steps=6,
#     verbosity_level=1,
#     planning_interval=None,
#     name="MedDataSearchAgent",
#     description=(
#         "An intelligent agent that searches Hugging Face datasets related to "
#         "medical conditions, body parts, and imaging modalities. "
#         "Use 'my_custom_tool' whenever the user requests medical data or datasets."
#     ),
#     prompt_templates=prompt_templates
# )


# # Launch the UI
# GradioUI(agent).launch()
# app.py
from smolagents import CodeAgent, InferenceClientModel, load_tool, tool
import datetime
import requests
import pytz
import yaml
from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI


# @tool
# def my_custom_tool(arg1: str, arg2: int) -> str:
#     """
#     Search and retrieve publicly available medical datasets from Hugging Face based on any medical-related keyword.

#     Args:
#         arg1: A keyword related to medical data (e.g., 'cancer', 'diabetes', 'CT scan', 'radiology', 'dermoscopy').
#         arg2: The maximum number of datasets to retrieve.

#     Returns:
#         A numbered list (top N) of dataset names matching the search query.
#     """
#     try:
#         keyword = arg1.strip().lower()
#         limit = int(arg2)

#        # Define a comprehensive list of medically relevant terms
#         medical_terms = [
#             # Anatomy / Body Parts
#             "skin", "brain", "lung", "chest", "abdomen", "spine", "bone", "heart", "liver", "kidney",
#             "bladder", "stomach", "colon", "rectum", "esophagus", "pancreas", "breast", "ear", "eye",
#             "retina", "tooth", "teeth", "tongue", "jaw", "neck", "wrist", "hand", "leg", "arm",
#             "shoulder", "pelvis",

#             # Diseases / Conditions
#             "cancer", "tumor", "stroke", "diabetes", "pneumonia", "covid", "asthma", "eczema",
#             "melanoma", "hypertension", "alzheimer", "parkinson", "arthritis", "scoliosis",
#             "epilepsy", "glaucoma", "ulcer", "hepatitis", "leukemia", "lymphoma", "tuberculosis",
#             "anemia", "obesity", "depression", "anxiety", "bipolar", "autism", "adhd", "ptsd",
#             "psychosis", "schizophrenia",

#             # Imaging Modalities
#             "mri", "ct", "xray", "x-ray", "ultrasound", "pet", "fmri", "mammo", "angiography",
#             "radiography", "echocardiogram", "spect", "dermoscopy", "colonoscopy", "endoscopy",
#             "biopsy", "histopathology",

#             # Medical Specialties
#             "radiology", "pathology", "oncology", "cardiology", "neurology", "dermatology",
#             "dentistry", "ophthalmology", "urology", "orthopedics", "gastroenterology",
#             "pulmonology", "nephrology", "psychiatry", "pediatrics", "geriatrics",
#             "infectious disease",

#             # Symptoms / Signs
#             "lesion", "infection", "fever", "pain", "inflammation", "rash", "headache", "swelling",
#             "cough", "seizure", "dizziness", "vomiting", "diarrhea", "nausea", "fatigue", "itching",

#             # Common Specific Diseases
#             "breast cancer", "prostate cancer", "lung cancer", "skin cancer", "colon cancer",
#             "brain tumor", "liver cancer", "cervical cancer", "bladder cancer", "thyroid cancer",

#             # Procedures / Interventions
#             "surgery", "chemotherapy", "radiation", "transplant", "dialysis", "intubation",
#             "stenting", "ventilation", "vaccination", "anesthesia", "rehabilitation", "prosthetics",
#             "orthotics",

#             # Lab Tests / Biomarkers
#             "blood test", "cbc", "glucose", "hemoglobin", "cholesterol", "biomarker", "urinalysis",
#             "pcr", "serology", "antibody", "antigen",

#             # Clinical Settings / Roles
#             "icu", "hospital", "emergency", "clinical notes", "nursing", "physician", "patient",
#             "medical record", "electronic health record", "ehr", "vitals",

#             # Age-based Terms
#             "pediatric", "neonatal", "infant", "child", "adolescent", "geriatrics", "elderly",

#             # Epidemiology / Public Health
#             "epidemiology", "prevalence", "incidence", "mortality", "public health", "health disparity",
#             "risk factor", "social determinant",

#             # Pharmacology / Medications
#             "drug", "medication", "pharmacology", "side effect", "adverse event", "dose", "tablet",
#             "vaccine", "clinical trial", "placebo"
#         ]


#         if not any(term in keyword for term in medical_terms):
#             return f"No medical datasets found for '{arg1}'. Please try another medical term."

#         # Query Hugging Face API
#         try:
#             response = requests.get(
#                 f"https://huggingface.co/api/datasets?search={keyword}&limit={limit}",
#                 timeout=10
#             )
#             response.raise_for_status()
#             datasets = response.json()
#         except Exception:
#             # Offline fallback
#             datasets = [{"id": f"example/{keyword}-dataset-{i+1}"} for i in range(limit)]

#         if not datasets:
#             return f"No datasets found for '{arg1}'."

#         # Format results neatly with numbered bullets
#         formatted = "\n".join(
#             [f"- Dataset {i+1}: {ds.get('id', 'Unknown')}" for i, ds in enumerate(datasets[:limit])]
#         )
#         return f"Medical datasets related to '{arg1}':\n{formatted}"

#     except Exception as e:
#         return f"Error searching medical datasets for '{arg1}': {str(e)}"

@tool
def my_custom_tool(arg1: str, arg2: int) -> str:
    """
    Search and retrieve publicly available medical datasets from Hugging Face based on any medical-related keyword.

    Only datasets actually returned by the Hugging Face search are listed. When the search fails or finds nothing, a message saying so is returned instead of dataset names.

    Args:
        arg1: A keyword related to medical data (e.g., 'cancer', 'diabetes', 'CT scan', 'radiology', 'dermoscopy').
        arg2: The maximum number of datasets to retrieve. Must be at least 1.

    Returns:
        A numbered list (at most arg2 entries) of dataset names matching the search query, or a message explaining why no datasets could be listed.
    """
    try:
        keyword = arg1.strip().lower()
        limit = int(arg2)

        # Terms that mark a keyword as medical; a keyword is accepted when it
        # contains at least one of them as a substring.
        medical_terms = [
            # Anatomy / body parts
            "skin", "brain", "lung", "chest", "abdomen", "spine", "bone", "heart", "liver", "kidney",
            "bladder", "stomach", "colon", "rectum", "esophagus", "pancreas", "breast", "ear", "eye",
            "retina", "tooth", "teeth", "tongue", "jaw", "neck", "wrist", "hand", "leg", "arm",
            "shoulder", "pelvis",
            # Diseases / conditions
            "cancer", "tumor", "stroke", "diabetes", "pneumonia", "covid", "asthma", "eczema",
            "melanoma", "hypertension", "alzheimer", "parkinson", "arthritis", "scoliosis",
            "epilepsy", "glaucoma", "ulcer", "hepatitis", "leukemia", "lymphoma", "tuberculosis",
            "anemia", "obesity", "depression", "anxiety", "bipolar", "autism", "adhd", "ptsd",
            "psychosis", "schizophrenia",
            # Imaging modalities
            "mri", "ct", "xray", "x-ray", "ultrasound", "pet", "fmri", "mammo", "angiography",
            "radiography", "echocardiogram", "spect", "dermoscopy", "colonoscopy", "endoscopy",
            "biopsy", "histopathology",
            # Medical specialties
            "radiology", "pathology", "oncology", "cardiology", "neurology", "dermatology",
            "dentistry", "ophthalmology", "urology", "orthopedics", "gastroenterology",
            "pulmonology", "nephrology", "psychiatry", "pediatrics", "geriatrics", "infectious disease",
            # Symptoms / signs
            "lesion", "infection", "fever", "pain", "inflammation", "rash", "headache", "swelling",
            "cough", "seizure", "dizziness", "vomiting", "diarrhea", "nausea", "fatigue", "itching",
            # Common specific diseases
            "breast cancer", "prostate cancer", "lung cancer", "skin cancer", "colon cancer",
            "brain tumor", "liver cancer", "cervical cancer", "bladder cancer", "thyroid cancer",
            # Procedures / interventions
            "surgery", "chemotherapy", "radiation", "transplant", "dialysis", "intubation",
            "stenting", "ventilation", "vaccination", "anesthesia", "rehabilitation", "prosthetics",
            "orthotics",
            # Lab tests / biomarkers
            "blood test", "cbc", "glucose", "hemoglobin", "cholesterol", "biomarker", "urinalysis",
            "pcr", "serology", "antibody", "antigen",
            # Clinical settings / roles
            "icu", "hospital", "emergency", "clinical notes", "nursing", "physician", "patient",
            "medical record", "electronic health record", "ehr", "vitals",
            # Age-based terms
            "pediatric", "neonatal", "infant", "child", "adolescent", "geriatrics", "elderly",
            # Epidemiology / public health
            "epidemiology", "prevalence", "incidence", "mortality", "public health", "health disparity",
            "risk factor", "social determinant",
            # Pharmacology / medications
            "drug", "medication", "pharmacology", "side effect", "adverse event", "dose", "tablet",
            "vaccine", "clinical trial", "placebo",
        ]

        # Reject keywords that do not look medical before touching the network.
        if not any(term in keyword for term in medical_terms):
            return f"No medical datasets found for '{arg1}'. Please try another medical term."

        # A non-positive limit cannot produce a meaningful "top N" list.
        if limit < 1:
            return f"Error searching medical datasets for '{arg1}': arg2 must be at least 1."

        # Query the Hugging Face API. `params=` lets requests URL-encode the
        # keyword, so spaces or characters such as '&' and '#' cannot corrupt
        # the query string.
        try:
            response = requests.get(
                "https://huggingface.co/api/datasets",
                params={"search": keyword, "limit": limit},
                timeout=10,
            )
            response.raise_for_status()
            datasets = response.json()
        except (requests.RequestException, ValueError) as e:
            # Report the failure instead of inventing dataset names: a made-up
            # id is indistinguishable from a real one to the reader.
            return f"Error searching medical datasets for '{arg1}': {e}"

        # Anything other than a non-empty list means there is nothing to show.
        if not isinstance(datasets, list) or not datasets:
            return f"No datasets found for '{arg1}'."

        shown = datasets[:limit]
        formatted = "\n".join(
            f"- Dataset {i}: {ds.get('id', 'Unknown')}" for i, ds in enumerate(shown, start=1)
        )

        # The heading reports how many datasets are really listed, which may be
        # fewer than the requested limit.
        return f"Top {len(shown)} Hugging Face datasets related to '{arg1}':\n{formatted}"

    except Exception as e:
        # Tool boundary: always hand the agent a string rather than raising.
        return f"Error searching medical datasets for '{arg1}': {str(e)}"



@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """
    Report the current local time for a given timezone.

    Args:
        timezone: A string naming a valid timezone (e.g., 'America/New_York').
    Returns:
        A sentence with the current local time in that timezone, or an error message if the time could not be determined.
    """
    try:
        # Resolve the zone and take a timezone-aware "now" in a single step.
        moment = datetime.datetime.now(pytz.timezone(timezone))
        stamp = moment.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as err:
        # Any failure (for example an unknown zone name) is reported as text.
        return f"Error fetching time for timezone '{timezone}': {err}"
    return f"The current local time in {timezone} is: {stamp}"


# Tool instance passed to the agent alongside the custom tools below.
final_answer = FinalAnswerTool()

# Model that drives the agent.
model = InferenceClientModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    temperature=0.5
)

# Decode the prompt file explicitly as UTF-8 so loading does not depend on the
# host's locale default encoding.
with open("prompts.yaml", "r", encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)


# Extra system-prompt instructions that steer medical queries to the dataset
# search tool.
SYSTEM_PROMPT_APPEND = """
Whenever the user query includes medical conditions (like cancer, tumor, radiology, MRI, CT, ultrasound, pathology, or skin),
you MUST call the `my_custom_tool` function to search Hugging Face datasets instead of writing an explanation.
Always call it with arguments (arg1=<the keyword>, arg2=5).
Do NOT attempt to scrape websites, import modules, or fetch random text.
If the query is not medical, behave normally.
"""

# Append the extra instructions before the agent is built, so the agent is
# constructed with the final system prompt rather than patched afterwards.
prompt_templates["system_prompt"] += "\n" + SYSTEM_PROMPT_APPEND


agent = CodeAgent(
    model=model,
    tools=[final_answer, get_current_time_in_timezone, my_custom_tool],
    max_steps=6,
    verbosity_level=1,
    planning_interval=None,
    name="MedDataSearchAgent",
    description="An intelligent agent that searches Hugging Face for medical datasets and returns structured results.",
    prompt_templates=prompt_templates
)


# Start the Gradio interface for the agent.
GradioUI(agent).launch()