Spaces:

Thiresh
/

DataAnalysisAgent

Running

File size: 25,746 Bytes

import os, io, re
import pandas as pd
import numpy as np
import streamlit as st
from openai import OpenAI
import matplotlib.pyplot as plt
from typing import List, Any, Optional

# === Configuration ===
# Endpoint and credential handed to the OpenAI client created further down.
API_BASE_URL = "https://integrate.api.nvidia.com/v1"
API_KEY = os.getenv("NVIDIA_API_KEY")  # None when the variable is not set

# Plot defaults: figure size quoted in the plot prompt, DPI applied via rcParams.
DEFAULT_FIGSIZE, DEFAULT_DPI = (6, 4), 100

# Number of characters of a stringified result kept when building the reasoning prompt.
MAX_RESULT_DISPLAY_LENGTH = 300

class ModelConfig:
    """Settings bundle for one selectable LLM.

    Every constructor argument is stored unchanged on the instance under the
    upper-case form of its name (``model_name`` -> ``MODEL_NAME`` and so on).
    The temperature / max-token pairs are grouped by the agent that reads
    them; ``reasoning_false`` and ``reasoning_true`` are the strings sent as
    the system message to switch the model's detailed thinking off or on.
    """

    def __init__(
        self,
        model_name: str,
        model_url: str,
        model_print_name: str,
        # QueryUnderstandingTool
        query_understanding_temperature: float = 0.1,
        query_understanding_max_tokens: int = 5,
        # CodeGenerationAgent
        code_generation_temperature: float = 0.2,
        code_generation_max_tokens: int = 1024,
        # ReasoningAgent
        reasoning_temperature: float = 0.2,
        reasoning_max_tokens: int = 1024,
        # DataInsightAgent
        insights_temperature: float = 0.2,
        insights_max_tokens: int = 512,
        # System-prompt switches for the model's reasoning mode
        reasoning_false: str = "detailed thinking off",
        reasoning_true: str = "detailed thinking on",
    ):
        # Model identity (API name, info URL, label shown in the UI).
        self.MODEL_NAME, self.MODEL_URL, self.MODEL_PRINT_NAME = (
            model_name,
            model_url,
            model_print_name,
        )

        # Per-agent sampling parameters, each stored as (temperature, max_tokens).
        self.QUERY_UNDERSTANDING_TEMPERATURE, self.QUERY_UNDERSTANDING_MAX_TOKENS = (
            query_understanding_temperature,
            query_understanding_max_tokens,
        )
        self.CODE_GENERATION_TEMPERATURE, self.CODE_GENERATION_MAX_TOKENS = (
            code_generation_temperature,
            code_generation_max_tokens,
        )
        self.REASONING_TEMPERATURE, self.REASONING_MAX_TOKENS = (
            reasoning_temperature,
            reasoning_max_tokens,
        )
        self.INSIGHTS_TEMPERATURE, self.INSIGHTS_MAX_TOKENS = (
            insights_temperature,
            insights_max_tokens,
        )

        # Reasoning on/off system prompts.
        self.REASONING_FALSE, self.REASONING_TRUE = reasoning_false, reasoning_true

# Predefined model configurations.
# Each model is built on its own first and then registered under the key used
# by the sidebar selectbox and by the DEFAULT_MODEL environment variable.
_NEMOTRON_ULTRA_253B = ModelConfig(
    model_name="nvidia/llama-3.1-nemotron-ultra-253b-v1",
    model_url="https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1",
    model_print_name="NVIDIA Llama 3.1 Nemotron Ultra 253B v1",
    # QueryUnderstandingTool
    query_understanding_temperature=0.1,
    query_understanding_max_tokens=5,
    # CodeGenerationAgent
    code_generation_temperature=0.2,
    code_generation_max_tokens=1024,
    # ReasoningAgent
    reasoning_temperature=0.6,
    reasoning_max_tokens=1024,
    # DataInsightAgent
    insights_temperature=0.2,
    insights_max_tokens=512,
    # Reasoning mode is toggled with a plain-text system prompt.
    reasoning_false="detailed thinking off",
    reasoning_true="detailed thinking on",
)

_NEMOTRON_SUPER_49B = ModelConfig(
    model_name="nvidia/llama-3.3-nemotron-super-49b-v1.5",
    model_url="https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5",
    model_print_name="NVIDIA Llama 3.3 Nemotron Super 49B v1.5",
    # QueryUnderstandingTool
    query_understanding_temperature=0.1,
    query_understanding_max_tokens=5,
    # CodeGenerationAgent
    code_generation_temperature=0.0,
    code_generation_max_tokens=1024,
    # ReasoningAgent
    reasoning_temperature=0.6,
    reasoning_max_tokens=2048,
    # DataInsightAgent
    insights_temperature=0.2,
    insights_max_tokens=512,
    # Reasoning is switched off with "/no_think"; the "on" prompt is empty.
    reasoning_false="/no_think",
    reasoning_true="",
)

MODEL_CONFIGS = {
    "llama-3-1-nemotron-ultra-v1": _NEMOTRON_ULTRA_253B,
    "llama-3-3-nemotron-super-v1-5": _NEMOTRON_SUPER_49B,
}

# Default configuration (can be changed via environment variable or UI)
_FALLBACK_MODEL = "llama-3-1-nemotron-ultra-v1"
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", _FALLBACK_MODEL)
if DEFAULT_MODEL not in MODEL_CONFIGS:
    # DEFAULT_MODEL is used elsewhere as a direct key into MODEL_CONFIGS and as
    # the initial selectbox value, so an unknown name from the environment must
    # be replaced here rather than only tolerated when computing Config.
    DEFAULT_MODEL = _FALLBACK_MODEL
Config = MODEL_CONFIGS[DEFAULT_MODEL]

# Initialize OpenAI client with configuration.
# API_KEY is read from NVIDIA_API_KEY and is None when that variable is unset.
client = OpenAI(
    base_url=API_BASE_URL,
    api_key=API_KEY,
)

def get_current_config() -> "ModelConfig":
    """Return the ModelConfig for the model currently selected in the session.

    The key is taken from ``st.session_state.current_model`` when present,
    otherwise from ``DEFAULT_MODEL``. If that key is not registered in
    ``MODEL_CONFIGS`` (for example a mistyped ``DEFAULT_MODEL`` environment
    variable), the module-level ``Config`` fallback is returned instead of
    raising ``KeyError``.
    """
    model_key = st.session_state.get("current_model", DEFAULT_MODEL)
    return MODEL_CONFIGS.get(model_key, Config)

# ------------------  QueryUnderstandingTool ---------------------------
def QueryUnderstandingTool(query: str) -> bool:
    """Ask the LLM whether ``query`` requests a visualisation.

    The query is embedded in a classification prompt and sent with the
    "reasoning off" system message of the current model configuration.

    Args:
        query: The user's natural-language request.

    Returns:
        True when the model's answer starts with "true" (case-insensitive,
        ignoring surrounding whitespace, quotes and punctuation); False for
        anything else, including an empty or missing completion.
    """
    current_config = get_current_config()

    # Prepend the instruction to the query
    full_prompt = f"""You are a query classifier. Your task is to determine if a user query is requesting a data visualization.

IMPORTANT: Respond with ONLY 'true' or 'false' (lowercase, no quotes, no punctuation).

Classify as 'true' ONLY if the query explicitly asks for:
- A plot, chart, graph, visualization, or figure
- To "show" or "display" data visually
- To "create" or "generate" a visual representation
- Words like: plot, chart, graph, visualize, show, display, create, generate, draw

Classify as 'false' for:
- Data analysis without visualization requests
- Statistical calculations, aggregations, filtering, sorting
- Questions about data content, counts, summaries
- Requests for tables, dataframes, or text results

User query: {query}"""

    messages = [
        {"role": "system", "content": current_config.REASONING_FALSE},
        {"role": "user", "content": full_prompt}
    ]

    response = client.chat.completions.create(
        model=current_config.MODEL_NAME,
        messages=messages,
        temperature=current_config.QUERY_UNDERSTANDING_TEMPERATURE,
        max_tokens=current_config.QUERY_UNDERSTANDING_MAX_TOKENS  # We only need a short response
    )

    # The completion text may be None; treat that as "no plot requested"
    # instead of failing on None.strip().
    raw_answer = response.choices[0].message.content or ""

    # Models do not always obey "no quotes, no punctuation", so peel those off
    # before comparing.
    verdict = raw_answer.strip().strip("'\"`.,!").lower()
    return verdict.startswith("true")

# === CodeGeneration TOOLS ============================================


# ------------------  CodeWritingTool ---------------------------------
def CodeWritingTool(cols: List[str], query: str) -> str:
    """Build the LLM prompt asking for pandas-only code (no plotting) that answers ``query``.

    ``cols`` are the DataFrame column names; they are listed comma-separated
    in the prompt so the model knows which columns it may use.
    """
    column_listing = ', '.join(cols)

    prompt_text = f"""

    Given DataFrame `df` with columns: 

    {column_listing}

    Write Python code (pandas **only**, no plotting) to answer: 
    "{query}"

    Rules
    -----
    1. Use pandas operations on `df` only.
    2. Rely only on the columns in the DataFrame.
    3. Assign the final result to `result`.
    4. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
    5. Do not include any explanations, comments, or prose outside the code block.
    6. Use **df** as the sole data source. **Do not** read files, fetch data, or use Streamlit.
    7. Do **not** import any libraries (pandas is already imported as pd).
    8. Handle missing values (`dropna`) before aggregations.

    Example
    -----
    ```python
    result = df.groupby("some_column")["a_numeric_col"].mean().sort_values(ascending=False)
    ```

    """
    return prompt_text


# ------------------  PlotCodeGeneratorTool ---------------------------
def PlotCodeGeneratorTool(cols: List[str], query: str) -> str:
    """Build the LLM prompt asking for pandas + matplotlib code that plots an answer to ``query``.

    ``cols`` are the DataFrame column names, listed comma-separated in the
    prompt. The requested figure size comes from ``DEFAULT_FIGSIZE``.
    """
    column_listing = ', '.join(cols)

    prompt_text = f"""

    Given DataFrame `df` with columns:

    {column_listing}

    Write Python code using pandas **and matplotlib** (as plt) to answer:
    "{query}"

    Rules
    -----
    1. Use pandas for data manipulation and matplotlib.pyplot (as plt) for plotting.
    2. Rely only on the columns in the DataFrame.
    3. Assign the final result (DataFrame, Series, scalar *or* matplotlib Figure) to a variable named `result`.
    4. Create only ONE relevant plot. Set `figsize={DEFAULT_FIGSIZE}`, add title/labels.
    5. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
    6. Do not include any explanations, comments, or prose outside the code block.
    7. Handle missing values (`dropna`) before plotting/aggregations.

    """
    return prompt_text
  

# === CodeGenerationAgent ==============================================

def CodeGenerationAgent(query: str, df: pd.DataFrame, chat_context: Optional[str] = None):
    """Get LLM-written code for ``query``, choosing between the plot and non-plot prompt.

    Args:
        query: The user's latest request.
        df: The DataFrame the code will run against; only its column labels
            are sent to the model.
        chat_context: Optional text of earlier user turns, inserted into the
            prompt so ambiguous references can be resolved.

    Returns:
        A 3-tuple ``(code, should_plot, "")``: the extracted code (empty
        string when the reply contains no usable code block), whether the
        query was classified as a plot request, and an empty placeholder
        string.
    """
    should_plot = QueryUnderstandingTool(query)

    # Column labels are not guaranteed to be strings (e.g. integer labels);
    # the prompt builders join them with ', ', which requires str items.
    column_names = [str(col) for col in df.columns]
    prompt = PlotCodeGeneratorTool(column_names, query) if should_plot else CodeWritingTool(column_names, query)

    # Prepend the instruction to the query
    context_section = f"\nConversation context (recent user turns):\n{chat_context}\n" if chat_context else ""

    full_prompt = f"""You are a senior Python data analyst who writes clean, efficient code. 
    Solve the given problem with optimal pandas operations. Be concise and focused. 
    Your response must contain ONLY a properly-closed ```python code block with no explanations before or after (starts with ```python and ends with ```). 
    Ensure your solution is correct, handles edge cases, and follows best practices for data analysis. 
    If the latest user request references prior results ambiguously (e.g., "it", "that", "same groups"), infer intent from the conversation context and choose the most reasonable interpretation. {context_section}{prompt}"""

    current_config = get_current_config()

    messages = [
        {"role": "system", "content": current_config.REASONING_FALSE},
        {"role": "user", "content": full_prompt}
    ]

    response = client.chat.completions.create(
        model=current_config.MODEL_NAME,
        messages=messages,
        temperature=current_config.CODE_GENERATION_TEMPERATURE,
        max_tokens=current_config.CODE_GENERATION_MAX_TOKENS
    )

    # The completion text may be None; normalise to "" so code extraction
    # returns "no code" instead of raising on None.
    full_response = response.choices[0].message.content or ""

    code = extract_first_code_block(full_response)
    return code, should_plot, ""

# === ExecutionAgent ====================================================

def ExecutionAgent(code: str, df: pd.DataFrame, should_plot: bool):
    """Run generated code against ``df`` and return what it bound to ``result``.

    Args:
        code: Python source to execute.
        df: DataFrame exposed to the code as ``df``.
        should_plot: When True, matplotlib (``plt``) and ``io`` are also
            exposed and the default figure DPI is set.

    Returns:
        The value of ``result`` after execution (which may be ``None`` if the
        code assigned ``None``); the string
        ``"No result was assigned to 'result' variable"`` when the code never
        bound ``result``; or ``"Error executing code: <message>"`` when the
        code raised.
    """
    # Names the generated code may rely on.
    env = {"pd": pd, "df": df}

    if should_plot:
        plt.rcParams["figure.dpi"] = DEFAULT_DPI  # Set default DPI for all figures
        env["plt"] = plt
        env["io"] = io

    try:
        # SECURITY: this executes model-written code with the full privileges
        # of the app process; exec() is not a sandbox.
        #
        # A single dict is passed on purpose. With separate globals and locals
        # the code runs as if inside a class body, so functions, lambdas and
        # (before Python 3.12) comprehensions defined by the code cannot see
        # `df`, `pd` or any variable the code itself assigned.
        exec(code, env)
    except Exception as exc:
        return f"Error executing code: {exc}"

    if "result" not in env:
        return "No result was assigned to 'result' variable"
    return env["result"]

# === ReasoningCurator TOOL =========================================
def _plot_title(plot_obj) -> str:
    """Return a title for a matplotlib Figure or Axes, or "" when none is set."""
    if isinstance(plot_obj, plt.Axes):
        return plot_obj.get_title()

    # Figure: prefer the figure-level suptitle when one was set.
    suptitle = getattr(plot_obj, "_suptitle", None)
    if suptitle is not None and suptitle.get_text():
        return suptitle.get_text()

    # Otherwise use the first non-empty axes title. plt.title() and
    # ax.set_title() set an axes title, not a suptitle, so this is where the
    # title usually is.
    for ax in plot_obj.axes:
        if ax.get_title():
            return ax.get_title()
    return ""


def ReasoningCurator(query: str, result: Any) -> str:
    """Build the LLM prompt that asks for an explanation of ``result``.

    Args:
        query: The user's original request, quoted in the prompt.
        result: The execution result: a matplotlib Figure/Axes, an
            "Error executing code..." string, or any other value.

    Returns:
        The prompt text. Plots are described by their title (or "Chart" when
        no title is set); error strings are passed through in full; other
        values are stringified and cut to ``MAX_RESULT_DISPLAY_LENGTH``
        characters.
    """
    is_error = isinstance(result, str) and result.startswith("Error executing code")
    is_plot = isinstance(result, (plt.Figure, plt.Axes))

    if is_error:
        desc = result
    elif is_plot:
        desc = f"[Plot Object: {_plot_title(result) or 'Chart'}]"
    else:
        desc = str(result)[:MAX_RESULT_DISPLAY_LENGTH]

    if is_plot:
        prompt = f'''
        The user asked: "{query}".
        Below is a description of the plot result:
        {desc}
        Explain in 2–3 concise sentences what the chart shows (no code talk).'''
    else:
        prompt = f'''
        The user asked: "{query}".
        The result value is: {desc}
        Explain in 2–3 concise sentences what this tells about the data (no mention of charts).'''
    return prompt

# === ReasoningAgent (streaming) =========================================
# Matches one <think> block; an unclosed block runs to the end of the text so
# that a reply truncated mid-thought is still recognised as thinking.
_THINK_BLOCK_RE = re.compile(r"<think>(.*?)(?:</think>|\Z)", re.DOTALL)


def _split_think_blocks(text: str):
    """Split raw model output into (thinking, answer) around <think> blocks."""
    thinking = "".join(_THINK_BLOCK_RE.findall(text))
    answer = _THINK_BLOCK_RE.sub("", text).strip()
    return thinking, answer


def ReasoningAgent(query: str, result: Any):
    """Stream the LLM's explanation of ``result`` and separate thinking from answer.

    While the response streams, the text inside ``<think>...</think>`` is
    shown live in a Streamlit placeholder.

    Args:
        query: The user's original request.
        result: The execution result to explain (plot object or value).

    Returns:
        A tuple ``(thinking_content, cleaned)``: the concatenated text of all
        ``<think>`` blocks, and the remaining response text with those blocks
        removed and surrounding whitespace stripped.
    """
    import html  # local import: used only to escape the live thinking preview

    current_config = get_current_config()
    prompt = ReasoningCurator(query, result)

    # Streaming LLM call
    response = client.chat.completions.create(
        model=current_config.MODEL_NAME,
        messages=[
            {"role": "system", "content": current_config.REASONING_TRUE},
            {"role": "user", "content": "You are an insightful data analyst. " + prompt}
        ],
        temperature=current_config.REASONING_TEMPERATURE,
        max_tokens=current_config.REASONING_MAX_TOKENS,
        stream=True
    )

    thinking_placeholder = st.empty()
    full_response = ""
    shown_thinking = ""

    for chunk in response:
        # Defensive: do not assume every streamed chunk carries a choice.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if not token:
            continue
        full_response += token

        # Re-derive the thinking text from everything received so far rather
        # than tracking per-token state: this stays correct when a tag is
        # split across chunks or shares a chunk with other text.
        thinking_so_far, _ = _split_think_blocks(full_response)
        if thinking_so_far != shown_thinking:
            shown_thinking = thinking_so_far
            # Escape model text before embedding it in raw HTML.
            thinking_placeholder.markdown(
                f'<details class="thinking" open><summary>🤔 Model Thinking</summary><pre>{html.escape(shown_thinking, quote=False)}</pre></details>',
                unsafe_allow_html=True
            )

    # After streaming, extract final reasoning (outside <think>...</think>)
    thinking_content, cleaned = _split_think_blocks(full_response)
    return thinking_content, cleaned

# === DataFrameSummary TOOL (pandas only) =========================================
def DataFrameSummaryTool(df: pd.DataFrame) -> str:
    """Build the LLM prompt that summarises ``df`` and asks for a description and questions.

    The prompt lists the row and column counts, the column labels, the
    per-column dtypes and the per-column count of missing values.

    Args:
        df: The dataset to describe.

    Returns:
        The prompt text.
    """
    # Labels are converted with str() because str.join rejects non-string
    # items and column labels may be integers or other hashables.
    column_names = ', '.join(str(col) for col in df.columns)

    prompt = f"""
        Given a dataset with {len(df)} rows and {len(df.columns)} columns:
        Columns: {column_names}
        Data types: {df.dtypes.to_dict()}
        Missing values: {df.isnull().sum().to_dict()}

        Provide:
        1. A brief description of what this dataset contains
        2. 3-4 possible data analysis questions that could be explored
        Keep it concise and focused."""
    return prompt

# === DataInsightAgent (upload-time only) ===============================

def DataInsightAgent(df: pd.DataFrame) -> str:
    """Ask the LLM for a brief dataset description and suggested questions.

    Args:
        df: The uploaded dataset; only the summary built by
            ``DataFrameSummaryTool`` is sent to the model.

    Returns:
        The model's reply text, or ``""`` when the reply carries no text.

    Raises:
        RuntimeError: If the request or reading the reply fails. The original
            exception is chained as ``__cause__``.
    """
    current_config = get_current_config()
    prompt = DataFrameSummaryTool(df)
    try:
        response = client.chat.completions.create(
            model=current_config.MODEL_NAME,
            messages=[
                {"role": "system", "content": current_config.REASONING_FALSE},
                {"role": "user", "content": "You are a data analyst providing brief, focused insights. " + prompt}
            ],
            temperature=current_config.INSIGHTS_TEMPERATURE,
            max_tokens=current_config.INSIGHTS_MAX_TOKENS
        )
        # The completion text may be None; honour the declared str return type.
        return response.choices[0].message.content or ""
    except Exception as exc:
        # Callers in this file catch Exception, which RuntimeError satisfies.
        raise RuntimeError(f"Error generating dataset insights: {exc}") from exc

# === Helpers ===========================================================

def extract_first_code_block(text: str) -> str:
    """Extract the first Python code block from a markdown-formatted string.

    A fence tagged ``python`` (any letter case) is preferred. When no such
    fence exists, the first fence in the text is accepted if it carries no
    language tag or the tag ``py``.

    Args:
        text: Markdown text, typically an LLM reply. ``None`` and ``""`` are
            accepted.

    Returns:
        The code between the opening and closing fence with surrounding
        whitespace stripped, or ``""`` when there is no such block or the
        block is never closed.
    """
    if not text:
        return ""

    fence = "```"
    tagged = re.search(r"```python", text, flags=re.IGNORECASE)
    if tagged:
        start = tagged.end()
    else:
        # Fallback: the first fence in the text is necessarily an opening one.
        opening = text.find(fence)
        if opening == -1:
            return ""
        line_end = text.find("\n", opening)
        if line_end == -1:
            return ""
        # Reject fences explicitly tagged with some other language.
        info = text[opening + len(fence):line_end].strip().lower()
        if info not in ("", "py"):
            return ""
        start = line_end + 1

    end = text.find(fence, start)
    if end == -1:
        return ""
    return text[start:end].strip()

# === Main Streamlit App ===============================================

def main():
    """Render the Streamlit app: sidebar (model, CSV upload, insights) and chat panel.

    Session-state keys used: ``plots`` (list of figures), ``current_model``
    (key into ``MODEL_CONFIGS``), ``df`` / ``current_file`` (the loaded CSV
    and its name), ``insights`` (LLM dataset summary) and ``messages`` (chat
    history dicts with ``role``, ``content`` and optional ``plot_index``).

    For each chat input, code is generated, executed and explained, and the
    explanation plus a collapsible view of the code is appended to the chat.
    """
    import html  # local import: escapes text that is embedded in raw HTML below

    st.set_page_config(layout="wide")
    if "plots" not in st.session_state:
        st.session_state.plots = []
    # Initialise the selected model. An unknown key is also replaced here,
    # because the selectbox below calls available_models.index() on it, which
    # raises ValueError for a name that is not registered.
    if st.session_state.get("current_model") not in MODEL_CONFIGS:
        st.session_state.current_model = (
            DEFAULT_MODEL if DEFAULT_MODEL in MODEL_CONFIGS else next(iter(MODEL_CONFIGS))
        )

    # Page logo at top right corner, large and clickable
    st.markdown(
        """
        <div style='position: absolute; top: 20px; right: 30px; z-index: 999;'>
            <a href='https://www.linkedin.com/in/thiresh-sidda/' target='_blank'>
                <img src='https://ih1.redbubble.net/image.1849728168.3104/raf,360x360,075,t,fafafa:ca443f4786.jpg' alt='Logo' style='height:120px; border-radius:20px; box-shadow:0 2px 12px rgba(0,0,0,0.15);'>
            </a>
        </div>
        """,
        unsafe_allow_html=True
    )
    # Main title centered with large font and GIF
    st.markdown(
        """
        <div style='display: flex; align-items: center; justify-content: center; margin-bottom: 30px;'>
            <span style='color:#1976D2; font-weight:bold; font-size:3.5em; margin-right:30px;'>Data Analysis Agent</span>
            <img src='https://cdn.dribbble.com/userupload/23161671/file/original-4c7894556285d8f223ab21fd10554fe4.gif' alt='GIF' style='height:120px;'>
        </div>
        """,
        unsafe_allow_html=True
    )

    medium_blue = "#1976D2"  # Medium blue color

    # Move left panel to sidebar
    with st.sidebar:
        st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.5em;'>Insights Generator</span>", unsafe_allow_html=True)
        available_models = list(MODEL_CONFIGS.keys())
        model_display_names = {key: MODEL_CONFIGS[key].MODEL_PRINT_NAME for key in available_models}
        selected_model = st.selectbox(
            "Select Model",
            options=available_models,
            format_func=lambda x: model_display_names[x],
            index=available_models.index(st.session_state.current_model)
        )
        file = st.file_uploader("Choose CSV", type=["csv"], key="csv_uploader")
        # Update configuration if model changed: the chat and plots are
        # cleared and, when a dataset is loaded, its insights are regenerated.
        if selected_model != st.session_state.current_model:
            st.session_state.current_model = selected_model
            new_config = MODEL_CONFIGS[selected_model]
            if "messages" in st.session_state:
                st.session_state.messages = []
            if "plots" in st.session_state:
                st.session_state.plots = []
            if "df" in st.session_state and file is not None:
                with st.spinner("Generating dataset insights with new model …"):
                    try:
                        st.session_state.insights = DataInsightAgent(st.session_state.df)
                        st.success(f"Insights updated with {new_config.MODEL_PRINT_NAME}")
                    except Exception as e:
                        st.error(f"Error updating insights: {str(e)}")
                        if "insights" in st.session_state:
                            del st.session_state.insights
                st.rerun()
        # The uploader was cleared: forget the dataset and its insights.
        if not file and "df" in st.session_state and "current_file" in st.session_state:
            del st.session_state.df
            del st.session_state.current_file
            if "insights" in st.session_state:
                del st.session_state.insights
            st.rerun()
        if file:
            # A file counts as new when no dataset is loaded or its name differs.
            if ("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
                st.session_state.df = pd.read_csv(file)
                st.session_state.current_file = file.name
                st.session_state.messages = []
                # The chat is reset, so nothing references the stored figures
                # any more; release them together with the messages.
                st.session_state.plots = []
                with st.spinner("Generating dataset insights …"):
                    try:
                        st.session_state.insights = DataInsightAgent(st.session_state.df)
                    except Exception as e:
                        st.error(f"Error generating insights: {str(e)}")
            elif "insights" not in st.session_state:
                with st.spinner("Generating dataset insights …"):
                    try:
                        st.session_state.insights = DataInsightAgent(st.session_state.df)
                    except Exception as e:
                        st.error(f"Error generating insights: {str(e)}")
        if "df" in st.session_state:
            st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.2em;'>Your Dataset Insights</span>", unsafe_allow_html=True)
            if "insights" in st.session_state and st.session_state.insights:
                st.dataframe(st.session_state.df.head())
                st.markdown(f"<span style='color:{medium_blue};'>{st.session_state.insights}</span>", unsafe_allow_html=True)
            else:
                st.warning("No insights available.")
        else:
            st.info("Upload a CSV to begin chatting with your data.")

    with st.container():
        st.markdown(
            f"""
            <div style='display: flex; align-items: center; justify-content: flex-start; margin-bottom: 10px;'>
                <span style='color:{medium_blue}; font-weight:bold; font-size:2em; margin-right:20px;'>Chat with your data</span>
                <img src='https://i.pinimg.com/originals/5f/d5/58/5fd558f8b7a4f9e2138709cbe63c7052.gif' alt='Chat GIF' style='height:48px;'>
            </div>
            """,
            unsafe_allow_html=True
        )
        if "df" in st.session_state:
            current_config_right = get_current_config()
            st.markdown(f"*<span style='color: grey; font-style: italic;'>Using {current_config_right.MODEL_PRINT_NAME}</span>*", unsafe_allow_html=True)
        if "messages" not in st.session_state:
            st.session_state.messages = []

        clear_col1, clear_col2 = st.columns([9,1])
        with clear_col2:
            if st.button("Clear chat"):
                st.session_state.messages = []
                st.session_state.plots = []
                st.rerun()

        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                # User text is free-form input and is escaped before being
                # placed in raw HTML; assistant content is HTML assembled
                # below and is rendered as-is.
                if msg["role"] == "user":
                    body = html.escape(msg["content"], quote=False)
                else:
                    body = msg["content"]
                st.markdown(f"<span style='color:{medium_blue}; font-size:1.1em;'>{body}</span>", unsafe_allow_html=True)
                if msg.get("plot_index") is not None:
                    idx = msg["plot_index"]
                    if 0 <= idx < len(st.session_state.plots):
                        st.pyplot(st.session_state.plots[idx], use_container_width=False)

        if "df" in st.session_state:
            if user_q := st.chat_input("Ask about your data…"):
                st.session_state.messages.append({"role": "user", "content": user_q})
                with st.spinner("Working …"):
                    # Up to the two user turns before this one are passed as context.
                    recent_user_turns = [m["content"] for m in st.session_state.messages if m["role"] == "user"][-3:]
                    context_text = "\n".join(recent_user_turns[:-1]) if len(recent_user_turns) > 1 else None
                    code, should_plot_flag, _ = CodeGenerationAgent(user_q, st.session_state.df, context_text)
                    result_obj = ExecutionAgent(code, st.session_state.df, should_plot_flag)
                    raw_thinking, reasoning_txt = ReasoningAgent(user_q, result_obj)
                    reasoning_txt = reasoning_txt.replace("`", "")

                # Figures are kept in session state; the message stores only an index.
                plot_idx = None
                if isinstance(result_obj, (plt.Figure, plt.Axes)):
                    fig = result_obj.figure if isinstance(result_obj, plt.Axes) else result_obj
                    st.session_state.plots.append(fig)
                    plot_idx = len(st.session_state.plots) - 1

                # Model thinking and generated code go inside <pre> blocks, so
                # characters such as "<" and "&" must be escaped to render
                # literally instead of being parsed as markup.
                thinking_html = ""
                if raw_thinking:
                    thinking_html = (
                        '<details class="thinking">'
                        '<summary>🧠 Reasoning</summary>'
                        f'<pre>{html.escape(raw_thinking, quote=False)}</pre>'
                        '</details>'
                    )

                explanation_html = reasoning_txt

                code_html = (
                    '<details class="code">'
                    '<summary>View code</summary>'
                    '<pre><code class="language-python">'
                    f'{html.escape(code, quote=False)}'
                    '</code></pre>'
                    '</details>'
                )
                assistant_msg = f"{thinking_html}{explanation_html}\n\n{code_html}"

                st.session_state.messages.append({
                    "role": "assistant",
                    "content": assistant_msg,
                    "plot_index": plot_idx
                })
                st.rerun()

# Script entry point: build the UI by calling main() when run as the main module.
if __name__ == "__main__":
    main()