Spaces:

Thiresh
/

DataAnalysisAgent

Sleeping

App Files Files Community

DataAnalysisAgent / data_analysis_agent.py

Thiresh

Update data_analysis_agent.py

f64df95 verified about 2 months ago

raw

history blame

25.7 kB

	import os, io, re
	import pandas as pd
	import numpy as np
	import streamlit as st
	from openai import OpenAI
	import matplotlib.pyplot as plt
	from typing import List, Any, Optional

	# === Configuration ===
	# Global configuration
	# Endpoint and credentials handed to the OpenAI-compatible client below.
	API_BASE_URL = "https://integrate.api.nvidia.com/v1"
	API_KEY = os.getenv("NVIDIA_API_KEY")  # None when the variable is unset

	# Plot defaults: the figure size is quoted in the plotting prompt and the
	# DPI is written to matplotlib's rcParams before plot code is executed.
	DEFAULT_FIGSIZE, DEFAULT_DPI = (6, 4), 100

	# Maximum number of characters of a stringified result passed to the LLM.
	MAX_RESULT_DISPLAY_LENGTH = 300

	class ModelConfig:
	    """Per-model settings shared by the agents in this module.

	    Every constructor argument is stored on the instance under an upper-case
	    attribute (``model_name`` -> ``MODEL_NAME`` and so on). A temperature and
	    a token limit are kept separately for each LLM call site, and
	    ``reasoning_false`` / ``reasoning_true`` hold the strings sent as the
	    system message to turn the model's detailed thinking off or on.
	    """

	    def __init__(
	        self,
	        model_name: str,
	        model_url: str,
	        model_print_name: str,
	        # QueryUnderstandingTool
	        query_understanding_temperature: float = 0.1,
	        query_understanding_max_tokens: int = 5,
	        # CodeGenerationAgent
	        code_generation_temperature: float = 0.2,
	        code_generation_max_tokens: int = 1024,
	        # ReasoningAgent
	        reasoning_temperature: float = 0.2,
	        reasoning_max_tokens: int = 1024,
	        # DataInsightAgent
	        insights_temperature: float = 0.2,
	        insights_max_tokens: int = 512,
	        reasoning_false: str = "detailed thinking off",
	        reasoning_true: str = "detailed thinking on",
	    ):
	        # Identity of the model: API name, info URL and UI label.
	        self.MODEL_NAME, self.MODEL_URL = model_name, model_url
	        self.MODEL_PRINT_NAME = model_print_name

	        # One (temperature, max_tokens) pair per LLM call site.
	        self.QUERY_UNDERSTANDING_TEMPERATURE, self.QUERY_UNDERSTANDING_MAX_TOKENS = (
	            query_understanding_temperature,
	            query_understanding_max_tokens,
	        )
	        self.CODE_GENERATION_TEMPERATURE, self.CODE_GENERATION_MAX_TOKENS = (
	            code_generation_temperature,
	            code_generation_max_tokens,
	        )
	        self.REASONING_TEMPERATURE, self.REASONING_MAX_TOKENS = (
	            reasoning_temperature,
	            reasoning_max_tokens,
	        )
	        self.INSIGHTS_TEMPERATURE, self.INSIGHTS_MAX_TOKENS = (
	            insights_temperature,
	            insights_max_tokens,
	        )

	        # System-message switches for the model's thinking mode.
	        self.REASONING_FALSE, self.REASONING_TRUE = reasoning_false, reasoning_true

	# Predefined model configurations
	# Registry of selectable models, keyed by the identifier used for the
	# DEFAULT_MODEL setting and for the model picker in the UI.
	MODEL_CONFIGS = {
	    "llama-3-1-nemotron-ultra-v1": ModelConfig(
	        "nvidia/llama-3.1-nemotron-ultra-253b-v1",
	        "https://build.nvidia.com/nvidia/llama-3_1-nemotron-ultra-253b-v1",
	        "NVIDIA Llama 3.1 Nemotron Ultra 253B v1",
	        query_understanding_temperature=0.1, query_understanding_max_tokens=5,
	        code_generation_temperature=0.2, code_generation_max_tokens=1024,
	        reasoning_temperature=0.6, reasoning_max_tokens=1024,
	        insights_temperature=0.2, insights_max_tokens=512,
	        # This model toggles thinking through a system-message phrase.
	        reasoning_false="detailed thinking off",
	        reasoning_true="detailed thinking on",
	    ),
	    "llama-3-3-nemotron-super-v1-5": ModelConfig(
	        "nvidia/llama-3.3-nemotron-super-49b-v1.5",
	        "https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5",
	        "NVIDIA Llama 3.3 Nemotron Super 49B v1.5",
	        query_understanding_temperature=0.1, query_understanding_max_tokens=5,
	        code_generation_temperature=0.0, code_generation_max_tokens=1024,
	        reasoning_temperature=0.6, reasoning_max_tokens=2048,
	        insights_temperature=0.2, insights_max_tokens=512,
	        # Thinking is disabled with "/no_think"; an empty system message
	        # is sent when thinking is wanted.
	        reasoning_false="/no_think",
	        reasoning_true="",
	    ),
	}

	# Default configuration (can be changed via environment variable or UI)
	# Key used when DEFAULT_MODEL is unset or names a model that is not
	# registered in MODEL_CONFIGS.
	_FALLBACK_MODEL = "llama-3-1-nemotron-ultra-v1"

	# DEFAULT_MODEL is normalised to a valid MODEL_CONFIGS key. Other code in this
	# file indexes MODEL_CONFIGS with it and looks it up in the list of available
	# models, both of which would raise on an unknown value.
	DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", _FALLBACK_MODEL)
	if DEFAULT_MODEL not in MODEL_CONFIGS:
	    DEFAULT_MODEL = _FALLBACK_MODEL
	Config = MODEL_CONFIGS[DEFAULT_MODEL]

	# OpenAI-compatible client pointed at the configured API endpoint.
	client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

	def get_current_config():
	    """Return the ``ModelConfig`` for the model selected in the session.

	    The key is read from ``st.session_state.current_model`` and defaults to
	    ``DEFAULT_MODEL`` when no model has been selected yet. If the key is not
	    present in ``MODEL_CONFIGS`` (for example a mistyped ``DEFAULT_MODEL``
	    environment value), the module-level default ``Config`` is returned
	    instead of raising ``KeyError``.
	    """
	    selected_key = st.session_state.get("current_model", DEFAULT_MODEL)
	    return MODEL_CONFIGS.get(selected_key, Config)

	# ------------------ QueryUnderstandingTool ---------------------------
	def QueryUnderstandingTool(query: str) -> bool:
	    """Ask the LLM whether ``query`` requests a visualisation.

	    The query is wrapped in a classification prompt and sent with the
	    current model's query-understanding temperature and token limit, with
	    thinking switched off through the system message.

	    Args:
	        query: The user's natural-language question.

	    Returns:
	        True when the model's answer is ``true`` (ignoring case, surrounding
	        whitespace, quotes and trailing punctuation); False for any other
	        answer, including an empty or missing one.
	    """
	    current_config = get_current_config()

	    # Prepend the instruction to the query
	    full_prompt = f"""You are a query classifier. Your task is to determine if a user query is requesting a data visualization.

	IMPORTANT: Respond with ONLY 'true' or 'false' (lowercase, no quotes, no punctuation).

	Classify as 'true' ONLY if the query explicitly asks for:
	- A plot, chart, graph, visualization, or figure
	- To "show" or "display" data visually
	- To "create" or "generate" a visual representation
	- Words like: plot, chart, graph, visualize, show, display, create, generate, draw

	Classify as 'false' for:
	- Data analysis without visualization requests
	- Statistical calculations, aggregations, filtering, sorting
	- Questions about data content, counts, summaries
	- Requests for tables, dataframes, or text results

	User query: {query}"""

	    messages = [
	        {"role": "system", "content": current_config.REASONING_FALSE},
	        {"role": "user", "content": full_prompt},
	    ]

	    response = client.chat.completions.create(
	        model=current_config.MODEL_NAME,
	        messages=messages,
	        temperature=current_config.QUERY_UNDERSTANDING_TEMPERATURE,
	        max_tokens=current_config.QUERY_UNDERSTANDING_MAX_TOKENS,  # We only need a short response
	    )

	    # The message content can be None; treat that as "not a plot request".
	    raw_answer = response.choices[0].message.content or ""

	    # Models do not always follow the "no quotes, no punctuation" rule, so
	    # strip those before comparing.
	    verdict = raw_answer.strip().strip("'\"`.!").strip().lower()
	    return verdict == "true"

	# === CodeGeneration TOOLS ============================================


	# ------------------ CodeWritingTool ---------------------------------
	def CodeWritingTool(cols: List[str], query: str) -> str:
	    """Build the LLM prompt for pandas-only code answering ``query`` (no plotting).

	    Args:
	        cols: Column labels of the DataFrame. Labels are converted with
	            ``str`` so non-string labels (e.g. integers) do not break the
	            join.
	        query: The user's question, quoted verbatim in the prompt.

	    Returns:
	        The prompt text, including the rules and an example code fence.
	    """
	    column_list = ', '.join(str(label) for label in cols)

	    return f"""

	Given DataFrame `df` with columns:

	{column_list}

	Write Python code (pandas only, no plotting) to answer:
	"{query}"

	Rules
	-----
	1. Use pandas operations on `df` only.
	2. Rely only on the columns in the DataFrame.
	3. Assign the final result to `result`.
	4. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
	5. Do not include any explanations, comments, or prose outside the code block.
	6. Use df as the sole data source. Do not read files, fetch data, or use Streamlit.
	7. Do not import any libraries (pandas is already imported as pd).
	8. Handle missing values (`dropna`) before aggregations.

	Example
	-----
	```python
	result = df.groupby("some_column")["a_numeric_col"].mean().sort_values(ascending=False)
	```

	"""


	# ------------------ PlotCodeGeneratorTool ---------------------------
	def PlotCodeGeneratorTool(cols: List[str], query: str) -> str:
	    """Build the LLM prompt for pandas + matplotlib code that plots the answer to ``query``.

	    Args:
	        cols: Column labels of the DataFrame. Labels are converted with
	            ``str`` so non-string labels (e.g. integers) do not break the
	            join.
	        query: The user's question, quoted verbatim in the prompt.

	    Returns:
	        The prompt text; the requested figure size is taken from
	        ``DEFAULT_FIGSIZE``.
	    """
	    column_list = ', '.join(str(label) for label in cols)

	    return f"""

	Given DataFrame `df` with columns:

	{column_list}

	Write Python code using pandas and matplotlib (as plt) to answer:
	"{query}"

	Rules
	-----
	1. Use pandas for data manipulation and matplotlib.pyplot (as plt) for plotting.
	2. Rely only on the columns in the DataFrame.
	3. Assign the final result (DataFrame, Series, scalar or matplotlib Figure) to a variable named `result`.
	4. Create only ONE relevant plot. Set `figsize={DEFAULT_FIGSIZE}`, add title/labels.
	5. Return your answer inside a single markdown fence that starts with ```python and ends with ```.
	6. Do not include any explanations, comments, or prose outside the code block.
	7. Handle missing values (`dropna`) before plotting/aggregations.

	"""


	# === CodeGenerationAgent ==============================================

	def CodeGenerationAgent(query: str, df: pd.DataFrame, chat_context: Optional[str] = None):
	    """Pick the plotting or pandas-only prompt and ask the LLM for code.

	    Args:
	        query: The user's question.
	        df: DataFrame whose column labels are listed in the prompt.
	        chat_context: Optional text of earlier user turns, inserted into the
	            prompt so ambiguous references can be resolved.

	    Returns:
	        A tuple ``(code, should_plot, "")``: the first ```python block of the
	        reply (empty string when there is none), the visualisation flag from
	        ``QueryUnderstandingTool``, and an empty string in the third slot.
	    """
	    should_plot = QueryUnderstandingTool(query)

	    columns = df.columns.tolist()
	    prompt = PlotCodeGeneratorTool(columns, query) if should_plot else CodeWritingTool(columns, query)

	    # Prepend the instruction to the query
	    context_section = f"\nConversation context (recent user turns):\n{chat_context}\n" if chat_context else ""

	    full_prompt = f"""You are a senior Python data analyst who writes clean, efficient code.
	Solve the given problem with optimal pandas operations. Be concise and focused.
	Your response must contain ONLY a properly-closed ```python code block with no explanations before or after (starts with ```python and ends with ```).
	Ensure your solution is correct, handles edge cases, and follows best practices for data analysis.
	If the latest user request references prior results ambiguously (e.g., "it", "that", "same groups"), infer intent from the conversation context and choose the most reasonable interpretation. {context_section}{prompt}"""

	    current_config = get_current_config()

	    messages = [
	        {"role": "system", "content": current_config.REASONING_FALSE},
	        {"role": "user", "content": full_prompt},
	    ]

	    response = client.chat.completions.create(
	        model=current_config.MODEL_NAME,
	        messages=messages,
	        temperature=current_config.CODE_GENERATION_TEMPERATURE,
	        max_tokens=current_config.CODE_GENERATION_MAX_TOKENS,
	    )

	    # The message content can be None; substitute an empty string so the
	    # extraction step yields "" instead of raising.
	    full_response = response.choices[0].message.content or ""

	    code = extract_first_code_block(full_response)
	    return code, should_plot, ""

	# === ExecutionAgent ====================================================

	def ExecutionAgent(code: str, df: pd.DataFrame, should_plot: bool):
	    """Run generated code against ``df`` and return what it stored in ``result``.

	    The code runs in a single namespace containing ``pd`` and ``df`` (plus
	    ``plt`` and ``io`` when ``should_plot`` is true). One namespace is used
	    for both globals and locals: with separate dictionaries, names such as
	    ``pd`` or ``df`` are invisible inside lambdas and nested functions
	    defined by the executed code.

	    Args:
	        code: Python source to execute.
	        df: DataFrame exposed to the code as ``df``.
	        should_plot: When true, matplotlib is exposed and the default figure
	            DPI is set before execution.

	    Returns:
	        The value bound to ``result`` (which may be ``None``), the string
	        ``"No result was assigned to 'result' variable"`` when the code never
	        assigned it, or ``"Error executing code: ..."`` when execution raised.
	    """
	    namespace = {"pd": pd, "df": df}

	    if should_plot:
	        plt.rcParams["figure.dpi"] = DEFAULT_DPI  # Set default DPI for all figures
	        namespace["plt"] = plt
	        namespace["io"] = io

	    try:
	        # SECURITY NOTE: this executes model-generated code with the full
	        # privileges of the app process; it is not a sandbox.
	        exec(code, namespace)
	    except Exception as exc:
	        return f"Error executing code: {exc}"

	    if "result" not in namespace:
	        return "No result was assigned to 'result' variable"
	    return namespace["result"]

	# === ReasoningCurator TOOL =========================================
	def ReasoningCurator(query: str, result: Any) -> str:
	    """Compose the prompt asking the LLM to explain ``result`` for ``query``.

	    A matplotlib Figure or Axes is described by its title (falling back to
	    "Chart") and gets the chart-oriented prompt. Anything else gets the
	    value-oriented prompt: an execution error string is passed through in
	    full, other values are stringified and cut to
	    ``MAX_RESULT_DISPLAY_LENGTH`` characters.
	    """
	    if isinstance(result, (plt.Figure, plt.Axes)):
	        if isinstance(result, plt.Figure):
	            # Figure-level title, when one was set.
	            caption = result._suptitle.get_text() if result._suptitle else ""
	        else:
	            caption = result.get_title()
	        summary = f"[Plot Object: {caption or 'Chart'}]"
	        return f'''
	The user asked: "{query}".
	Below is a description of the plot result:
	{summary}
	Explain in 2–3 concise sentences what the chart shows (no code talk).'''

	    failed = isinstance(result, str) and result.startswith("Error executing code")
	    summary = result if failed else str(result)[:MAX_RESULT_DISPLAY_LENGTH]
	    return f'''
	The user asked: "{query}".
	The result value is: {summary}
	Explain in 2–3 concise sentences what this tells about the data (no mention of charts).'''

	# === ReasoningAgent (streaming) =========================================
	def ReasoningAgent(query: str, result: Any):
	    """Stream the LLM's explanation of ``result`` and split thinking from answer.

	    While the reply streams, the text inside the first ``<think>`` block is
	    shown live in a Streamlit placeholder. The thinking text is re-derived
	    from the accumulated reply on every chunk, so tags split across chunks
	    are recognised and the text just before ``</think>`` is kept.

	    Args:
	        query: The user's question.
	        result: The execution result passed to ``ReasoningCurator``.

	    Returns:
	        A tuple ``(thinking_content, cleaned)``: the text of the first
	        ``<think>`` block (empty when there is none) and the reply with
	        every ``<think>...</think>`` block removed, including one left
	        unclosed at the end of the reply.
	    """
	    current_config = get_current_config()
	    prompt = ReasoningCurator(query, result)

	    # Streaming LLM call
	    response = client.chat.completions.create(
	        model=current_config.MODEL_NAME,
	        messages=[
	            {"role": "system", "content": current_config.REASONING_TRUE},
	            {"role": "user", "content": "You are an insightful data analyst. " + prompt},
	        ],
	        temperature=current_config.REASONING_TEMPERATURE,
	        max_tokens=current_config.REASONING_MAX_TOKENS,
	        stream=True,
	    )

	    open_tag, close_tag = "<think>", "</think>"
	    thinking_placeholder = st.empty()
	    full_response = ""
	    thinking_content = ""

	    for chunk in response:
	        # Skip chunks that carry no choices or no text.
	        if not chunk.choices:
	            continue
	        token = chunk.choices[0].delta.content
	        if token is None:
	            continue
	        full_response += token

	        start = full_response.find(open_tag)
	        if start == -1:
	            continue
	        start += len(open_tag)
	        end = full_response.find(close_tag, start)
	        latest = full_response[start:] if end == -1 else full_response[start:end]

	        # Redraw only when the visible thinking text actually changed.
	        if latest != thinking_content:
	            thinking_content = latest
	            thinking_placeholder.markdown(
	                f'<details class="thinking" open><summary>🤔 Model Thinking</summary><pre>{thinking_content}</pre></details>',
	                unsafe_allow_html=True,
	            )

	    # After streaming, keep only the text outside <think> blocks. The \Z
	    # alternative also removes a block that was cut off before its closing
	    # tag (for instance when the token limit was reached).
	    cleaned = re.sub(r"<think>.*?(?:</think>|\Z)", "", full_response, flags=re.DOTALL).strip()
	    return thinking_content, cleaned

	# === DataFrameSummary TOOL (pandas only) =========================================
	def DataFrameSummaryTool(df: pd.DataFrame) -> str:
	    """Build the LLM prompt that asks for a short description of ``df``.

	    The prompt lists the row and column counts, the column labels, the dtype
	    of each column and the number of missing values per column. Column
	    labels are converted with ``str`` so frames with non-string labels
	    (e.g. integer headers) can be summarised too.

	    Args:
	        df: The dataset to describe.

	    Returns:
	        The prompt text.
	    """
	    column_list = ', '.join(map(str, df.columns))
	    prompt = f"""
	Given a dataset with {len(df)} rows and {len(df.columns)} columns:
	Columns: {column_list}
	Data types: {df.dtypes.to_dict()}
	Missing values: {df.isnull().sum().to_dict()}

	Provide:
	1. A brief description of what this dataset contains
	2. 3-4 possible data analysis questions that could be explored
	Keep it concise and focused."""
	    return prompt

	# === DataInsightAgent (upload-time only) ===============================

	def DataInsightAgent(df: pd.DataFrame) -> str:
	    """Ask the LLM for a brief dataset summary and suggested questions.

	    Args:
	        df: The uploaded dataset; it is described to the model through
	            ``DataFrameSummaryTool``.

	    Returns:
	        The content of the model's reply.

	    Raises:
	        Exception: When the LLM call or reading its reply fails. The original
	            error is attached as ``__cause__``.
	    """
	    current_config = get_current_config()
	    prompt = DataFrameSummaryTool(df)
	    try:
	        response = client.chat.completions.create(
	            model=current_config.MODEL_NAME,
	            messages=[
	                {"role": "system", "content": current_config.REASONING_FALSE},
	                {"role": "user", "content": "You are a data analyst providing brief, focused insights. " + prompt},
	            ],
	            temperature=current_config.INSIGHTS_TEMPERATURE,
	            max_tokens=current_config.INSIGHTS_MAX_TOKENS,
	        )
	        return response.choices[0].message.content
	    except Exception as exc:
	        # Chain explicitly so the traceback shows the underlying failure.
	        raise Exception(f"Error generating dataset insights: {exc}") from exc

	# === Helpers ===========================================================

	def extract_first_code_block(text: str) -> str:
	    """Return the body of the first ```python fenced block in ``text``.

	    The language tag is matched case-insensitively, so ```Python is accepted
	    as well.

	    Args:
	        text: Markdown-formatted model output; ``None`` or an empty string is
	            accepted.

	    Returns:
	        The code between the opening fence and the next closing fence with
	        surrounding whitespace stripped, or ``""`` when ``text`` is empty or
	        contains no complete python fence.
	    """
	    if not text:
	        return ""
	    # Non-greedy so the block ends at the first closing fence.
	    fenced = re.search(r"```python(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
	    if fenced is None:
	        return ""
	    return fenced.group(1).strip()

	# === Main Streamlit App ===============================================

	def main():
	"""Render the Streamlit page: header, sidebar (model picker, CSV upload, dataset insights) and the chat panel."""
	# NOTE(review): leading indentation is not preserved in this copy of the
	# file. The nesting implied by the comments below is inferred from
	# statement order — confirm against the original before relying on it.
	st.set_page_config(layout="wide")
	# Initialise per-session state on the first run.
	if "plots" not in st.session_state:
	st.session_state.plots = []
	if "current_model" not in st.session_state:
	st.session_state.current_model = DEFAULT_MODEL

	# Page logo at top right corner, large and clickable
	st.markdown(
	"""
	<div style='position: absolute; top: 20px; right: 30px; z-index: 999;'>
	<a href='https://www.linkedin.com/in/thiresh-sidda/' target='_blank'>
	<img src='https://ih1.redbubble.net/image.1849728168.3104/raf,360x360,075,t,fafafa:ca443f4786.jpg' alt='Logo' style='height:120px; border-radius:20px; box-shadow:0 2px 12px rgba(0,0,0,0.15);'>
	</a>
	</div>
	""",
	unsafe_allow_html=True
	)
	# Main title centered with large font and GIF
	st.markdown(
	"""
	<div style='display: flex; align-items: center; justify-content: center; margin-bottom: 30px;'>
	<span style='color:#1976D2; font-weight:bold; font-size:3.5em; margin-right:30px;'>Data Analysis Agent</span>
	<img src='https://cdn.dribbble.com/userupload/23161671/file/original-4c7894556285d8f223ab21fd10554fe4.gif' alt='GIF' style='height:120px;'>
	</div>
	""",
	unsafe_allow_html=True
	)

	medium_blue = "#1976D2" # Medium blue color

	# Move left panel to sidebar
	with st.sidebar:
	st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.5em;'>Insights Generator</span>", unsafe_allow_html=True)
	# Model picker: options are the MODEL_CONFIGS keys, labels come from
	# each entry's MODEL_PRINT_NAME, and the current model is preselected.
	available_models = list(MODEL_CONFIGS.keys())
	model_display_names = {key: MODEL_CONFIGS[key].MODEL_PRINT_NAME for key in available_models}
	selected_model = st.selectbox(
	"Select Model",
	options=available_models,
	format_func=lambda x: model_display_names[x],
	index=available_models.index(st.session_state.current_model)
	)
	# NOTE(review): display_config is assigned but not read anywhere in this function.
	display_config = MODEL_CONFIGS[selected_model]
	file = st.file_uploader("Choose CSV", type=["csv"], key="csv_uploader")
	# Update configuration if model changed
	# A model switch clears the stored chat messages and plots and, when a
	# dataset is loaded, regenerates its insights before rerunning.
	if selected_model != st.session_state.current_model:
	st.session_state.current_model = selected_model
	new_config = MODEL_CONFIGS[selected_model]
	if "messages" in st.session_state:
	st.session_state.messages = []
	if "plots" in st.session_state:
	st.session_state.plots = []
	if "df" in st.session_state and file is not None:
	with st.spinner("Generating dataset insights with new model …"):
	try:
	st.session_state.insights = DataInsightAgent(st.session_state.df)
	st.success(f"Insights updated with {new_config.MODEL_PRINT_NAME}")
	except Exception as e:
	st.error(f"Error updating insights: {str(e)}")
	if "insights" in st.session_state:
	del st.session_state.insights
	st.rerun()
	# No file in the uploader but a dataset is still cached: drop the cached
	# DataFrame, file name and insights, then rerun.
	if not file and "df" in st.session_state and "current_file" in st.session_state:
	del st.session_state.df
	del st.session_state.current_file
	if "insights" in st.session_state:
	del st.session_state.insights
	st.rerun()
	# A file is present: load it when nothing is cached or the file name
	# differs from the cached one; loading also resets the chat messages.
	# NOTE(review): change detection compares file names only.
	if file:
	if ("df" not in st.session_state) or (st.session_state.get("current_file") != file.name):
	st.session_state.df = pd.read_csv(file)
	st.session_state.current_file = file.name
	st.session_state.messages = []
	with st.spinner("Generating dataset insights …"):
	try:
	st.session_state.insights = DataInsightAgent(st.session_state.df)
	except Exception as e:
	st.error(f"Error generating insights: {str(e)}")
	# Dataset already loaded but no insights stored: generate them.
	elif "insights" not in st.session_state:
	with st.spinner("Generating dataset insights …"):
	try:
	st.session_state.insights = DataInsightAgent(st.session_state.df)
	except Exception as e:
	st.error(f"Error generating insights: {str(e)}")
	# Show a preview of the data with the insights, or a hint when no dataset is loaded.
	if "df" in st.session_state:
	st.markdown(f"<span style='color:{medium_blue}; font-weight:bold; font-size:1.2em;'>Your Dataset Insights</span>", unsafe_allow_html=True)
	if "insights" in st.session_state and st.session_state.insights:
	st.dataframe(st.session_state.df.head())
	st.markdown(f"<span style='color:{medium_blue};'>{st.session_state.insights}</span>", unsafe_allow_html=True)
	# NOTE(review): current_config_left is only referenced by the commented-out line below.
	current_config_left = get_current_config()
	#st.markdown(f"<span style='color: grey; font-style: italic;'>Generated with {current_config_left.MODEL_PRINT_NAME}</span>", unsafe_allow_html=True)
	else:
	st.warning("No insights available.")
	else:
	st.info("Upload a CSV to begin chatting with your data.")

	# Chat panel.
	with st.container():
	st.markdown(
	f"""
	<div style='display: flex; align-items: center; justify-content: flex-start; margin-bottom: 10px;'>
	<span style='color:{medium_blue}; font-weight:bold; font-size:2em; margin-right:20px;'>Chat with your data</span>
	<img src='https://i.pinimg.com/originals/5f/d5/58/5fd558f8b7a4f9e2138709cbe63c7052.gif' alt='Chat GIF' style='height:48px;'>
	</div>
	""",
	unsafe_allow_html=True
	)
	if "df" in st.session_state:
	current_config_right = get_current_config()
	st.markdown(f"<span style='color: grey; font-style: italic;'>Using {current_config_right.MODEL_PRINT_NAME}</span>", unsafe_allow_html=True)
	if "messages" not in st.session_state:
	st.session_state.messages = []

	# "Clear chat" button placed in the narrow second column.
	clear_col1, clear_col2 = st.columns([9,1])
	with clear_col2:
	if st.button("Clear chat"):
	st.session_state.messages = []
	st.session_state.plots = []
	st.rerun()

	# Replay the stored conversation; a message carrying a valid plot_index
	# also re-renders the figure stored at that index.
	# NOTE(review): message content is inserted into HTML without escaping
	# and rendered with unsafe_allow_html.
	for msg in st.session_state.messages:
	with st.chat_message(msg["role"]):
	st.markdown(f"<span style='color:{medium_blue}; font-size:1.1em;'>{msg['content']}</span>", unsafe_allow_html=True)
	if msg.get("plot_index") is not None:
	idx = msg["plot_index"]
	if 0 <= idx < len(st.session_state.plots):
	st.pyplot(st.session_state.plots[idx], use_container_width=False)

	# Handle a newly submitted question (only when a dataset is loaded).
	if "df" in st.session_state:
	if user_q := st.chat_input("Ask about your data…"):
	st.session_state.messages.append({"role": "user", "content": user_q})
	with st.spinner("Working …"):
	# Take the last three user messages (the new one included); all but the
	# newest are passed on as conversation context.
	recent_user_turns = [m["content"] for m in st.session_state.messages if m["role"] == "user"][-3:]
	context_text = "\n".join(recent_user_turns[:-1]) if len(recent_user_turns) > 1 else None
	# Pipeline: generate code, execute it, then ask the model to explain the result.
	code, should_plot_flag, code_thinking = CodeGenerationAgent(user_q, st.session_state.df, context_text)
	result_obj = ExecutionAgent(code, st.session_state.df, should_plot_flag)
	raw_thinking, reasoning_txt = ReasoningAgent(user_q, result_obj)
	# Remove backticks from the explanation text.
	reasoning_txt = reasoning_txt.replace("`", "")

	# Figures are kept in session state and referenced from the message by index.
	is_plot = isinstance(result_obj, (plt.Figure, plt.Axes))
	plot_idx = None
	if is_plot:
	fig = result_obj.figure if isinstance(result_obj, plt.Axes) else result_obj
	st.session_state.plots.append(fig)
	plot_idx = len(st.session_state.plots) - 1
	header = "Here is the visualization you requested:"
	elif isinstance(result_obj, (pd.DataFrame, pd.Series)):
	header = f"Result: {len(result_obj)} rows" if isinstance(result_obj, pd.DataFrame) else "Result series"
	else:
	header = f"Result: {result_obj}"
	# NOTE(review): header is assigned in every branch above but is not
	# included in assistant_msg below.

	thinking_html = ""
	if raw_thinking:
	thinking_html = (
	'<details class="thinking">'
	'<summary>🧠 Reasoning</summary>'
	f'<pre>{raw_thinking}</pre>'
	'</details>'
	)

	explanation_html = reasoning_txt

	# NOTE(review): the generated code is embedded in HTML without escaping.
	code_html = (
	'<details class="code">'
	'<summary>View code</summary>'
	'<pre><code class="language-python">'
	f'{code}'
	'</code></pre>'
	'</details>'
	)
	assistant_msg = f"{thinking_html}{explanation_html}\n\n{code_html}"

	st.session_state.messages.append({
	"role": "assistant",
	"content": assistant_msg,
	"plot_index": plot_idx
	})
	# Rerun so the appended messages are drawn by the replay loop above.
	st.rerun()

	# Run the app only when this file is executed as the main script.
	if __name__ == "__main__":
	main()