import ast
import json
import pickle
import sqlite3
import threading
from functools import lru_cache
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from scipy import stats

# Define column schemas
PARSED_RESULTS_COLUMNS = {
    'benchmark_name': 'TEXT',
    'agent_name': 'TEXT',
    'date': 'TEXT',
    'run_id': 'TEXT',
    'successful_tasks': 'TEXT',
    'failed_tasks': 'TEXT',
    'total_cost': 'REAL',
    'accuracy': 'REAL',
    'precision': 'REAL',
    'recall': 'REAL',
    'f1_score': 'REAL',
    'auc': 'REAL',
    'overall_score': 'REAL',
    'vectorization_score': 'REAL',
    'fathomnet_score': 'REAL',
    'feedback_score': 'REAL',
    'house_price_score': 'REAL',
    'spaceship_titanic_score': 'REAL',
    'amp_parkinsons_disease_progression_prediction_score': 'REAL',
    'cifar10_score': 'REAL',
    'imdb_score': 'REAL',
    'level_1_accuracy': 'REAL',
    'level_2_accuracy': 'REAL',
    'level_3_accuracy': 'REAL',
    'task_goal_completion': 'REAL',  # New column
    'scenario_goal_completion': 'REAL',  # New column
    'accuracy_ci': 'TEXT',  # Using TEXT since it stores formatted strings like "-0.123/+0.456"
    'cost_ci': 'TEXT',
}
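
# Note: 'successful_tasks' and 'failed_tasks' hold Python list literals that are
# serialized with str() when rows are inserted (see preprocess_traces) and parsed
# back with ast.literal_eval() when read (see get_task_success_data).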

# Define which columns should be included in aggregation and how
AGGREGATION_RULES = {
    'date': 'first',
    'total_cost': 'mean',
    'accuracy': 'mean',
    'precision': 'mean',
    'recall': 'mean',
    'f1_score': 'mean',
    'auc': 'mean',
    'overall_score': 'mean',
    'vectorization_score': 'mean',
    'fathomnet_score': 'mean',
    'feedback_score': 'mean',
    'house_price_score': 'mean',
    'spaceship_titanic_score': 'mean',
    'amp_parkinsons_disease_progression_prediction_score': 'mean',
    'cifar10_score': 'mean',
    'imdb_score': 'mean',
    'level_1_accuracy': 'mean',
    'level_2_accuracy': 'mean',
    'level_3_accuracy': 'mean',
    'task_goal_completion': 'mean',
    'scenario_goal_completion': 'mean',
    'Verified': 'first',
    'Runs': 'first',
    'Traces': 'first',
    'accuracy_ci': 'first',
    'cost_ci': 'first',
}

# Define column display names
COLUMN_DISPLAY_NAMES = {
    'agent_name': 'Agent Name',
    'date': 'Date',
    'total_cost': 'Total Cost',
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1_score': 'F1 Score',
    'auc': 'AUC',
    'overall_score': 'Overall Score',
    'vectorization_score': 'Vectorization Score',
    'fathomnet_score': 'Fathomnet Score',
    'feedback_score': 'Feedback Score',
    'house_price_score': 'House Price Score',
    'spaceship_titanic_score': 'Spaceship Titanic Score',
    'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
    'cifar10_score': 'CIFAR10 Score',
    'imdb_score': 'IMDB Score',
    'level_1_accuracy': 'Level 1 Accuracy',
    'level_2_accuracy': 'Level 2 Accuracy',
    'level_3_accuracy': 'Level 3 Accuracy',
    'task_goal_completion': 'Task Goal Completion',
    'scenario_goal_completion': 'Scenario Goal Completion',
    'accuracy_ci': 'Accuracy CI',
    'cost_ci': 'Total Cost CI',
}

DEFAULT_PRICING = {
    "text-embedding-3-small": {"prompt_tokens": 0.02, "completion_tokens": 0},
    "text-embedding-3-large": {"prompt_tokens": 0.13, "completion_tokens": 0},
    "gpt-4o-2024-05-13": {"prompt_tokens": 2.5, "completion_tokens": 10},
    "gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
    "gpt-3.5-turbo-0125": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
    "gpt-3.5-turbo": {"prompt_tokens": 0.5, "completion_tokens": 1.5},
    "gpt-4-turbo-2024-04-09": {"prompt_tokens": 10, "completion_tokens": 30},
    "gpt-4-turbo": {"prompt_tokens": 10, "completion_tokens": 30},
    "gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
    "meta-llama/Meta-Llama-3.1-8B-Instruct": {"prompt_tokens": 0.18, "completion_tokens": 0.18},
    "meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
    "meta-llama/Meta-Llama-3.1-405B-Instruct": {"prompt_tokens": 5, "completion_tokens": 15},
    "gpt-4o": {"prompt_tokens": 2.5, "completion_tokens": 10},
    "o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
    "o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
    "claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
    "claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
    "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {"prompt_tokens": 3, "completion_tokens": 15},
    "us.anthropic.claude-3-5-sonnet-20241022-v2:0": {"prompt_tokens": 3, "completion_tokens": 15},
    "openai/gpt-4o-2024-11-20": {"prompt_tokens": 2.5, "completion_tokens": 10},
    "openai/gpt-4o-2024-08-06": {"prompt_tokens": 2.5, "completion_tokens": 10},
    "openai/gpt-4o-mini-2024-07-18": {"prompt_tokens": 0.15, "completion_tokens": 0.6},
    "openai/o1-mini-2024-09-12": {"prompt_tokens": 3, "completion_tokens": 12},
    "openai/o1-preview-2024-09-12": {"prompt_tokens": 15, "completion_tokens": 60},
    "anthropic/claude-3-5-sonnet-20240620": {"prompt_tokens": 3, "completion_tokens": 15},
    "anthropic/claude-3-5-sonnet-20241022": {"prompt_tokens": 3, "completion_tokens": 15},
    "google/gemini-1.5-pro": {"prompt_tokens": 1.25, "completion_tokens": 5},
    "google/gemini-1.5-flash": {"prompt_tokens": 0.075, "completion_tokens": 0.3},
    "together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {"prompt_tokens": 3.5, "completion_tokens": 3.5},
    "together/meta-llama/Meta-Llama-3.1-70B-Instruct": {"prompt_tokens": 0.88, "completion_tokens": 0.88},
}
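
# The prices above are interpreted as USD per 1M tokens (see the /1e6 scaling in
# get_token_usage_with_costs). As a rough, illustrative example not taken from the
# source data: 2,000,000 prompt tokens and 500,000 completion tokens billed against
# "gpt-4o-mini-2024-07-18" would cost 2.0 * 0.15 + 0.5 * 0.6 = 0.60 USD at these rates.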

class TracePreprocessor:
    def __init__(self, db_dir='preprocessed_traces'):
        self.db_dir = Path(db_dir)
        self.db_dir.mkdir(exist_ok=True)
        self.local = threading.local()
        self.connections = {}  # note: live connections are kept per-thread on self.local (see get_conn)

    def get_conn(self, benchmark_name):
        # Sanitize benchmark name for filename
        safe_name = benchmark_name.replace('/', '_').replace('\\', '_')
        db_path = self.db_dir / f"{safe_name}.db"

        # Get the thread-specific connections dictionary so SQLite connections are never shared across threads
        if not hasattr(self.local, 'connections'):
            self.local.connections = {}

        # Create a new connection for this benchmark if one does not exist yet
        if safe_name not in self.local.connections:
            self.local.connections[safe_name] = sqlite3.connect(db_path)

        return self.local.connections[safe_name]

    def create_tables(self, benchmark_name):
        with self.get_conn(benchmark_name) as conn:
            # Create parsed_results table dynamically from schema
            columns = [f"{col} {dtype}" for col, dtype in PARSED_RESULTS_COLUMNS.items()]
            create_parsed_results = f'''
                CREATE TABLE IF NOT EXISTS parsed_results (
                    {', '.join(columns)},
                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                )
            '''
            conn.execute(create_parsed_results)

            conn.execute('''
                CREATE TABLE IF NOT EXISTS preprocessed_traces (
                    benchmark_name TEXT,
                    agent_name TEXT,
                    date TEXT,
                    run_id TEXT,
                    raw_logging_results BLOB,
                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                )
            ''')

            conn.execute('''
                CREATE TABLE IF NOT EXISTS failure_reports (
                    benchmark_name TEXT,
                    agent_name TEXT,
                    date TEXT,
                    run_id TEXT,
                    failure_report BLOB,
                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                )
            ''')

            conn.execute('''
                CREATE TABLE IF NOT EXISTS token_usage (
                    benchmark_name TEXT,
                    agent_name TEXT,
                    run_id TEXT,
                    model_name TEXT,
                    prompt_tokens INTEGER,
                    completion_tokens INTEGER,
                    input_tokens INTEGER,
                    output_tokens INTEGER,
                    total_tokens INTEGER,
                    input_tokens_cache_write INTEGER,
                    input_tokens_cache_read INTEGER,
                    PRIMARY KEY (benchmark_name, agent_name, run_id, model_name)
                )
            ''')

    def preprocess_traces(self, processed_dir="evals_live"):
        processed_dir = Path(processed_dir)
        for file in processed_dir.glob('*.json'):
            with open(file, 'r') as f:
                data = json.load(f)
                agent_name = data['config']['agent_name']
                benchmark_name = data['config']['benchmark_name']
                if "inspect" in benchmark_name:
                    benchmark_name = benchmark_name.split("/")[-1]
                date = data['config']['date']
                config = data['config']

                # Create tables for this benchmark if they don't exist
                self.create_tables(benchmark_name)

                try:
                    raw_logging_results = pickle.dumps(data['raw_logging_results'])
                    with self.get_conn(benchmark_name) as conn:
                        conn.execute('''
                            INSERT OR REPLACE INTO preprocessed_traces
                            (benchmark_name, agent_name, date, run_id, raw_logging_results)
                            VALUES (?, ?, ?, ?, ?)
                        ''', (benchmark_name, agent_name, date, config['run_id'], raw_logging_results))
                except Exception as e:
                    print(f"Error preprocessing raw_logging_results in {file}: {e}")

                try:
                    failure_report = pickle.dumps(data['failure_report'])
                    with self.get_conn(benchmark_name) as conn:
                        conn.execute('''
                            INSERT INTO failure_reports
                            (benchmark_name, agent_name, date, run_id, failure_report)
                            VALUES (?, ?, ?, ?, ?)
                        ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
                except Exception as e:
                    print(f"Error preprocessing failure_report in {file}: {e}")

                try:
                    results = data['results']
                    with self.get_conn(benchmark_name) as conn:
                        # Dynamically create placeholders and values list
                        columns = [col for col in PARSED_RESULTS_COLUMNS.keys()
                                   if col not in ['benchmark_name', 'agent_name', 'date', 'run_id']]
                        placeholders = ','.join(['?'] * (len(columns) + 4))  # +4 for benchmark_name, agent_name, date, run_id

                        values = [
                            benchmark_name,
                            agent_name,
                            config['date'],
                            config['run_id']
                        ] + [str(results.get(col)) if col in ['successful_tasks', 'failed_tasks']
                             else results.get(col) for col in columns]

                        query = f'''
                            INSERT INTO parsed_results
                            ({', '.join(PARSED_RESULTS_COLUMNS.keys())})
                            VALUES ({placeholders})
                        '''
                        conn.execute(query, values)
                except Exception as e:
                    print(f"Error preprocessing parsed results in {file}: {e}")

                try:
                    total_usage = data.get('total_usage', {})
                    for model_name, usage in total_usage.items():
                        with self.get_conn(benchmark_name) as conn:
                            conn.execute('''
                                INSERT INTO token_usage
                                (benchmark_name, agent_name, run_id, model_name,
                                 prompt_tokens, completion_tokens, input_tokens, output_tokens, total_tokens,
                                 input_tokens_cache_write, input_tokens_cache_read)
                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                            ''', (
                                benchmark_name,
                                agent_name,
                                config['run_id'],
                                model_name,
                                usage.get('prompt_tokens', 0),
                                usage.get('completion_tokens', 0),
                                usage.get('input_tokens', 0),
                                usage.get('output_tokens', 0),
                                usage.get('total_tokens', 0),
                                usage.get('input_tokens_cache_write', 0),
                                usage.get('input_tokens_cache_read', 0)
                            ))
                except Exception as e:
                    print(f"Error preprocessing token usage in {file}: {e}")

    def get_analyzed_traces(self, agent_name, benchmark_name):
        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
                WHERE benchmark_name = ? AND agent_name = ?
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))

        # Check for each row that raw_logging_results is not None
        df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]

        if len(df) == 0:
            return None

        # Select the latest run
        df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()

        return pickle.loads(df['raw_logging_results'][0])

    def get_failure_report(self, agent_name, benchmark_name):
        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT agent_name, date, failure_report FROM failure_reports
                WHERE benchmark_name = ? AND agent_name = ?
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))

        # Check for each row that failure_report is not None
        df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]

        if len(df) == 0:
            return None

        # Select the latest run
        df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()

        return pickle.loads(df['failure_report'][0])

    def _calculate_ci(self, data, confidence=0.95, type='minmax'):
        data = data[np.isfinite(data)]

        if len(data) < 2:
            return '', '', ''  # No CI for fewer than 2 samples

        n = len(data)
        mean = np.mean(data)

        if type == 't':
            # Student-t confidence interval around the mean
            sem = stats.sem(data)
            ci = stats.t.interval(confidence, n - 1, loc=mean, scale=sem)
        elif type == 'minmax':
            # Not a true confidence interval: simply the observed min/max range
            lower = np.min(data)
            upper = np.max(data)
            ci = (lower, upper)
        else:
            raise ValueError(f"Unknown CI type: {type}")

        return mean, ci[0], ci[1]

    def get_parsed_results(self, benchmark_name, aggregate=True):
        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT * FROM parsed_results
                WHERE benchmark_name = ?
                ORDER BY accuracy DESC
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))

        # Load agent metadata
        with open('agents_metadata.yaml', 'r') as f:
            metadata = yaml.safe_load(f)

        # Create URL mapping
        url_mapping = {}
        if benchmark_name in metadata:
            for agent in metadata[benchmark_name]:
                if 'url' in agent and agent['url']:  # Only add if URL exists and is not empty
                    url_mapping[agent['agent_name']] = agent['url']

        # Add 'Verified' column
        verified_agents = self.load_verified_agents()
        df['Verified'] = df.apply(
            lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '',
            axis=1
        )

        # Add URLs to agent names if they exist
        df['agent_name'] = df['agent_name'].apply(
            lambda x: f'[{x}]({url_mapping[x]})' if x in url_mapping else x
        )

        # Add column for how many times an agent_name appears in the DataFrame
        df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')

        # Compute an accuracy/cost interval for agents that have been run more than once
        # (with type='minmax' this is the observed range, not a true 95% CI)
        df['accuracy_ci'] = None
        df['cost_ci'] = None

        # Round float columns to 2 decimal places
        float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc',
                         'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score',
                         'house_price_score', 'spaceship_titanic_score',
                         'amp_parkinsons_disease_progression_prediction_score', 'cifar10_score',
                         'imdb_score', 'level_1_accuracy', 'level_2_accuracy', 'level_3_accuracy']
        for column in float_columns:
            if column in df.columns:
                df[column] = df[column].round(2)

        for agent_name in df['agent_name'].unique():
            agent_df = df[df['agent_name'] == agent_name]
            if len(agent_df) > 1:
                accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'], type='minmax')
                cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'], type='minmax')

                # Format the interval as "-lower_delta/+upper_delta", rounded to 2 decimals
                accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.2f}/+{abs(accuracy_mean - accuracy_upper):.2f}"
                cost_ci = f"-{abs(cost_mean - cost_lower):.2f}/+{abs(cost_mean - cost_upper):.2f}"

                df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
                df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci

        # Before dropping run_id, create a Traces column from it with a download link
        df['Traces'] = df['run_id'].apply(
            lambda x: f'[load](https://huggingface.co/datasets/agent-evals/agent_traces/resolve/main/{x}.zip?download=true)'
            if x else ''
        )

        df = df.drop(columns=['successful_tasks', 'failed_tasks'])

        if aggregate:
            df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()

        # Rename columns using the display names mapping
        df = df.rename(columns=COLUMN_DISPLAY_NAMES)

        # Sort by Accuracy in descending order
        df = df.sort_values('Accuracy', ascending=False)

        return df

    def get_task_success_data(self, benchmark_name):
        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT agent_name, accuracy, successful_tasks, failed_tasks
                FROM parsed_results
                WHERE benchmark_name = ?
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))

        # For agent_names that have been run more than once, keep the run with the highest accuracy
        df = df.sort_values('accuracy', ascending=False).groupby('agent_name').first().reset_index()

        # Get all unique task IDs
        task_ids = set()
        for tasks in df['successful_tasks']:
            if ast.literal_eval(tasks) is not None:
                task_ids.update(ast.literal_eval(tasks))
        for tasks in df['failed_tasks']:
            if ast.literal_eval(tasks) is not None:
                task_ids.update(ast.literal_eval(tasks))

        # Create a DataFrame with agent_name, task_id, and success columns
        data_list = []
        for _, row in df.iterrows():
            agent_name = row['agent_name']
            # successful_tasks is stored as a stringified list, so parse it before the
            # membership check (a plain `in` on the raw string would do substring matching)
            successful = ast.literal_eval(row['successful_tasks'])
            successful = set(successful) if successful is not None else set()
            for task_id in task_ids:
                success = 1 if task_id in successful else 0
                data_list.append({
                    'agent_name': agent_name,
                    'task_id': task_id,
                    'success': success
                })
        df = pd.DataFrame(data_list)

        df = df.rename(columns={
            'agent_name': 'Agent Name',
            'task_id': 'Task ID',
            'success': 'Success'
        })

        return df

    def load_verified_agents(self, file_path='agents_metadata.yaml'):
        with open(file_path, 'r') as f:
            metadata = yaml.safe_load(f)

        verified_agents = set()
        for benchmark, agents in metadata.items():
            for agent in agents:
                if 'verification_date' in agent:  # Only add if verified
                    verified_agents.add((benchmark, agent['agent_name']))

        return verified_agents

    def get_token_usage_with_costs(self, benchmark_name, pricing_config=None):
        """Get token usage data with configurable pricing"""
        if pricing_config is None:
            pricing_config = DEFAULT_PRICING

        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT agent_name, model_name,
                       SUM(prompt_tokens) as prompt_tokens,
                       SUM(completion_tokens) as completion_tokens,
                       SUM(input_tokens) as input_tokens,
                       SUM(output_tokens) as output_tokens,
                       SUM(total_tokens) as total_tokens,
                       SUM(input_tokens_cache_write) as input_tokens_cache_write,
                       SUM(input_tokens_cache_read) as input_tokens_cache_read
                FROM token_usage
                WHERE benchmark_name = ?
                GROUP BY agent_name, model_name
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))

        # Calculate costs based on pricing config (prices are per 1M tokens).
        # Note: both the prompt/completion and input/output counters are billed below,
        # which assumes a given model reports only one of the two naming schemes.
        df['total_cost'] = 0.0
        for model, prices in pricing_config.items():
            mask = df['model_name'] == model
            df.loc[mask, 'total_cost'] = (
                df.loc[mask, 'input_tokens'] * prices['prompt_tokens'] / 1e6 +
                df.loc[mask, 'output_tokens'] * prices['completion_tokens'] / 1e6 +
                df.loc[mask, 'input_tokens_cache_read'] * prices['prompt_tokens'] / 1e6 +
                df.loc[mask, 'input_tokens_cache_write'] * prices['prompt_tokens'] / 1e6 +
                df.loc[mask, 'prompt_tokens'] * prices['prompt_tokens'] / 1e6 +
                df.loc[mask, 'completion_tokens'] * prices['completion_tokens'] / 1e6
            )

        return df

    def get_parsed_results_with_costs(self, benchmark_name, pricing_config=None, aggregate=True):
        """Get parsed results with costs recalculated from token usage"""
        # Get base results
        results_df = self.get_parsed_results(benchmark_name, aggregate=False)

        # Get token usage with new costs
        token_costs = self.get_token_usage_with_costs(benchmark_name, pricing_config)

        # Group token costs by agent
        agent_costs = token_costs.groupby('agent_name')['total_cost'].sum().reset_index()
        agent_costs = agent_costs.rename(columns={
            'agent_name': 'agent_name_temp',
            'total_cost': 'Total Cost'
        })

        # Drop existing Total Cost column if it exists
        if 'Total Cost' in results_df.columns:
            results_df = results_df.drop('Total Cost', axis=1)

        # Create a temp column holding the plain agent name, since get_parsed_results
        # may have wrapped agent_name in a markdown URL
        results_df['agent_name_temp'] = results_df['Agent Name'].apply(
            lambda x: x.split('[')[1].split(']')[0] if '[' in x else x
        )

        # Update costs in results
        results_df = results_df.merge(agent_costs, on='agent_name_temp', how='left')

        # Drop temp column
        results_df = results_df.drop('agent_name_temp', axis=1)

        if aggregate:
            # Aggregate results
            results_df = results_df.groupby('Agent Name').agg({
                'Date': 'first',
                'Total Cost': 'mean',
                'Accuracy': 'mean',
                'Precision': 'mean',
                'Recall': 'mean',
                'F1 Score': 'mean',
                'AUC': 'mean',
                'Overall Score': 'mean',
                'Vectorization Score': 'mean',
                'Fathomnet Score': 'mean',
                'Feedback Score': 'mean',
                'House Price Score': 'mean',
                'Spaceship Titanic Score': 'mean',
                'AMP Parkinsons Disease Progression Prediction Score': 'mean',
                'CIFAR10 Score': 'mean',
                'IMDB Score': 'mean',
                'Level 1 Accuracy': 'mean',
                'Level 2 Accuracy': 'mean',
                'Level 3 Accuracy': 'mean',
                'Verified': 'first',
                'Traces': 'first',
                'Runs': 'first',
                'Accuracy CI': 'first',
                'Total Cost CI': 'first'
            }).reset_index()

        # Round the cost values
        results_df['Total Cost'] = results_df['Total Cost'].round(3)

        # Sort by Accuracy in descending order
        results_df = results_df.sort_values('Accuracy', ascending=False)

        return results_df

    def check_token_usage_data(self, benchmark_name):
        """Debug helper to check token usage data"""
        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT * FROM token_usage
                WHERE benchmark_name = ?
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
            return df

    def get_models_for_benchmark(self, benchmark_name):
        """Get list of unique model names used in a benchmark"""
        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT DISTINCT model_name
                FROM token_usage
                WHERE benchmark_name = ?
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
            return df['model_name'].tolist()

    def get_all_agents(self, benchmark_name):
        """Get list of all agent names for a benchmark"""
        with self.get_conn(benchmark_name) as conn:
            query = '''
                SELECT DISTINCT agent_name
                FROM parsed_results
                WHERE benchmark_name = ?
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
            return df['agent_name'].tolist()

    def get_total_benchmarks(self):
        """Get the total number of unique benchmarks in the database"""
        benchmarks = set()
        for db_file in self.db_dir.glob('*.db'):
            benchmarks.add(db_file.stem.replace('_', '/'))
        return len(benchmarks) - 1  # TODO hardcoded -1 for mlagentbench

    def get_total_agents(self):
        """Get the total number of unique agents across all benchmarks"""
        total_agents = set()

        # Use the parsed_results table since it's guaranteed to have all benchmark-agent pairs
        for db_file in self.db_dir.glob('*.db'):
            # skip mlagentbench
            if db_file.stem == 'mlagentbench':
                continue  # TODO remove hardcoded skip for mlagentbench

            benchmark_name = db_file.stem.replace('_', '/')
            with self.get_conn(benchmark_name) as conn:
                query = '''
                    SELECT DISTINCT benchmark_name, agent_name
                    FROM parsed_results
                '''
                results = conn.execute(query).fetchall()
                # Add each benchmark-agent pair to the set
                total_agents.update(results)

        return len(total_agents)

if __name__ == '__main__':
    preprocessor = TracePreprocessor()
    preprocessor.preprocess_traces()
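    # A minimal sketch of how the preprocessed data might be queried afterwards,
    # using only methods defined above ('usaco' is a hypothetical benchmark name,
    # not taken from this file):
    #
    #   leaderboard = preprocessor.get_parsed_results('usaco', aggregate=True)
    #   models = preprocessor.get_models_for_benchmark('usaco')
    #   custom_pricing = {m: DEFAULT_PRICING[m] for m in models if m in DEFAULT_PRICING}
    #   with_costs = preprocessor.get_parsed_results_with_costs('usaco', pricing_config=custom_pricing)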