from pathlib import Path
import json
import os

import gradio as gr
from huggingface_hub import snapshot_download
from gradio_leaderboard import Leaderboard, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from ttsds.benchmarks.benchmark import BenchmarkCategory
from ttsds import BenchmarkSuite

from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, TAGS
from src.texts import LLM_BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT, CITATION_TEXT
from src.css_html_js import custom_css
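
# Module-level state: f_b_df holds the per-benchmark leaderboard and f_a_df the
# aggregated per-category leaderboard (both assigned further down, after the
# results are loaded). The Gradio callbacks below read these globals and return
# filtered/recomputed copies for display.

# filter_dfs is wired to the tag dropdowns. It detects which leaderboard is shown
# (only the aggregated one has an "Environment" column), restores the unfiltered
# copy, and keeps rows whose Tags contain at least one of the selected tags.
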
def filter_dfs(tags, lb):
    global f_b_df, f_a_df
    is_agg = False
    if "Environment" in lb.columns:
        is_agg = True
    if is_agg:
        lb = f_a_df.copy()
    else:
        lb = f_b_df.copy()
    if tags and len(lb) > 0:
        lb = lb[lb["Tags"].apply(lambda x: any(tag in x for tag in tags))]
    lb = rounded_df(lb)
    return lb
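
# change_mean is wired to the "Exclude environment from mean" checkbox: it
# recomputes the Mean column of the aggregated leaderboard with or without the
# Environment category.
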
def change_mean(env, lb):
    global f_b_df, f_a_df
    lb = f_a_df.copy()
    if env:
        mean_cols = [col for col in lb.columns if str(col) not in ["Mean", "Environment", "Model", "Tags"]]
    else:
        mean_cols = [col for col in lb.columns if str(col) not in ["Mean", "Model", "Tags"]]
    lb["Mean"] = lb[mean_cols].mean(axis=1)
    lb = rounded_df(lb)
    return lb
def restart_space():
    API.restart_space(repo_id=REPO_ID)
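
# submit_eval handles the "Submit here!" form: it validates the URLs and the
# uploaded .tar.gz, builds a markdown display name with links, writes a pending
# request JSON, and uploads both files to the queue dataset repo on the Hub.
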
def submit_eval(model_name, model_tags, web_url, hf_url, code_url, paper_url, inference_details, file_path):
    model_id = model_name.lower().replace(" ", "_")
    # check if model already exists
    if Path(f"{EVAL_REQUESTS_PATH}/{model_id}.json").exists():
        return "Model already exists in the evaluation queue"
    # check which urls are valid
    if web_url and not web_url.startswith("http"):
        return "Please enter a valid URL"
    if hf_url and not hf_url.startswith("http"):
        return "Please enter a valid URL"
    if code_url and not code_url.startswith("http"):
        return "Please enter a valid URL"
    if paper_url and not paper_url.startswith("http"):
        return "Please enter a valid URL"
    # move file to correct location
    if not file_path.endswith(".tar.gz"):
        return "Please upload a .tar.gz file"
    Path(file_path).rename(f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz")
    # build display name - use web_url to link text if available, and emojis for the other urls
    display_name = model_name + " "
    if web_url:
        display_name = f"[{display_name}]({web_url}) "
    if hf_url:
        display_name += f"[🤗]({hf_url})"
    if code_url:
        display_name += f"[💻]({code_url})"
    if paper_url:
        display_name += f"[📄]({paper_url})"
    request_obj = {
        "model_name": model_name,
        "display_name": display_name,
        "model_tags": model_tags,
        "web_url": web_url,
        "hf_url": hf_url,
        "code_url": code_url,
        "paper_url": paper_url,
        "inference_details": inference_details,
        "status": "pending",
    }
    try:
        with open(f"{EVAL_REQUESTS_PATH}/{model_id}.json", "w") as f:
            json.dump(request_obj, f)
        API.upload_file(
            path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.json",
            path_in_repo=f"{model_id}.json",
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to evaluation queue",
        )
        API.upload_file(
            path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz",
            path_in_repo=f"{model_id}.tar.gz",
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to evaluation queue",
        )
    except Exception as e:
        os.remove(f"{EVAL_REQUESTS_PATH}/{model_id}.json")
        return f"Error: {e}"
| return "Model submitted successfully π" | |
### Space initialisation
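# Sync the evaluation queue and results datasets from the Hub into local
# folders; if a download fails (e.g. a transient Hub error), restart the Space
# (presumably to retry with a clean state).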
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()
def rounded_df(df):
    df = df.copy()
    for col in df.columns:
        if isinstance(df[col].values[0], float):
            df[col] = df[col].apply(lambda x: round(x, 2))
    return df
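
# Build the two leaderboard tables from results.csv: agg_df aggregates scores
# per benchmark category (with a Mean column placed first), while benchmark_df
# keeps one column per individual benchmark, ordered by category.
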
results_df = pd.read_csv(EVAL_RESULTS_PATH + "/results.csv")

agg_df = BenchmarkSuite.aggregate_df(results_df)
agg_df = agg_df.pivot(index="dataset", columns="benchmark_category", values="score")
agg_df.rename(columns={"OVERALL": "General"}, inplace=True)
agg_df.columns = [x.capitalize() for x in agg_df.columns]
mean_cols = [col for col in agg_df.columns if str(col) not in ["Mean", "Environment", "Model", "Tags"]]
agg_df["Mean"] = agg_df[mean_cols].mean(axis=1)
# make sure mean is the first column
agg_df = agg_df[["Mean"] + [col for col in agg_df.columns if col != "Mean"]]
agg_df["Tags"] = ""
agg_df.reset_index(inplace=True)
agg_df.rename(columns={"dataset": "Model"}, inplace=True)
agg_df.sort_values("Mean", ascending=False, inplace=True)

benchmark_df = results_df.pivot(index="dataset", columns="benchmark_name", values="score")
# get benchmark name order by category
benchmark_order = list(results_df.sort_values("benchmark_category")["benchmark_name"].unique())
benchmark_df = benchmark_df[benchmark_order]
benchmark_df = benchmark_df.reset_index()
benchmark_df.rename(columns={"dataset": "Model"}, inplace=True)
# set index
benchmark_df.set_index("Model", inplace=True)
benchmark_df["Mean"] = benchmark_df.mean(axis=1)
# make sure mean is the first column
benchmark_df = benchmark_df[["Mean"] + [col for col in benchmark_df.columns if col != "Mean"]]
benchmark_df["Tags"] = ""
benchmark_df.reset_index(inplace=True)
benchmark_df.sort_values("Mean", ascending=False, inplace=True)
# get details for each model
model_detail_files = Path(EVAL_REQUESTS_PATH).glob("*.json")
model_details = {}
for model_detail_file in model_detail_files:
    with open(model_detail_file) as f:
        model_detail = json.load(f)
    model_details[model_detail_file.stem] = model_detail

# replace .tar.gz
benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
agg_df["Model"] = agg_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
benchmark_df["Tags"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
agg_df["Tags"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))
agg_df["Model"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))

f_b_df = benchmark_df.copy()
f_a_df = agg_df.copy()
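
# init_leaderboard wraps a dataframe in a gradio_leaderboard.Leaderboard
# component: Model and Tags render as markdown (they contain links), every other
# column as a number, and Tags is left out of the default column selection but
# remains searchable.
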
def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    df_types = []
    for col in dataframe.columns:
        if col == "Model":
            df_types.append("markdown")
        elif col == "Tags":
            df_types.append("markdown")
        else:
            df_types.append("number")
    cols = list(dataframe.columns)
    cols.remove("Tags")
    return Leaderboard(
        value=rounded_df(dataframe),
        select_columns=SelectColumns(
            default_selection=cols,
            cant_deselect=["Model", "Mean"],
            label="Select Columns to Display:",
        ),
        search_columns=["Model", "Tags"],
        filter_columns=[],
        interactive=False,
        datatype=df_types,
    )
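
# Gradio UI: one tab per view (aggregated scores, individual benchmarks, about
# text, submission form) plus a citation accordion at the bottom.
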
app = gr.Blocks(css=custom_css, title="TTS Benchmark Leaderboard")
with app:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
| with gr.TabItem("π TTSDS Scores", elem_id="llm-benchmark-tab-table", id=0): | |
| with gr.Group(): | |
| env = gr.Checkbox(value=True, label="Exclude environment from mean.") | |
| gr.Markdown("**Environment** measures how well the system can reproduce noise in the training data. This doesn't correlate with human judgements for 'naturalness'") | |
            tags = gr.Dropdown(
                TAGS,
                value=[],
                multiselect=True,
                label="Tags",
                info="Select tags to filter the leaderboard. You can suggest new tags here: https://huggingface.co/spaces/ttsds/benchmark/discussions/1",
            )
            leaderboard = init_leaderboard(f_a_df)
            tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
            env.change(change_mean, [env, leaderboard], [leaderboard])
| with gr.TabItem("π Individual Benchmarks", elem_id="llm-benchmark-tab-table", id=1): | |
            tags = gr.Dropdown(
                TAGS,
                value=[],
                multiselect=True,
                label="Tags",
                info="Select tags to filter the leaderboard",
            )
            leaderboard = init_leaderboard(f_b_df)
            tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
| with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=2): | |
| gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") | |
| with gr.TabItem("π Submit here!", elem_id="llm-benchmark-tab-table", id=3): | |
| with gr.Column(): | |
| with gr.Row(): | |
| gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") | |
| with gr.Row(): | |
| gr.Markdown("# βοΈβ¨ Submit a TTS dataset here!", elem_classes="markdown-text") | |
                with gr.Row():
                    with gr.Column():
                        model_name_textbox = gr.Textbox(label="Model name")
                        model_tags_dropdown = gr.Dropdown(
                            label="Model tags",
                            choices=TAGS,
                            multiselect=True,
                        )
                        website_url_textbox = gr.Textbox(label="Website URL (optional)")
                        hf_url_textbox = gr.Textbox(label="Huggingface URL (optional)")
                        code_url_textbox = gr.Textbox(label="Code URL (optional)")
                        paper_url_textbox = gr.Textbox(label="Paper URL (optional)")
                        inference_details_textbox = gr.TextArea(label="Inference details (optional)")
                        file_input = gr.File(file_types=[".gz"], interactive=True, label=".tar.gz TTS dataset")
                        submit_button = gr.Button("Submit Eval")
                        submission_result = gr.Markdown()
                        submit_button.click(
                            submit_eval,
                            [
                                model_name_textbox,
                                model_tags_dropdown,
                                website_url_textbox,
                                hf_url_textbox,
                                code_url_textbox,
                                paper_url_textbox,
                                inference_details_textbox,
                                file_input,
                            ],
                            submission_result,
                        )
    with gr.Row():
        with gr.Accordion("Citation", open=False):
| gr.Markdown(f"Copy the BibTeX citation to cite this source:\n\n```bibtext\n{CITATION_TEXT}\n```") | |
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=5 * 86400)
scheduler.start()

app.queue(default_concurrency_limit=40).launch()