Commit: update
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +40 -0
- app.py +83 -20
- results-cot/Mixtral-8x7B-Instruct-v0.1.csv +3 -0
- results-cot/{gpt-4v-CoT-Azure.csv → Mixtral-8x7B-Instruct-v0.1.jpg} +2 -2
- results-cot/Mixtral-8x7B-Instruct-v0.1.pkl +3 -0
- results-cot/{gpt-4v-CoT-Azure.jpg → Mixtral-8x7B-Instruct-v0.1.png} +2 -2
- results-cot/Qwen1.5-72B-Chat.csv +3 -0
- results-cot/{gpt-4v-CoT-Azure.pkl → Qwen1.5-72B-Chat.jpg} +2 -2
- results-cot/Qwen1.5-72B-Chat.pkl +3 -0
- results-cot/{gpt-4v-CoT-Azure.png → Qwen1.5-72B-Chat.png} +2 -2
- results-cot/gemma-7b-it.csv +3 -0
- results-cot/gemma-7b-it.jpg +3 -0
- results-cot/gemma-7b-it.pkl +3 -0
- results-cot/gemma-7b-it.png +3 -0
- results-cot/{gpt-3.5-CoT.csv → gpt-3.5-turbo-0125.csv} +0 -0
- results-cot/{gpt-3.5-CoT.jpg → gpt-3.5-turbo-0125.jpg} +0 -0
- results-cot/{gpt-3.5-CoT.pkl → gpt-3.5-turbo-0125.pkl} +0 -0
- results-cot/{gpt-3.5-CoT.png → gpt-3.5-turbo-0125.png} +0 -0
- results-vision-CoT/gemini-pro-vision-CoT.csv +0 -3
- results-vision-CoT/gemini-pro-vision-CoT.jpg +0 -3
- results-vision-CoT/gemini-pro-vision-CoT.pkl +0 -3
- results-vision-CoT/gemini-pro-vision-CoT.png +0 -3
- results-vision/gemini-pro-vision-CoT.csv +0 -3
- results-vision/gemini-pro-vision-CoT.jpg +0 -3
- results-vision/gemini-pro-vision-CoT.pkl +0 -3
- results-vision/gemini-pro-vision-CoT.png +0 -3
- results-vision/gpt-4v-CoT.csv +0 -3
- results-vision/gpt-4v-CoT.jpg +0 -3
- results-vision/gpt-4v-CoT.pkl +0 -3
- results-vision/gpt-4v-CoT.png +0 -3
- results/CodeLlama-70b-Instruct-hf.csv +3 -0
- results/{CodeLlama-70B.jpg → CodeLlama-70b-Instruct-hf.jpg} +0 -0
- results/{CodeLlama-70B.pkl → CodeLlama-70b-Instruct-hf.pkl} +0 -0
- results/{CodeLlama-70B.png → CodeLlama-70b-Instruct-hf.png} +0 -0
- results/Llama-2-70b-chat-hf.csv +3 -0
- results/Mistral-7B-Instruct-v0.2.csv +3 -0
- results/Mixtral-8x7B-Instruct-v0.1.csv +3 -0
- results/{Mixtral-8x7B-Instruct-0.1.jpg → Mixtral-8x7B-Instruct-v0.1.jpg} +0 -0
- results/{Mixtral-8x7B-Instruct-0.1.pkl → Mixtral-8x7B-Instruct-v0.1.pkl} +0 -0
- results/{Mixtral-8x7B-Instruct-0.1.png → Mixtral-8x7B-Instruct-v0.1.png} +0 -0
- results/Qwen1.5-72B-Chat.csv +3 -0
- results/StripedHyena-Nous-7B.csv +3 -0
- results/Yi-34B-Chat.csv +3 -0
- results/claude-3-haiku-20240307.csv +3 -0
- results/{Claude-3-Haiku.jpg → claude-3-haiku-20240307.jpg} +0 -0
- results/{Claude-3-Haiku.pkl → claude-3-haiku-20240307.pkl} +0 -0
- results/{Claude-3-Haiku.png → claude-3-haiku-20240307.png} +0 -0
- results/claude-3-opus-20240229.csv +3 -0
- results/{Claude-3-Opus.jpg → claude-3-opus-20240229.jpg} +0 -0
- results/{Claude-3-Opus.pkl → claude-3-opus-20240229.pkl} +0 -0
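
Most of the renames above standardize result filenames to the exact model identifiers used by the providers and on the Hub (e.g. Claude-3-Opus.pkl becomes claude-3-opus-20240229.pkl, gpt-3.5-CoT.csv becomes gpt-3.5-turbo-0125.csv). A minimal sketch of why that matters, assuming the app derives the leaderboard's "Model Name" from the result file's stem (load_data's body is not shown in this diff, so this is an assumption):

from pathlib import Path

def model_name_from_path(path: str) -> str:
    # e.g. "results/claude-3-opus-20240229.pkl" -> "claude-3-opus-20240229"
    return Path(path).stem

print(model_name_from_path("results/claude-3-opus-20240229.pkl"))

With stems matching full checkpoint IDs, the displayed names line up with the files backing them without a separate mapping table.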
.gitattributes
CHANGED

@@ -115,3 +115,43 @@ results-cot/gpt-4v-CoT-Azure.pkl filter=lfs diff=lfs merge=lfs -text
 results-cot/gpt-4v-CoT-Azure.csv filter=lfs diff=lfs merge=lfs -text
 results-vision-CoT/gemini-pro-vision-CoT.csv filter=lfs diff=lfs merge=lfs -text
 results-cot/gpt-3.5-CoT.csv filter=lfs diff=lfs merge=lfs -text
+results/claude-3-haiku-20240307.csv filter=lfs diff=lfs merge=lfs -text
+results/claude-3-opus-20240229.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/gemma-7b-it.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-turbo-0125.csv filter=lfs diff=lfs merge=lfs -text
+results/gpt-3.5-turbo-0125.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/Mixtral-8x7B-Instruct-v0.1.csv filter=lfs diff=lfs merge=lfs -text
+results/gemma-7b-it.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/Qwen1.5-72B-Chat.csv filter=lfs diff=lfs merge=lfs -text
+results/CodeLlama-70b-Instruct-hf.csv filter=lfs diff=lfs merge=lfs -text
+results/Mixtral-8x7B-Instruct-v0.1.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/gemma-7b-it.pkl filter=lfs diff=lfs merge=lfs -text
+results/claude-3-haiku-20240307.pkl filter=lfs diff=lfs merge=lfs -text
+results/gemma-7b-it.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-turbo-0125.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/Mixtral-8x7B-Instruct-v0.1.pkl filter=lfs diff=lfs merge=lfs -text
+results/Mixtral-8x7B-Instruct-v0.1.pkl filter=lfs diff=lfs merge=lfs -text
+results/claude-3-opus-20240229.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/Qwen1.5-72B-Chat.pkl filter=lfs diff=lfs merge=lfs -text
+results/CodeLlama-70b-Instruct-hf.pkl filter=lfs diff=lfs merge=lfs -text
+results/gpt-3.5-turbo-0125.pkl filter=lfs diff=lfs merge=lfs -text
+results/claude-3-haiku-20240307.jpg filter=lfs diff=lfs merge=lfs -text
+results/claude-3-opus-20240229.jpg filter=lfs diff=lfs merge=lfs -text
+results/gpt-3.5-turbo-0125.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-turbo-0125.jpg filter=lfs diff=lfs merge=lfs -text
+results/Mixtral-8x7B-Instruct-v0.1.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/Qwen1.5-72B-Chat.jpg filter=lfs diff=lfs merge=lfs -text
+results/gemma-7b-it.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/Mixtral-8x7B-Instruct-v0.1.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gemma-7b-it.jpg filter=lfs diff=lfs merge=lfs -text
+results/CodeLlama-70b-Instruct-hf.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gemma-7b-it.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-turbo-0125.png filter=lfs diff=lfs merge=lfs -text
+results/gpt-3.5-turbo-0125.png filter=lfs diff=lfs merge=lfs -text
+results/CodeLlama-70b-Instruct-hf.png filter=lfs diff=lfs merge=lfs -text
+results/Mixtral-8x7B-Instruct-v0.1.png filter=lfs diff=lfs merge=lfs -text
+results/claude-3-opus-20240229.png filter=lfs diff=lfs merge=lfs -text
+results-cot/Mixtral-8x7B-Instruct-v0.1.png filter=lfs diff=lfs merge=lfs -text
+results-cot/Qwen1.5-72B-Chat.png filter=lfs diff=lfs merge=lfs -text
+results/claude-3-haiku-20240307.png filter=lfs diff=lfs merge=lfs -text
+results/gemma-7b-it.png filter=lfs diff=lfs merge=lfs -text
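
Each of the 40 added lines is a per-file Large File Storage rule of the kind that "git lfs track <path>" appends to .gitattributes; a new result file needs a matching rule (or a covering wildcard) to be stored via LFS. A small sketch, not part of the Space, for checking that before committing, assuming it is run from the repository root:

from pathlib import Path

tracked = {
    line.split()[0]
    for line in Path(".gitattributes").read_text().splitlines()
    if "filter=lfs" in line
}
for csv in sorted(Path("results").glob("*.csv")) + sorted(Path("results-cot").glob("*.csv")):
    if str(csv) not in tracked:
        print(f"not LFS-tracked yet: {csv}")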
app.py
CHANGED

@@ -10,7 +10,7 @@ vision_results = glob("results-vision/*.pkl")
 # Load CoT text benchmark results
 cot_text_results = glob("results-cot/*.pkl")
 # Load CoT vision benchmark results
-cot_vision_results = glob("results-vision-CoT/*.pkl")
+# cot_vision_results = glob("results-vision-CoT/*.pkl")
 
 # Function to load data, add model type and name
 def load_data(files, model_type):
@@ -27,18 +27,22 @@ def load_data(files, model_type):
 data = load_data(csv_results, "Text Only")
 vision_data = load_data(vision_results, "Vision")
 cot_text_data = load_data(cot_text_results, "CoT Text Only")
-cot_vision_data = load_data(cot_vision_results, "CoT Vision")
+# cot_vision_data = load_data(cot_vision_results, "CoT Vision")
 
 # Combine all data into a single DataFrame
 all_data = pd.concat(
-    [data, vision_data, cot_text_data
+    [data, vision_data, cot_text_data], ignore_index=True
 )
 
 all_model_names = all_data["Model Name"].unique()
 all_text_only_model_names = list(
     all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
 )
-
+all_cot_text_only_models = list(
+    all_data[all_data["Model Type"] == "CoT Text Only"]["Model Name"].unique()
+)
+
+
 
 ## Continue with the cold code --
 # TODO: Update me to read from all_data for later
@@ -50,7 +54,7 @@ vision_data = {file: pd.read_pickle(file) for file in vision_results}
 # Load the CoT text files into a dict
 cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
 # Load the CoT vision files into a dict
-cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
+# cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
 
 
 def calculate_accuracy(df):
@@ -96,13 +100,13 @@ def process_data(data):
 text_data_for_df = process_data(data)
 vision_data_for_df = process_data(vision_data)
 cot_text_data_for_df = process_data(cot_text_data)
-cot_vision_data_for_df = process_data(cot_vision_data)
+# cot_vision_data_for_df = process_data(cot_vision_data)
 
 # Create DataFrames
 accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
 vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
 cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
-cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
+# cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
 
 # Function to finalize DataFrame
 def finalize_df(df):
@@ -117,7 +121,7 @@ def finalize_df(df):
 accuracy_df = finalize_df(accuracy_df)
 vision_accuracy_df = finalize_df(vision_accuracy_df)
 cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
-cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
+# cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
 
 
 def load_heatmap(evt: gr.SelectData):
@@ -176,6 +180,43 @@ def calculate_order_by_first_substring(selected_models):
     return text_only_filtered, number_of_queries, number_of_fsms
 
 
+
+def calculate_order_by_first_substring_cot(selected_models):
+
+    first_columns = all_data[all_data["substring_index"] == 1]
+    query_ids_df = first_columns[first_columns["Model Type"] == "CoT Text Only"]
+    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
+
+    query_ids_df = query_ids_df.groupby("query_id").filter(
+        lambda x: x["parsed_judge_response"].eq(1).all()
+    )
+
+    fsm_ids = query_ids_df.fsm_id.unique()
+
+    text_only = all_data[all_data["Model Type"] == "CoT Text Only"]
+    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
+
+    query_ids = text_only_filtered.query_id.unique()
+    text_only_filtered = (
+        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
+        .mean()
+        .reset_index()
+    )
+
+    text_only_filtered["Accuracy"] = text_only_filtered["parsed_judge_response"] * 100
+    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)
+
+    text_only_filtered["Accuracy"] = text_only_filtered["Accuracy"].apply(
+        lambda x: round(x, 2)
+    )
+    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)
+
+    number_of_queries = len(query_ids)
+    number_of_fsms = len(fsm_ids)
+
+    return text_only_filtered, number_of_queries, number_of_fsms
+
+
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
@@ -196,8 +237,8 @@ with gr.Blocks() as demo:
             fn=load_vision_heatmap, outputs=[heatmap_image_vision]
         )
 
-    with gr.Tab("
-        gr.Markdown("#
+    with gr.Tab("Text-only Benchmark (CoT)"):
+        gr.Markdown("# Text-only Leaderboard (CoT)")
         cot_leader_board_text = gr.Dataframe(
             cot_text_accuracy_df, headers=headers_with_icons
         )
@@ -207,16 +248,16 @@ with gr.Blocks() as demo:
            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
        )
 
-    with gr.Tab("
-
-
-
-
-
-
-
-
-
+    # with gr.Tab("Vision Benchmark (CoT)"):
+    #     gr.Markdown("# Vision Benchmark Leaderboard (CoT)")
+    #     cot_leader_board_vision = gr.Dataframe(
+    #         cot_vision_accuracy_df, headers=headers_with_icons
+    #     )
+    #     gr.Markdown("## Heatmap")
+    #     cot_heatmap_image_vision = gr.Image(label="", show_label=False)
+    #     cot_leader_board_vision.select(
+    #         fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
+    #     )
 
     with gr.Tab("Constraint Text-only Results"):
         gr.Markdown("## Constraint Text-only Leaderboard by first substring")
@@ -240,4 +281,26 @@ with gr.Blocks() as demo:
             queue=True,
        )
 
+
+    with gr.Tab("Constraint Text-only Results (CoT)"):
+        gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
+        included_models_cot = gr.CheckboxGroup(
+            label="Models to include",
+            choices=all_cot_text_only_models,
+            value=all_cot_text_only_models,
+            interactive=True,
+        )
+        with gr.Row():
+            number_of_queries_cot = gr.Textbox(label="Number of included queries")
+            number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")
+
+        constrained_leader_board_text_cot = gr.Dataframe()
+
+        included_models_cot.select(
+            fn=calculate_order_by_first_substring_cot,
+            inputs=[included_models_cot],
+            outputs=[constrained_leader_board_text_cot, number_of_queries_cot, number_of_fsms_cot],
+            queue=True,
+        )
+
 demo.launch()
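
The new calculate_order_by_first_substring_cot mirrors the existing constraint logic for the CoT runs: it keeps only query_ids whose first-substring rows (substring_index == 1) were judged correct for every selected model, then averages parsed_judge_response per model over the surviving FSMs. A toy walk-through with synthetic data; the column names are taken from the diff above, the values are invented:

import pandas as pd

toy = pd.DataFrame(
    {
        "Model Type": ["CoT Text Only"] * 4,
        "Model Name": ["gemma-7b-it", "gemma-7b-it", "Qwen1.5-72B-Chat", "Qwen1.5-72B-Chat"],
        "substring_index": [1, 1, 1, 1],
        "query_id": ["q1", "q2", "q1", "q2"],
        "fsm_id": [0, 1, 0, 1],
        "parsed_judge_response": [1, 0, 1, 1],
    }
)

# Keep query_ids answered correctly by every model on the first substring.
first = toy[toy["substring_index"] == 1]
kept = first.groupby("query_id").filter(lambda x: x["parsed_judge_response"].eq(1).all())
print(sorted(kept["fsm_id"].unique()))  # [0] -- q2 is dropped because gemma-7b-it missed it

# Score each model on the surviving FSMs, as the new function does before rounding and sorting.
scores = (
    toy[toy["fsm_id"].isin(kept["fsm_id"].unique())]
    .groupby("Model Name")["parsed_judge_response"]
    .mean()
    .mul(100)
    .round(2)
)
print(scores)  # both models reach 100.0 on the single surviving FSM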
results-cot/Mixtral-8x7B-Instruct-v0.1.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:093e919d90609c3be8d6818cf56ca018214da3a42b78aeaf85f92581b72c5ad4
+size 19494123
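
The three lines added for each result file are a Git LFS pointer (version, oid, size), not the CSV contents themselves; the actual data lives in LFS storage. A small illustrative reader for such a pointer file, using one of the paths added in this commit:

def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer("results-cot/Mixtral-8x7B-Instruct-v0.1.csv")
print(pointer["oid"], pointer["size"])  # sha256:093e... 19494123

Note this only reads the pointer on a checkout where LFS smudging was skipped (e.g. GIT_LFS_SKIP_SMUDGE=1); after a normal LFS checkout the file on disk is the full CSV.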
results-cot/{gpt-4v-CoT-Azure.csv → Mixtral-8x7B-Instruct-v0.1.jpg}
RENAMED
File without changes

results-cot/Mixtral-8x7B-Instruct-v0.1.pkl
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:686692584c6ba027c454d699bbf585b95e5c99bfc426810ea74b327a975b9cf3
+size 19489822

results-cot/{gpt-4v-CoT-Azure.jpg → Mixtral-8x7B-Instruct-v0.1.png}
RENAMED
File without changes

results-cot/Qwen1.5-72B-Chat.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32681449776facf1084405001e69ed7926b79c69f9717fb159e3eb064b333636
+size 15795431

results-cot/{gpt-4v-CoT-Azure.pkl → Qwen1.5-72B-Chat.jpg}
RENAMED
File without changes

results-cot/Qwen1.5-72B-Chat.pkl
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c20383298d4b6482ca7c30bf91822e24099dc67b71a3be10271005e25208c40
+size 15778970

results-cot/{gpt-4v-CoT-Azure.png → Qwen1.5-72B-Chat.png}
RENAMED
File without changes

results-cot/gemma-7b-it.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8535fa3f2ef5a94b1b552859930e0476ca0f3c77ec4c277893a9ab9ef45d6c3
+size 16793758

results-cot/gemma-7b-it.jpg
ADDED
Git LFS Details

results-cot/gemma-7b-it.pkl
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c581027f8b78df5934117276cec3e53613f5ac953d045f71af4121b3ec2e1a4
+size 16822239

results-cot/gemma-7b-it.png
ADDED
Git LFS Details

results-cot/{gpt-3.5-CoT.csv → gpt-3.5-turbo-0125.csv}
RENAMED
File without changes

results-cot/{gpt-3.5-CoT.jpg → gpt-3.5-turbo-0125.jpg}
RENAMED
File without changes

results-cot/{gpt-3.5-CoT.pkl → gpt-3.5-turbo-0125.pkl}
RENAMED
File without changes

results-cot/{gpt-3.5-CoT.png → gpt-3.5-turbo-0125.png}
RENAMED
File without changes

results-vision-CoT/gemini-pro-vision-CoT.csv
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312
-size 6184119

results-vision-CoT/gemini-pro-vision-CoT.jpg
DELETED
Git LFS Details

results-vision-CoT/gemini-pro-vision-CoT.pkl
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff
-size 6144275

results-vision-CoT/gemini-pro-vision-CoT.png
DELETED
Git LFS Details

results-vision/gemini-pro-vision-CoT.csv
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312
-size 6184119

results-vision/gemini-pro-vision-CoT.jpg
DELETED
Git LFS Details

results-vision/gemini-pro-vision-CoT.pkl
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff
-size 6144275

results-vision/gemini-pro-vision-CoT.png
DELETED
Git LFS Details

results-vision/gpt-4v-CoT.csv
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:04b4de1a7a4280354c89609d15282109ee60f8f58129960dc0edbb046b12a5c6
-size 6374181

results-vision/gpt-4v-CoT.jpg
DELETED
Git LFS Details

results-vision/gpt-4v-CoT.pkl
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:52ae5e417e011db84976acd51a024eae7ccea1e686b7f3f0e8158cd77be4f847
-size 6320889

results-vision/gpt-4v-CoT.png
DELETED
Git LFS Details

results/CodeLlama-70b-Instruct-hf.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3726905a1656174f3c29edfced6f2eec63222f6be8965c0d970264901d8cfc75
+size 16476347

results/{CodeLlama-70B.jpg → CodeLlama-70b-Instruct-hf.jpg}
RENAMED
File without changes

results/{CodeLlama-70B.pkl → CodeLlama-70b-Instruct-hf.pkl}
RENAMED
File without changes

results/{CodeLlama-70B.png → CodeLlama-70b-Instruct-hf.png}
RENAMED
File without changes

results/Llama-2-70b-chat-hf.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42a31de917b05ed5405474a348d072426474a8fb2ce7ff462dbb121e25f4b6ad
+size 20760268

results/Mistral-7B-Instruct-v0.2.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29ad4985661fc41e659a631fc74ba433cd08a571048f11436ccf87ff74f0db09
+size 27242025

results/Mixtral-8x7B-Instruct-v0.1.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a93e2b963a5ac8129b5284f3fd7987964ef96fa0e64194de704a3549c611de1f
+size 17978176

results/{Mixtral-8x7B-Instruct-0.1.jpg → Mixtral-8x7B-Instruct-v0.1.jpg}
RENAMED
File without changes

results/{Mixtral-8x7B-Instruct-0.1.pkl → Mixtral-8x7B-Instruct-v0.1.pkl}
RENAMED
File without changes

results/{Mixtral-8x7B-Instruct-0.1.png → Mixtral-8x7B-Instruct-v0.1.png}
RENAMED
File without changes

results/Qwen1.5-72B-Chat.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ba395c0b55330f689827527831e57e50ae9d824b6635b2bb569713afcf26d4b
+size 14219193

results/StripedHyena-Nous-7B.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f662367ea0d33a368aaa7a72cfeed41d2f3dc05be6289a6fe485a028c7cb98d5
+size 29219512

results/Yi-34B-Chat.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f09fb5f46ca144490bcb42ec89dd27f169680493501c211bf2bcfcd908da1c
+size 20485423

results/claude-3-haiku-20240307.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45623535997485afdee5b0312f2b5fdcc26cf531fbb56b6c3af6e126dfbe7b0f
+size 19570166

results/{Claude-3-Haiku.jpg → claude-3-haiku-20240307.jpg}
RENAMED
File without changes

results/{Claude-3-Haiku.pkl → claude-3-haiku-20240307.pkl}
RENAMED
File without changes

results/{Claude-3-Haiku.png → claude-3-haiku-20240307.png}
RENAMED
File without changes

results/claude-3-opus-20240229.csv
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d902999bcee4798b81644b2ff0ea78280dd46bc310909154c1ef089adf82789
+size 20131397

results/{Claude-3-Opus.jpg → claude-3-opus-20240229.jpg}
RENAMED
File without changes

results/{Claude-3-Opus.pkl → claude-3-opus-20240229.pkl}
RENAMED
File without changes