JunsWan committed (verified)
Commit 6e2102e · Parent(s): 6205075

Update app.py

Files changed (1)
  1. app.py +157 -193
app.py CHANGED
@@ -1,204 +1,168 @@
 
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-except Exception:
-    restart_space()
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
         interactive=False,
     )


-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )

-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
                elem_id="citation-button",
-                show_copy_button=True,
            )

-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+import argparse
 import gradio as gr
 import pandas as pd
+import json
+from constants import *
+from datetime import datetime
+from utils_display import model_info
+from constants import column_names
+import pytz
+from data_utils import post_processing
+
+LAST_UPDATED = None
+INTRO_MD = ""
+with open("_about_us.md", "r") as f:
+    ABOUT_MD = f.read()
+
+with open("_header.md", "r") as f:
+    HEADER_MD = f.read()
+
+raw_data = None
+original_df = None
+raw_puzzle_data = None
+puzzle_df = None
+available_models = list(model_info.keys())
+
+def _gstr(text):
+    return gr.Text(text, visible=False)
+
+def _tab_leaderboard():
+    global original_df
+
+    df = original_df.copy()
+
+    # Prepend a 1-based rank column.
+    df.insert(0, "#", range(1, 1 + len(df)))
+
+    if "Open Source" in df.columns:
+        df["Open Source"] = df["Open Source"].apply(lambda x: "✅" if x else "❌")
+
+    leaderboard_table = gr.components.Dataframe(
+        value=df,
+        datatype=["number", "markdown", "bool", "number", "number", "number", "number"],
+        elem_id="leaderboard-table",
+        interactive=False,
+        visible=True,
+        column_widths=[50, 200, 100, 120, 120, 120, 130],
+        wrap=True,
+        height=800
     )
+
+    return leaderboard_table
+
+def _tab_leaderboard_puzzle():
+    global puzzle_df
+
+    df = puzzle_df.copy()
+
+    df.insert(0, "#", range(1, 1 + len(df)))
+
+    leaderboard_puzzle_table = gr.components.Dataframe(
+        value=df,
+        datatype=["number", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"],
+        elem_id="leaderboard-puzzle-table",
         interactive=False,
+        visible=True,
+        column_widths=[50, 200, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150],
+        wrap=True,
+        height=800
     )

+    return leaderboard_puzzle_table


+def _tab_submit():
+    markdown_text = """
+Please open an issue on our [GitHub](https://github.com/ljcleo/hardcore-logic) repository to tell us about your model. We can then evaluate it for you and report the results here on the leaderboard.
+If you would like to run the evaluation locally, please read our code [here](https://github.com/ljcleo/hardcore-logic/tree/master/src/evaluator)
+and apply for access to the [HardcoreLogic](https://hf.co/dataset/?/?) dataset, which contains the ground-truth solutions.
+"""
+
+    gr.Markdown("## 🚀 Evaluate your models\n\n" + markdown_text, elem_classes="markdown-text")
+
+def build_demo():
+    global original_df, available_models, gpt4t_dfs, haiku_dfs, llama_dfs
+
+    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
+        gr.HTML(BANNER, elem_id="banner")
+
+        # Render the header with the last-updated timestamp in US/Pacific time.
+        LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
+        header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
+        gr.Markdown(header_md_text, elem_classes="markdown-text")
+
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            # 🏅 Leaderboard
+            with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
+                _tab_leaderboard()
+
+            # 🎯 Accuracy for each puzzle
+            with gr.TabItem("🎯 Accuracy for each puzzle", elem_id="od-benchmark-tab-table", id=1):
+                _tab_leaderboard_puzzle()
+
+            '''
+            # 🚀 Evaluate your models
+            with gr.TabItem("🚀 Evaluate your models", elem_id="od-benchmark-tab-table", id=3):
+                _tab_submit()
+            '''
+
+            '''
+            # 📮 About Us
+            with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
+                gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
+            '''
+
+        # 📚 Citation section
+        with gr.Accordion("📚 Citation", open=False):
+            gr.Textbox(
+                value=CITATION_TEXT,
+                lines=7,
+                label="Copy this BibTeX to cite us",
                elem_id="citation-button",
+                show_copy_button=True
            )

+    return demo
+
+
+# Load the summary and per-puzzle JSON results and build the display dataframes.
+def data_load(result_file, puzzle_file):
+    global raw_data, original_df, raw_puzzle_data, puzzle_df
+    print(f"Loading {result_file}")
+    column_names_main = column_names.copy()
+    column_puzzle_main = column_names_puzzle.copy()
+    main_ordered_columns = ORDERED_COLUMN_NAMES
+    puzzle_main_ordered_columns = ORDERED_COLUMN_NAMES_PUZZLE
+    click_url = True
+    with open(result_file, "r") as f:
+        raw_data = json.load(f)
+        # Coerce numeric fields; non-numeric values are kept as-is.
+        for d in raw_data:
+            for k, v in d.items():
+                try:
+                    d[k] = float(v)
+                except (TypeError, ValueError):
+                    pass
+    with open(puzzle_file, "r") as f:
+        raw_puzzle_data = json.load(f)
+        for d in raw_puzzle_data:
+            for k, v in d.items():
+                try:
+                    d[k] = float(v)
+                except (TypeError, ValueError):
+                    pass
+    original_df = pd.DataFrame(raw_data)
+    original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
+    puzzle_df = pd.DataFrame(raw_puzzle_data)
+    puzzle_df = post_processing(puzzle_df, column_puzzle_main, ordered_columns=puzzle_main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
+    print(f"original_df.columns: {original_df.columns}")
+    print(f"puzzle_df.columns: {puzzle_df.columns}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", action="store_true")
+    parser.add_argument("--result_file", help="Path to the summary results table", default="HardcoreLogic-Eval/results_dirs/hardcorelogic.summary.json")
+    parser.add_argument("--puzzle_file", help="Path to the per-puzzle results table", default="HardcoreLogic-Eval/results_dirs/hardcorelogic.puzzle.json")
+    args = parser.parse_args()
+    data_load(args.result_file, args.puzzle_file)
+    print(original_df)
+    demo = build_demo()
+    demo.launch(share=args.share, height=3000, width="100%")
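
For local testing, the new entry point can also be driven programmatically instead of through the argparse CLI; a minimal sketch, assuming this module is saved as app.py and the two default result files referenced above exist:

# Minimal local-run sketch (assumption: the module is saved as app.py and the
# default JSON result files from the argparse defaults are present).
from app import build_demo, data_load

data_load(
    "HardcoreLogic-Eval/results_dirs/hardcorelogic.summary.json",
    "HardcoreLogic-Eval/results_dirs/hardcorelogic.puzzle.json",
)
build_demo().launch(share=False)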