import argparse
import json
from datetime import datetime

import gradio as gr
import pandas as pd
import pytz

from constants import *
from constants import column_names
from data_utils import post_processing
from utils_display import model_info

LAST_UPDATED = None
INTRO_MD = ""
with open("_about_us.md", "r") as f:
    ABOUT_MD = f.read()

with open("_header.md", "r") as f:
    HEADER_MD = f.read()

# Module-level state populated by data_load() before the UI is built.
raw_data = None
original_df = None
raw_puzzle_data = None
puzzle_df = None
available_models = list(model_info.keys())

def _gstr(text):
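    """Return a hidden gr.Text component wrapping ``text`` (helper, currently unused here)."""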
    return gr.Text(text, visible=False)

def _tab_leaderboard():
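    """Build the main leaderboard table from the global ``original_df``."""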
    global original_df

    df = original_df.copy()

    # Add a 1-based rank column at the front of the table.
    df.insert(0, "#", range(1, 1 + len(df)))

    # Render the boolean "Open Source" flag as a check / cross mark.
    if "Open Source" in df.columns:
        df["Open Source"] = df["Open Source"].apply(lambda x: "✅" if x else "❌")

    leaderboard_table = gr.components.Dataframe(
        value=df,
        datatype=["number", "markdown", "bool", "number", "number", "number", "number"],
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
        column_widths=[50, 200, 100, 120, 120, 120, 130],
        wrap=True,
        height=800
    )

    return leaderboard_table

def _tab_leaderboard_puzzle():
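    """Build the per-puzzle accuracy table from the global ``puzzle_df``."""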
    global puzzle_df

    df = puzzle_df.copy()

    # Add a 1-based rank column at the front of the table.
    df.insert(0, "#", range(1, 1 + len(df)))

    leaderboard_puzzle_table = gr.components.Dataframe(
        value=df,
        datatype=["number", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"],
        elem_id="leaderboard-puzzle-table",
        interactive=False,
        visible=True,
        column_widths=[50, 200, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150],
        wrap=True,
        height=800
    )

    return leaderboard_puzzle_table


def _tab_submit():
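    """Render Markdown instructions for submitting a model for evaluation."""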
    markdown_text = """
    Please open an issue on our [GitHub](https://github.com/ljcleo/hardcore-logic) repository to tell us about your model. We will then evaluate it and report the results here on the leaderboard.
    If you would like to run the evaluation locally, please read our evaluator code [here](https://github.com/ljcleo/hardcore-logic/tree/master/src/evaluator)
    and apply for access to the [HardcoreLogic](https://hf.co/dataset/?/?) dataset, which contains the ground-truth solutions.
    """

    gr.Markdown("## 🚀 Evaluate your models\n\n" + markdown_text, elem_classes="markdown-text")

def build_demo():
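    """Assemble the Gradio Blocks demo: banner, header, leaderboard tabs, and citation box."""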
    global original_df, available_models

    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
        gr.HTML(BANNER, elem_id="banner")

        # Record the current time in US/Pacific as the "last updated" timestamp
        LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
        header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
        gr.Markdown(header_md_text, elem_classes="markdown-text")

        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            # 🏅 Leaderboard
            with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                _tab_leaderboard()

            # 🎯 Accuracy for each puzzle
            with gr.TabItem("🎯 Accuracy for each puzzle", elem_id="od-benchmark-tab-table", id=1):
                _tab_leaderboard_puzzle()

            # # 🚀 Evaluate your models
            # with gr.TabItem("🚀 Evaluate your models", elem_id="od-benchmark-tab-table", id=3):
            #     _tab_submit()

            # # 📮 About Us
            # with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
            #     gr.Markdown(ABOUT_MD, elem_classes="markdown-text")

        # 📚 Citation section
        with gr.Accordion("📚 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                lines=7,
                label="Copy this BibTeX to cite us",
                elem_id="citation-button",
                show_copy_button=True
            )

    return demo


def data_load(result_file, puzzle_file):
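    """Load the summary and per-puzzle result JSON files into the global DataFrames."""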
    global raw_data, original_df, raw_puzzle_data, puzzle_df
    print(f"Loading {result_file}")
    column_names_main = column_names.copy()
    column_puzzle_main = column_names_puzzle.copy()
    main_ordered_columns = ORDERED_COLUMN_NAMES
    puzzle_main_ordered_columns = ORDERED_COLUMN_NAMES_PUZZLE
    click_url = True
    with open(result_file, "r") as f:
        raw_data = json.load(f)
    for d in raw_data:
        for k, v in d.items():
            # Coerce numeric strings to floats; leave non-numeric values untouched.
            try:
                d[k] = float(v)
            except (TypeError, ValueError):
                pass
    with open(puzzle_file, "r") as f:
        raw_puzzle_data = json.load(f)
    for d in raw_puzzle_data:
        for k, v in d.items():
            try:
                d[k] = float(v)
            except (TypeError, ValueError):
                pass
    original_df = pd.DataFrame(raw_data)
    original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
    puzzle_df = pd.DataFrame(raw_puzzle_data)
    puzzle_df = post_processing(puzzle_df, column_puzzle_main, ordered_columns=puzzle_main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
    print(f"original_df.columns: {original_df.columns}")
    print(f"puzzle_df.columns: {puzzle_df.columns}")


if __name__ == "__main__":
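    # Example invocation (script name assumed): python app.py --result_file hardcorelogic.summary.json --puzzle_file hardcorelogic.puzzle.json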
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--result_file", help="Path to results table", default="hardcorelogic.summary.json")
    parser.add_argument("--puzzle_file", help="Path to results(puzzle) table", default="hardcorelogic.puzzle.json")
    args = parser.parse_args()
    data_load(args.result_file, args.puzzle_file)
    print(original_df)
    demo = build_demo()
    demo.launch(share=args.share, height=3000, width="100%")