add llmonitor & start scoring
Files changed:
- pages/index.js +22 -16
- run/database.db +0 -0
- run/queriers.py +4 -0
- run/requirements.txt +2 -1
- run/run.py +86 -9
pages/index.js
CHANGED
@@ -47,7 +47,7 @@ export default function Home({ prompts, models }) {
         <meta name="viewport" content="width=device-width, initial-scale=1" />
       </Head>
       <main>
-        <h1>
+        <h1>Crowdsourced LLM Benchmark</h1>
         <br />
         <p>
           Benchmarks like HellaSwag are a bit too abstract for me to get a sense
@@ -69,13 +69,13 @@ export default function Home({ prompts, models }) {
         <br />
         <p>
           {`view: `}
+          <a href="#" onClick={() => changeView("model")}>
+            models
+          </a>{" "}
+          /
           <a href="#" onClick={() => changeView("prompt")}>
-            all prompts
+            prompts
           </a>{" "}
-          /{" "}
-          <a href="#" onClick={() => changeView("model")}>
-            all models
-          </a>
         </p>
         <br />
         {viewBy === "prompt" ? (
@@ -103,16 +103,22 @@ export default function Home({ prompts, models }) {
           </>
         ) : (
           <ul>
-            {models
+            {models
+              .score((s) => s.score)
+              .map((model, i) => (
+                <li key={i}>
+                  {model.name} -{" "}
+                  <Link
+                    href={`/model/${model.api_id
+                      .split("/")
+                      .pop()
+                      .toLowerCase()}`}
+                  >
+                    results
+                  </Link>{" "}
+                  - score: {model.score}
+                </li>
+              ))}
           </ul>
         )}
         <br />
run/database.db
CHANGED
Binary files a/run/database.db and b/run/database.db differ
run/queriers.py
CHANGED
@@ -4,6 +4,8 @@ import json
 import requests
 from dotenv import load_dotenv
 
+from llmonitor import monitor
+
 load_dotenv()
 
 TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
@@ -15,6 +17,8 @@ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 
 MAX_TOKENS = 300
 
+monitor(openai)
+
 def together(model, params):
     def format_prompt(prompt, prompt_type):
         if prompt_type == "language":
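The two added lines above are the whole integration: import monitor and call it on the openai module. As a reference, here is a minimal, self-contained sketch of that pattern (not code from this repo). It assumes OPENAI_API_KEY is set in .env, that LLMonitor picks up its app ID from the environment (for example an LLMONITOR_APP_ID variable), and it uses the pre-1.0 openai SDK interface this project appears to rely on.

import os

import openai
from dotenv import load_dotenv
from llmonitor import monitor

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Patch the openai module so subsequent completion calls are traced by LLMonitor.
monitor(openai)

# Any call made after monitor() is logged automatically.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello in one word."}],
    max_tokens=10,
)
print(response["choices"][0]["message"]["content"])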
run/requirements.txt
CHANGED
@@ -2,4 +2,5 @@ openai
 pandas
 requests
 python-dotenv
-gradio
+gradio
+llmonitor
run/run.py
CHANGED
@@ -1,6 +1,7 @@
 import sqlite3
 import time
-
+from termcolor import colored
+from llmonitor import agent
 from queriers import together, cohere, openai_func, openrouter, ai21, alephalpha
 
 db = sqlite3.connect("./database.db")
@@ -22,6 +23,12 @@ models = [dict(model) for model in models]
 prompts = cursor.execute("SELECT * FROM prompts").fetchall()
 prompts = [dict(prompt) for prompt in prompts]
 
+
+def get_results():
+    results = cursor.execute("SELECT * FROM results").fetchall()
+    print(results[0].keys())
+    return [dict(result) for result in results]
+
 def insert_result(modelId, promptId, result, duration, rate):
     cursor.execute(
         "INSERT INTO results (model, prompt, result, duration, rate) VALUES (?, ?, ?, ?, ?)",
@@ -89,15 +96,85 @@ def ask_prompt(prompt, model):
 total_benchmarks = len(models) * len(prompts)
 print(f"Running {total_benchmarks} benchmarks")
 
-for model in models:
-    if model["type"] == "language":
-        continue
-    for prompt in prompts:
-        if prompt["type"] != "code" and model["type"] == "code":
-            print("Skipping non-code benchmark for code model")
-            continue
+# # Run prompts
+# for model in models:
+#     if model["type"] == "language":
+#         continue
+#     for prompt in prompts:
+#         if prompt["type"] != "code" and model["type"] == "code":
+#             print("Skipping non-code benchmark for code model")
+#             continue
 
-        ask_prompt(prompt, model)
+#         ask_prompt(prompt, model)
+
+# Calculate scores
+results = get_results()
+
+@agent(name="RateResult")
+def rate_result(result):
+    rubrics = cursor.execute(
+        "SELECT * FROM rubrics WHERE prompt = ?",
+        (result["prompt"],)
+    ).fetchall()
+
+    has_rubrics = len(rubrics) > 0
+
+    if not has_rubrics:
+        return
+
+    print(colored('---------------------------', 'white'))
+    print(colored('----------RATING-----------', 'white'))
+    print(colored('---------------------------', 'white'))
+    print(colored(result["result"], 'cyan'))
+    print(colored('---------------------------', 'white'))
+
+    score = None
+
+    for rubric in rubrics:
+
+        print('Rubric: ' + colored(rubric["grading"], 'magenta'))
+
+        if result["result"].strip() == "":
+            score = 0
+        else:
+            grading_text = (
+                f'You help verify that the following answer matches this condition: the answer {rubric["grading"]}. Note: the answer might be incomplete, in which case do your best to assess based on what the full result would be.\n\n'
+                f'\n\n--START OF THE ANSWER--\n{result["result"]}\n--END OF THE ANSWER--\n\n'
+                f'Take a deep breath and explain step by step how you come to the conclusion.'
+                f'Finally, reply on the last line with YES if the answer matches this condition (otherwise reply NO).'
+            )
+
+            # get the gpt-4 entry to use as the grader
+            gpt4 = next((item for item in models if item['api_id'] == 'gpt-4'), None)
+
+            prompt = { }
+
+            response_text = openai_func(gpt4, {"text": grading_text})
+
+            print(colored(f"-> {response_text}", 'yellow'))
+
+            last_line = response_text.splitlines()[-1]
+
+            # If the last line includes a YES, the rubric passes
+            if "YES" in last_line:
+                print(colored(f'Valid! + {rubric["points"]} points', 'green'))
+                score = rubric["points"] if score is None else score + rubric["points"]
+
+    print('Final score: ' + colored(score, 'cyan'))
+
+    return score
+
+
+for result in results:
+    if not result["score"]:
+        score = rate_result(result)
+
+        if score is not None:
+            cursor.execute(
+                "UPDATE results SET score = ? WHERE id == ?",
+                (score, result["id"])
+            )
+            db.commit()
 
 db.close()
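The scoring pass reads from a rubrics table and writes back into results.score. As a sketch only, the statements below reconstruct the columns those queries appear to rely on; the real schema ships inside run/database.db and may differ (column types, extra fields, and the id primary key on rubrics are assumptions).

import sqlite3

# Illustrative schema sketch inferred from the queries in run.py;
# not the authoritative schema, which lives in run/database.db.
db = sqlite3.connect("./database.db")
db.executescript("""
CREATE TABLE IF NOT EXISTS results (
    id INTEGER PRIMARY KEY,
    model INTEGER,      -- model that produced the answer
    prompt INTEGER,     -- prompt that was asked
    result TEXT,        -- raw model output
    duration REAL,
    rate REAL,
    score INTEGER       -- filled in by rate_result()
);

CREATE TABLE IF NOT EXISTS rubrics (
    id INTEGER PRIMARY KEY,
    prompt INTEGER,     -- prompt this rubric grades
    grading TEXT,       -- condition checked by the GPT-4 grader
    points INTEGER      -- points awarded when the grader replies YES
);
""")
db.close()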