import getDB from "@/utils/getDB"
import Head from "next/head"
import Link from "next/link"
import { useRouter } from "next/router"
import { useEffect, useMemo, useState } from "react"

// import styles from '@/styles/Home.module.css'
export const getStaticProps = async () => {
  const db = await getDB()

  const prompts = await db.all(`SELECT * FROM prompts ORDER BY text ASC`)

  // get all models that have at least 1 result
  const models = await db.all(
    `SELECT * FROM models WHERE id IN (SELECT DISTINCT model FROM results) ORDER BY name ASC`
  )

  return { props: { prompts, models } }
}
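
// For context: a minimal sketch of what "@/utils/getDB" could look like,
// assuming the `sqlite` + `sqlite3` npm packages (whose promise-based
// db.all() API matches the calls above). The filename is a hypothetical
// placeholder; the queries above only guarantee that `prompts` has at least
// (text, type, slug), `models` has (id, name, api_id), and `results` has a
// `model` column referencing models.id.
//
//   import sqlite3 from "sqlite3"
//   import { open } from "sqlite"
//
//   export default async function getDB() {
//     // open() resolves to a Database wrapper exposing promise-based .all()
//     return open({ filename: "./benchmarks.db", driver: sqlite3.Database })
//   }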

export default function Home({ prompts, models }) {
  const router = useRouter()

  const [viewBy, setViewBy] = useState(router.query.viewBy || "prompt")
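
  // The selected view lives in the URL query string so links are shareable;
  // changeView pushes the new value and the effect below syncs local state
  // whenever the query changes (e.g. on back/forward navigation).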
  const changeView = (viewBy) => {
    router.push({ query: { viewBy } })
  }

  useEffect(() => {
    if (router.query.viewBy) setViewBy(router.query.viewBy)
  }, [router.query.viewBy])
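
  // Distinct prompt categories, memoized; used to group the prompt list below.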
  const types = useMemo(() => {
    return Array.from(new Set(prompts.map((p) => p.type)))
  }, [prompts])

  return (
    <>
      <Head>
        <title>LLM Benchmarks</title>
        <meta
          name="description"
          content="Human-readable benchmarks of 60+ open-source and proprietary LLMs."
        />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
      </Head>
      <main>
        <h1>Asking 60+ LLMs a set of 20 questions</h1>
        <br />
        <p>
          Benchmarks like HellaSwag are too abstract for me to get a sense of
          how well models actually perform in real-world workflows.
        </p>
        <br />
        <p>
          So I wrote a script that runs prompts testing basic reasoning,
          instruction following, and creativity against around 60 models that I
          could get my hands on through inference APIs.
        </p>
        <br />
        <p>
          The script stored all the answers in a SQLite database; the results
          below are that raw output.
        </p>
        <br />
        <table style={{ maxWidth: 600 }}>
          <tbody>
            <tr>
              <td>
                <p>
                  Edit: as this got popular, I added an email form to receive
                  notifications for future benchmark results:
                </p>
                <iframe
                  src="https://embeds.beehiiv.com/65bd6af1-2dea-417a-baf2-b65bc27e1610?slim=true"
                  height="52"
                  frameBorder="0"
                  scrolling="no"
                  style={{
                    width: 400,
                    border: "none",
                    transform: "scale(0.8)",
                    transformOrigin: "left",
                  }}
                ></iframe>
                <br />
                <small>(no spam, max 1 email per month)</small>
              </td>
            </tr>
          </tbody>
        </table>
        <br />
        <br />
        <p>
          {`view: `}
          <a
            href="#"
            onClick={(e) => {
              e.preventDefault()
              changeView("prompt")
            }}
          >
            all prompts
          </a>{" "}
          /{" "}
          <a
            href="#"
            onClick={(e) => {
              e.preventDefault()
              changeView("model")
            }}
          >
            all models
          </a>
        </p>
        <br />
        {viewBy === "prompt" ? (
          <>
            {types.map((type, k) => (
              <div key={k}>
                <p>{type}:</p>
                <br />
                <ul>
                  {prompts
                    .filter((p) => p.type === type)
                    .map((prompt, i) => (
                      <li key={i}>
                        <pre style={{ maxWidth: 800 }}>
                          {prompt.text}
                          <br />
                          <br />
                          <Link href={`/${prompt.slug}`}>results</Link>
                        </pre>
                      </li>
                    ))}
                </ul>
              </div>
            ))}
          </>
        ) : (
          <ul>
            {models.map((model, i) => (
              <li key={i}>
                {model.name} -{" "}
                <Link
                  href={`/model/${model.api_id.split("/").pop().toLowerCase()}`}
                >
                  results
                </Link>
              </li>
            ))}
          </ul>
        )}
        <br />
        <br />
        <h3>Notes</h3>
        <br />
        <ul>
          <li>
            I used a temperature of 0 and a max-token limit of 240 for each
            test (that's why a lot of answers are cropped). The rest of the
            settings were left at their defaults.
          </li>
          <li>
            I made this with a mix of APIs from OpenRouter, TogetherAI, OpenAI,
            Cohere, Aleph Alpha & AI21.
          </li>
          <li>
            <b>This is imperfect.</b> I want to improve this by using better
            stop sequences and prompt formatting tailored to each model. But
            hopefully it can already make picking models a bit easier.
          </li>
          <li>
            Ideas for the future: public votes to compute an Elo rating,
            side-by-side comparison of two models, and community-submitted
            prompts (open to suggestions).
          </li>
          <li>
            For prompt suggestions, feedback, or just to say hi: vince [at]
            llmonitor.com
          </li>
          <li>
            {`Shameless plug: I'm building an `}
            <a
              href="https://github.com/llmonitor/llmonitor"
              target="_blank"
              rel="noopener noreferrer"
            >
              open-source observability tool for AI devs
            </a>
            .
          </li>
        </ul>
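
        {/*
          Hedged sketch, tied to the first note above: roughly how each test
          call could have been made, assuming an OpenAI-compatible chat
          completions endpoint. The endpoint, model name, and env var are
          illustrative placeholders; only temperature: 0 and max_tokens: 240
          come from the notes.

          const res = await fetch("https://api.openai.com/v1/chat/completions", {
            method: "POST",
            headers: {
              "Content-Type": "application/json",
              Authorization: "Bearer " + process.env.OPENAI_API_KEY,
            },
            body: JSON.stringify({
              model: "gpt-3.5-turbo",
              messages: [{ role: "user", content: prompt.text }],
              temperature: 0, // deterministic output
              max_tokens: 240, // hard cap, hence the cropped answers
            }),
          })
          const { choices } = await res.json()
          const answer = choices[0].message.content
        */}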
      </main>
    </>
  )
}