import db, { getModels } from "@/utils/db"
import Link from "next/link"

export default async function Leaderboard() {
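  // Maximum points attainable across all rubrics; used below as the
  // denominator for each model's percentage score.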
  const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`
  const models = await getModels()

  return (
    <>
      <p>
        Traditional LLM benchmarks have drawbacks: they quickly become part of
        training datasets, and they are hard to relate to real-world use cases.
      </p>
      <p>
        I made this as an experiment to address these issues. Here, the dataset
        is dynamic (it changes every week) and is composed of crowdsourced,
        real-world prompts.
      </p>
      <p>
        We then use GPT-4 to grade each model's response against a set of
        rubrics (more details on the about page). Because the score is a single
        dimension, the prompt dataset remains easy to explore.
      </p>
      <p>
        The results are stored in a Postgres database, and what you see here
        are the raw results.
      </p>
      <br />
      <table style={{ maxWidth: 600 }}>
        <thead>
          <tr>
            <th width={70}>Rank</th>
            <th width={250}>Model</th>
            <th>Score</th>
            <th>Results</th>
          </tr>
        </thead>
        <tbody>
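          {/* Only models with at least one graded response are ranked;
              the score is shown as a percentage of the maximum points. */}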
          {models
            .filter((s) => s.total_score)
            .map((model, i) => (
              <tr key={i}>
                <td>{model.rank}</td>
                <td>{model.name}</td>
                <td>
                  {Math.floor((model.total_score / potentialPoints.total) * 100)}
                </td>
                <td>
                  <Link
                    href={`/${model.api_id.split("/").pop().toLowerCase()}`}
                  >
                    view
                  </Link>
                </td>
              </tr>
            ))}
        </tbody>
      </table>
    </>
  )
}
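For context, here is a minimal sketch of what the "@/utils/db" helper imported above might look like, assuming the postgres.js client. The DATABASE_URL variable, the models/results table and column names, and the ranking query are illustrative assumptions, not this project's actual code.

// Hypothetical utils/db.js (a sketch, not the project's real helper)
import postgres from "postgres"

// Tagged-template SQL client: db`SELECT ...` resolves to an array of rows,
// which matches the `const [potentialPoints] = await db`...`` usage above.
const db = postgres(process.env.DATABASE_URL)

// Assumed schema: a `models` table plus a `results` table holding one graded
// score per (model, prompt) pair. Sums each model's points and ranks them.
export async function getModels() {
  return db`
    SELECT
      models.name,
      models.api_id,
      SUM(results.score) AS total_score,
      RANK() OVER (ORDER BY SUM(results.score) DESC NULLS LAST) AS rank
    FROM models
    LEFT JOIN results ON results.model_id = models.id
    GROUP BY models.id, models.name, models.api_id
    ORDER BY rank
  `
}

export default db

Under this assumption, rank and total_score arrive precomputed from the database, so the component only needs the extra SUM(points) query to obtain the denominator for the percentage.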