Update app.py

app.py CHANGED

@@ -20,7 +20,6 @@ import openai
 import threading
 import time
 from collections import Counter
-from model_suggestions import add_suggestion, get_suggestions_html
 from release_notes import get_release_notes_html
 
 
@@ -85,7 +84,7 @@ def call_ollama_api(model, prompt):
     )
 
     try:
-        logger.info("Starting API call")
+        logger.info(f"Starting API call for model: {model}")
         response = client.chat.completions.create(
             model=model,
             messages=[
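
For orientation, `client` here is an OpenAI-SDK client that the surrounding code points at a local Ollama server. A minimal sketch of that setup, assuming Ollama's documented OpenAI-compatible endpoint; the base URL, API key, and model name below are illustrative, not values taken from this diff:

```python
import openai

# Ollama serves an OpenAI-compatible API; this is its documented
# default address, assumed here rather than read from app.py.
client = openai.OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",  # Ollama ignores the key, but the SDK requires one
)

response = client.chat.completions.create(
    model="llama3.1",  # hypothetical model name
    messages=[{"role": "user", "content": "Say hello."}],
    timeout=180,       # same timeout the app uses below
)
print(response.choices[0].message.content)
```
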
@@ -100,10 +99,10 @@ def call_ollama_api(model, prompt):
             ],
             timeout=180
         )
-        logger.info("Received response")
+        logger.info(f"Received response for model: {model}")
 
         if not response or not response.choices:
-            logger.error("Empty response received")
+            logger.error(f"Empty response received for model: {model}")
             return [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": "Error: Empty response from the model"}
@@ -111,7 +110,7 @@ def call_ollama_api(model, prompt):
 
         content = response.choices[0].message.content
         if not content:
-            logger.error("Empty content received")
+            logger.error(f"Empty content received for model: {model}")
             return [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": "Error: Empty content from the model"}
@@ -124,30 +123,37 @@ def call_ollama_api(model, prompt):
             thinking_content = thinking_match.group(1).strip()
             main_content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
 
-            logger.info("Found thinking content")
+            logger.info(f"Found thinking content for model: {model}")
             return [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": f"{main_content}\n\n<details><summary>🤔 View thinking process</summary>\n\n{thinking_content}\n\n</details>"}
             ]
 
         # If no thinking tags, return normal content
-        logger.info("No thinking tags found")
+        logger.info(f"No thinking tags found for model: {model}")
         return [
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": content.strip()}
         ]
 
     except requests.exceptions.Timeout:
-        logger.error("Timeout error after 180 seconds")
+        logger.error(f"Timeout error after 180 seconds for model: {model}")
         return [
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": "Error: Model response timed out after 180 seconds"}
         ]
+    except openai.BadRequestError as e:
+        error_msg = str(e)
+        logger.error(f"Bad request error for model: {model}. Error: {error_msg}")
+        return [
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": "Error: Unable to get response from the model"}
+        ]
     except Exception as e:
-        logger.error(f"Error calling Ollama API: {str(e)}", exc_info=True)
+        logger.error(f"Error calling Ollama API for model: {model}. Error: {str(e)}", exc_info=True)
         return [
             {"role": "user", "content": prompt},
-            {"role": "assistant", "content":
+            {"role": "assistant", "content": "Error: Unable to get response from the model"}
         ]
 
 # Generate responses using two randomly selected models
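
The `<think>` handling in this hunk is self-contained enough to read as its own function. A minimal sketch of the same regex logic, assuming reasoning models wrap their scratch work in literal `<think>...</think>` tags; the function name is mine, not from app.py:

```python
import re

def split_thinking(content: str) -> tuple[str, str | None]:
    """Split a model reply into (main_content, thinking_content).

    thinking_content is None when no <think>...</think> block exists.
    """
    match = re.search(r'<think>(.*?)</think>', content, flags=re.DOTALL)
    if not match:
        return content.strip(), None
    thinking = match.group(1).strip()
    main = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
    return main, thinking

main, thinking = split_thinking("<think>2 + 2 = 4</think>The answer is 4.")
assert main == "The answer is 4."
assert thinking == "2 + 2 = 4"
```

`re.DOTALL` lets the thinking block span multiple lines, and the non-greedy `.*?` stops at the first closing tag rather than the last.
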
@@ -278,12 +284,11 @@ def record_vote(prompt, left_response, right_response, left_model, right_model,
     return (
         gr.update(value=result_message, visible=True), # Show result as Markdown
         get_leaderboard(), # Update leaderboard
-        get_elo_leaderboard(),
+        get_elo_leaderboard(), # Update ELO leaderboard
         gr.update(interactive=False), # Disable left vote button
         gr.update(interactive=False), # Disable right vote button
         gr.update(interactive=False), # Disable tie button
-        gr.update(visible=True),
-        get_leaderboard_chart() # Update leaderboard chart
+        gr.update(visible=True) # Show model names
     )
 
 def get_leaderboard_chart():
@@ -426,7 +431,20 @@ with gr.Blocks(css="""
 
     # Leaderboard Tab (now first)
     with gr.Tab("Leaderboard"):
-
+        gr.Markdown("""
+        ### Main Leaderboard
+        This leaderboard uses a scoring system that balances win rate and total battles. The score is calculated using the formula:
+        **Score = Win Rate * (1 - 1 / (Total Battles + 1))**
+
+        This formula rewards models with higher win rates and more battles. As the number of battles increases, the score approaches the win rate.
+        """)
+        leaderboard = gr.Dataframe(
+            headers=["Model", "Score", "Wins", "Losses", "Total Battles", "Win Rate"],
+            row_count=10,
+            col_count=6,
+            interactive=False,
+            label="Leaderboard"
+        )
 
     # Battle Arena Tab (now second)
     with gr.Tab("Battle Arena"):
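
As a sanity check on the formula in that Markdown block, here is the score computed for a couple of made-up records, assuming "Win Rate" means wins divided by total battles:

```python
def score(wins: int, losses: int) -> float:
    total = wins + losses
    win_rate = wins / total if total else 0.0
    return win_rate * (1 - 1 / (total + 1))

# A perfect 2-0 record scores below a strong 20-3 record, because the
# (1 - 1/(total + 1)) factor discounts small sample sizes:
print(round(score(2, 0), 3))   # 0.667 = 1.000 * (1 - 1/3)
print(round(score(20, 3), 3))  # 0.833 ≈ 0.870 * (1 - 1/24)
```

As battles accumulate, the discount factor approaches 1 and the score converges to the raw win rate, exactly as the tab copy claims.
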
@@ -473,50 +491,23 @@ with gr.Blocks(css="""
 
         new_battle_btn = gr.Button("New Battle")
 
-    # Performance Chart Tab
-    with gr.Tab("Performance Chart"):
-        leaderboard_chart = gr.Plot(label="Model Performance Chart")
-
     # ELO Leaderboard Tab
     with gr.Tab("ELO Leaderboard"):
-
-
-
-
-        with
-
-
-
-
-
-
-
-        suggestion_status = gr.Markdown("Submit a model to see it listed below!")
-        suggestions_list = gr.HTML(get_suggestions_html())
-        refresh_suggestions_btn = gr.Button("Refresh List")
-
-        # Update button click handlers
-        submit_suggestion_btn.click(
-            add_suggestion,
-            inputs=[model_url_input],
-            outputs=[suggestion_status]
-        ).then(
-            lambda: (
-                get_suggestions_html(), # Update suggestions list
-                "" # Clear model URL input
-            ),
-            outputs=[
-                suggestions_list,
-                model_url_input
-            ]
-        )
-
-        refresh_suggestions_btn.click(
-            get_suggestions_html,
-            outputs=[suggestions_list]
+        gr.Markdown("""
+        ### ELO Rating System
+        This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
+        Initial ratings are based on model size, with larger models starting at higher ratings.
+        The ELO rating is calculated based on wins and losses, with adjustments made based on the relative strengths of opponents.
+        """)
+        elo_leaderboard = gr.Dataframe(
+            headers=["Model", "ELO Rating", "Wins", "Losses", "Total Battles", "Win Rate"],
+            row_count=10,
+            col_count=6,
+            interactive=False,
+            label="ELO Leaderboard"
+        )
 
-    #
+    # Latest Updates Tab
     with gr.Tab("Latest Updates"):
         release_notes = gr.HTML(get_release_notes_html())
         refresh_notes_btn = gr.Button("Refresh Updates")
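
The diff shows only the tab copy for the ELO system, not the rating code itself. For reference, a minimal sketch of a standard Elo update; the size-based initial ratings this app describes would replace the flat starting ratings here, and the K-factor of 32 is a conventional choice, not one taken from app.py:

```python
def expected_score(rating_a: float, rating_b: float) -> float:
    """Probability that A beats B under the Elo model."""
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def elo_update(rating_a: float, rating_b: float, a_won: bool, k: float = 32.0):
    """Return updated (rating_a, rating_b) after one battle."""
    ea = expected_score(rating_a, rating_b)
    sa = 1.0 if a_won else 0.0
    # The (actual - expected) term is what adjusts for relative strength:
    # an underdog win moves ratings much more than an expected win.
    return rating_a + k * (sa - ea), rating_b + k * ((1.0 - sa) - (1.0 - ea))

a, b = elo_update(1000, 1200, a_won=True)  # upset win by the underdog
print(round(a), round(b))                  # ~1024, ~1176
```
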
@@ -541,14 +532,14 @@ with gr.Blocks(css="""
         lambda *args: record_vote(*args, "Left is better"),
         inputs=[prompt_input, left_output, right_output, left_model, right_model],
         outputs=[result, leaderboard, elo_leaderboard, left_vote_btn,
-                 right_vote_btn, tie_btn, model_names_row
+                 right_vote_btn, tie_btn, model_names_row]
     )
 
     right_vote_btn.click(
         lambda *args: record_vote(*args, "Right is better"),
         inputs=[prompt_input, left_output, right_output, left_model, right_model],
         outputs=[result, leaderboard, elo_leaderboard, left_vote_btn,
-                 right_vote_btn, tie_btn, model_names_row
+                 right_vote_btn, tie_btn, model_names_row]
     )
 
     tie_btn.click(
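
These bracket fixes matter because Gradio matches a handler's return values to its `outputs` list positionally: `record_vote` now returns seven values (see the `record_vote` hunk above), and each `click` lists exactly seven components. A toy illustration of that one-to-one contract, with made-up component names:

```python
import gradio as gr

with gr.Blocks() as demo:
    box_a = gr.Textbox(label="A")
    box_b = gr.Textbox(label="B")
    btn = gr.Button("Go")

    # The returned tuple must line up one-to-one with `outputs`;
    # dropping a component (or a closing bracket) shifts every value.
    btn.click(
        lambda: ("first value", "second value"),
        outputs=[box_a, box_b],
    )

demo.launch()
```
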
@@ -561,13 +552,12 @@ with gr.Blocks(css="""
         new_battle,
         outputs=[prompt_input, left_output, right_output, left_model,
                  right_model, left_vote_btn, right_vote_btn, tie_btn,
-                 result, leaderboard, model_names_row,
+                 result, leaderboard, model_names_row, tie_count]
     )
 
-    # Update leaderboard
+    # Update leaderboard on launch
     demo.load(get_leaderboard, outputs=leaderboard)
     demo.load(get_elo_leaderboard, outputs=elo_leaderboard)
-    demo.load(get_leaderboard_chart, outputs=leaderboard_chart)
 
 if __name__ == "__main__":
     # Initialize ELO ratings before launching the app