Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -31,6 +31,8 @@ from common import (
|
|
| 31 |
BATTLE_RULES,
|
| 32 |
EVAL_DESCRIPTION,
|
| 33 |
VOTING_HEADER,
|
|
|
|
|
|
|
| 34 |
)
|
| 35 |
from leaderboard import (
|
| 36 |
get_leaderboard,
|
|
@@ -153,8 +155,10 @@ def get_ip(request: gr.Request) -> str:
|
|
| 153 |
return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
| 154 |
|
| 155 |
|
| 156 |
-
def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
|
| 157 |
-
"""Generate appropriate message based on vote and model rankings.
|
|
|
|
|
|
|
| 158 |
voting_data = get_current_votes()
|
| 159 |
leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
|
| 160 |
rankings = get_model_rankings(leaderboard)
|
|
@@ -162,19 +166,13 @@ def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
|
|
| 162 |
pos_b = rankings.get(model_b, 0)
|
| 163 |
|
| 164 |
if choice == "Tie":
|
| 165 |
-
return
|
| 166 |
-
|
| 167 |
-
# Get chosen and rejected models based on vote
|
| 168 |
-
model_chosen = model_a if choice == "A" else model_b
|
| 169 |
-
model_rejected = model_b if choice == "A" else model_a
|
| 170 |
-
pos_chosen = pos_a if choice == "A" else pos_b
|
| 171 |
-
pos_rejected = pos_b if choice == "A" else pos_a
|
| 172 |
|
| 173 |
# Check if vote aligns with leaderboard
|
| 174 |
if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
|
| 175 |
-
return
|
| 176 |
else:
|
| 177 |
-
return
|
| 178 |
|
| 179 |
|
| 180 |
def vote(
|
|
@@ -227,19 +225,38 @@ def vote(
|
|
| 227 |
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
|
| 228 |
)
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
# Generate vote message
|
| 231 |
-
message = get_vote_message(choice, model_a, model_b)
|
| 232 |
|
| 233 |
-
# Return updates for UI components
|
| 234 |
return [
|
| 235 |
gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
|
| 236 |
gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
|
| 237 |
gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
|
| 238 |
-
gr.update(value=
|
| 239 |
-
gr.update(value=
|
| 240 |
gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
|
| 241 |
gr.update(value="🎲 New round", variant="primary"), # random_btn
|
| 242 |
-
gr.Info(message, title
|
| 243 |
]
|
| 244 |
|
| 245 |
|
|
@@ -311,7 +328,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 311 |
with gr.Column(scale=1):
|
| 312 |
with gr.Group():
|
| 313 |
human_input = gr.TextArea(
|
| 314 |
-
label="👩
|
| 315 |
lines=10,
|
| 316 |
placeholder="Enter the human message here..."
|
| 317 |
)
|
|
@@ -368,12 +385,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 368 |
|
| 369 |
gr.Markdown("<br>")
|
| 370 |
|
| 371 |
-
#
|
| 372 |
with gr.Accordion("π Evaluator Prompt", open=False):
|
| 373 |
-
gr.
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
with gr.TabItem("Leaderboard"):
|
| 379 |
with gr.Row():
|
|
@@ -406,11 +429,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 406 |
|
| 407 |
with gr.TabItem("Policy"):
|
| 408 |
gr.Markdown(POLICY_CONTENT)
|
|
|
|
| 409 |
|
| 410 |
# Define state variables for model tracking
|
| 411 |
model_a_state = gr.State()
|
| 412 |
model_b_state = gr.State()
|
| 413 |
final_prompt_state = gr.State()
|
|
|
|
|
|
|
| 414 |
|
| 415 |
# Update variable inputs based on the eval prompt
|
| 416 |
#def update_variables(eval_prompt):
|
|
@@ -550,12 +576,50 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 550 |
],
|
| 551 |
)
|
| 552 |
|
| 553 |
-
#
|
| 554 |
-
def
|
| 555 |
-
|
| 556 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
(
|
| 560 |
response_a,
|
| 561 |
response_b,
|
|
@@ -564,18 +628,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 564 |
model_a,
|
| 565 |
model_b,
|
| 566 |
final_prompt,
|
| 567 |
-
) = submit_prompt(
|
| 568 |
|
| 569 |
# Parse the responses
|
| 570 |
score_a, critique_a = parse_model_response(response_a)
|
| 571 |
score_b, critique_b = parse_model_response(response_b)
|
| 572 |
|
| 573 |
-
#
|
| 574 |
-
|
| 575 |
-
|
|
|
|
| 576 |
|
| 577 |
# Update the last_submission state with the current values
|
| 578 |
-
last_submission.value =
|
| 579 |
|
| 580 |
return (
|
| 581 |
score_a,
|
|
@@ -598,9 +663,10 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 598 |
gr.update(value="🎲"), # random_btn
|
| 599 |
)
|
| 600 |
|
|
|
|
| 601 |
send_btn.click(
|
| 602 |
fn=submit_and_store,
|
| 603 |
-
inputs=[
|
| 604 |
outputs=[
|
| 605 |
score_a,
|
| 606 |
critique_a,
|
|
|
|
| 31 |
BATTLE_RULES,
|
| 32 |
EVAL_DESCRIPTION,
|
| 33 |
VOTING_HEADER,
|
| 34 |
+
DEFAULT_EVAL_PROMPT_EDITABLE,
|
| 35 |
+
FIXED_EVAL_SUFFIX,
|
| 36 |
)
|
| 37 |
from leaderboard import (
|
| 38 |
get_leaderboard,
|
|
|
|
| 155 |
return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
| 156 |
|
| 157 |
|
| 158 |
+
def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
|
| 159 |
+
"""Generate appropriate message based on vote and model rankings.
|
| 160 |
+
Returns (title, message) tuple."""
|
| 161 |
+
# Get current rankings
|
| 162 |
voting_data = get_current_votes()
|
| 163 |
leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
|
| 164 |
rankings = get_model_rankings(leaderboard)
|
|
|
|
| 166 |
pos_b = rankings.get(model_b, 0)
|
| 167 |
|
| 168 |
if choice == "Tie":
|
| 169 |
+
return "It's a tie!", "Keep voting responsibly π€"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
# Check if vote aligns with leaderboard
|
| 172 |
if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
|
| 173 |
+
return "The favourite wins!", "Keep voting responsibly π€"
|
| 174 |
else:
|
| 175 |
+
return "The underdog wins!", "Keep voting responsibly π€"
|
| 176 |
|
| 177 |
|
| 178 |
def vote(
|
|
|
|
| 225 |
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
|
| 226 |
)
|
| 227 |
|
| 228 |
+
# Get model positions for display
|
| 229 |
+
voting_data = get_current_votes()
|
| 230 |
+
leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
|
| 231 |
+
rankings = get_model_rankings(leaderboard)
|
| 232 |
+
pos_a = rankings.get(model_a, 0)
|
| 233 |
+
pos_b = rankings.get(model_b, 0)
|
| 234 |
+
|
| 235 |
+
# Format model names with positions and win/loss indicators
|
| 236 |
+
if choice == "Tie":
|
| 237 |
+
model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
|
| 238 |
+
model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
|
| 239 |
+
else:
|
| 240 |
+
winner = model_a if choice == "A" else model_b
|
| 241 |
+
loser = model_b if choice == "A" else model_a
|
| 242 |
+
winner_pos = pos_a if choice == "A" else pos_b
|
| 243 |
+
loser_pos = pos_b if choice == "A" else pos_a
|
| 244 |
+
|
| 245 |
+
model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
|
| 246 |
+
model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
|
| 247 |
+
|
| 248 |
# Generate vote message
|
| 249 |
+
title, message = get_vote_message(choice, model_a, model_b)
|
| 250 |
|
|
|
|
| 251 |
return [
|
| 252 |
gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
|
| 253 |
gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
|
| 254 |
gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
|
| 255 |
+
gr.update(value=model_a_display), # model_name_a
|
| 256 |
+
gr.update(value=model_b_display), # model_name_b
|
| 257 |
gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
|
| 258 |
gr.update(value="🎲 New round", variant="primary"), # random_btn
|
| 259 |
+
gr.Info(message, title=title), # success message
|
| 260 |
]
|
| 261 |
|
| 262 |
|
|
|
|
| 328 |
with gr.Column(scale=1):
|
| 329 |
with gr.Group():
|
| 330 |
human_input = gr.TextArea(
|
| 331 |
+
label="👩 User Input",
|
| 332 |
lines=10,
|
| 333 |
placeholder="Enter the human message here..."
|
| 334 |
)
|
|
|
|
| 385 |
|
| 386 |
gr.Markdown("<br>")
|
| 387 |
|
| 388 |
+
# Update Evaluator Prompt Accordion
|
| 389 |
with gr.Accordion("π Evaluator Prompt", open=False):
|
| 390 |
+
eval_prompt_editable = gr.TextArea(
|
| 391 |
+
value=DEFAULT_EVAL_PROMPT_EDITABLE,
|
| 392 |
+
label="Evaluation Criteria",
|
| 393 |
+
lines=12
|
| 394 |
+
)
|
| 395 |
+
with gr.Row(visible=False) as edit_buttons_row: # Make buttons row initially hidden
|
| 396 |
+
cancel_prompt_btn = gr.Button("Cancel")
|
| 397 |
+
save_prompt_btn = gr.Button("Save", variant="primary")
|
| 398 |
+
gr.Markdown("*The sample being evaluated is always appended as:*")
|
| 399 |
+
gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
|
| 400 |
|
| 401 |
with gr.TabItem("Leaderboard"):
|
| 402 |
with gr.Row():
|
|
|
|
| 429 |
|
| 430 |
with gr.TabItem("Policy"):
|
| 431 |
gr.Markdown(POLICY_CONTENT)
|
| 432 |
+
gr.Markdown(ACKNOWLEDGEMENTS)
|
| 433 |
|
| 434 |
# Define state variables for model tracking
|
| 435 |
model_a_state = gr.State()
|
| 436 |
model_b_state = gr.State()
|
| 437 |
final_prompt_state = gr.State()
|
| 438 |
+
eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
|
| 439 |
+
is_editing = gr.State(False) # Track editing state
|
| 440 |
|
| 441 |
# Update variable inputs based on the eval prompt
|
| 442 |
#def update_variables(eval_prompt):
|
|
|
|
| 576 |
],
|
| 577 |
)
|
| 578 |
|
| 579 |
+
# Add handlers for save/cancel buttons
|
| 580 |
+
def save_prompt(new_prompt, previous_prompt):
|
| 581 |
+
return [
|
| 582 |
+
gr.update(value=new_prompt), # Update the prompt
|
| 583 |
+
new_prompt, # Update the previous prompt state
|
| 584 |
+
gr.update(visible=False) # Hide the buttons
|
| 585 |
+
]
|
| 586 |
+
|
| 587 |
+
def cancel_prompt(previous_prompt):
|
| 588 |
+
return [
|
| 589 |
+
gr.update(value=previous_prompt), # Revert to previous prompt
|
| 590 |
+
previous_prompt, # Keep the previous prompt state
|
| 591 |
+
gr.update(visible=False) # Hide the buttons
|
| 592 |
+
]
|
| 593 |
+
|
| 594 |
+
def show_edit_buttons(current_value, previous_value):
|
| 595 |
+
# Show buttons only if the current value differs from the previous value
|
| 596 |
+
return gr.update(visible=current_value != previous_value)
|
| 597 |
+
|
| 598 |
+
# Add handlers for save/cancel buttons and prompt changes
|
| 599 |
+
save_prompt_btn.click(
|
| 600 |
+
fn=save_prompt,
|
| 601 |
+
inputs=[eval_prompt_editable, eval_prompt_previous],
|
| 602 |
+
outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
cancel_prompt_btn.click(
|
| 606 |
+
fn=cancel_prompt,
|
| 607 |
+
inputs=[eval_prompt_previous],
|
| 608 |
+
outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
|
| 609 |
+
)
|
| 610 |
+
|
| 611 |
+
eval_prompt_editable.change(
|
| 612 |
+
fn=show_edit_buttons,
|
| 613 |
+
inputs=[eval_prompt_editable, eval_prompt_previous],
|
| 614 |
+
outputs=edit_buttons_row
|
| 615 |
+
)
|
| 616 |
|
| 617 |
+
# Update the submit function to combine editable and fixed parts
|
| 618 |
+
def submit_and_store(editable_prompt, *variables):
|
| 619 |
+
# Combine the editable prompt with fixed suffix
|
| 620 |
+
full_prompt = editable_prompt + FIXED_EVAL_SUFFIX
|
| 621 |
+
|
| 622 |
+
# Get the responses using the full prompt
|
| 623 |
(
|
| 624 |
response_a,
|
| 625 |
response_b,
|
|
|
|
| 628 |
model_a,
|
| 629 |
model_b,
|
| 630 |
final_prompt,
|
| 631 |
+
) = submit_prompt(full_prompt, *variables)
|
| 632 |
|
| 633 |
# Parse the responses
|
| 634 |
score_a, critique_a = parse_model_response(response_a)
|
| 635 |
score_b, critique_b = parse_model_response(response_b)
|
| 636 |
|
| 637 |
+
# Only append "/ 5" if using the default prompt
|
| 638 |
+
if editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
|
| 639 |
+
score_a = f"{score_a} / 5"
|
| 640 |
+
score_b = f"{score_b} / 5"
|
| 641 |
|
| 642 |
# Update the last_submission state with the current values
|
| 643 |
+
last_submission.value = {"prompt": full_prompt, "variables": variables}
|
| 644 |
|
| 645 |
return (
|
| 646 |
score_a,
|
|
|
|
| 663 |
gr.update(value="🎲"), # random_btn
|
| 664 |
)
|
| 665 |
|
| 666 |
+
# Update the click handler to use the editable prompt
|
| 667 |
send_btn.click(
|
| 668 |
fn=submit_and_store,
|
| 669 |
+
inputs=[eval_prompt_editable, human_input, ai_response],
|
| 670 |
outputs=[
|
| 671 |
score_a,
|
| 672 |
critique_a,
|