Spaces:
Sleeping
Sleeping
Cleaning up UI.
Browse files
app.py
CHANGED
|
@@ -199,27 +199,31 @@ class QuizApp:
|
|
| 199 |
|
| 200 |
def format_answer(self, answer: str, dataset_name: str) -> str:
|
| 201 |
"""Format answer based on dataset type for better readability"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
if dataset_name == "openai/gsm8k":
|
| 203 |
-
# GSM8K has specific formatting with equations and final answer
|
| 204 |
-
# Replace <<...>> with proper math formatting
|
| 205 |
-
import re
|
| 206 |
-
|
| 207 |
-
# Convert <<equation>> to LaTeX
|
| 208 |
-
answer = re.sub(r"<<([^>]+)>>", r"$\\1$", answer)
|
| 209 |
# Format the final answer line
|
| 210 |
-
answer = answer.replace("####", "\n\
|
| 211 |
-
# Ensure proper line breaks
|
| 212 |
-
answer =
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
# MMLU answers are usually single letters or short phrases
|
| 216 |
-
return answer
|
| 217 |
-
elif dataset_name == "rajpurkar/squad":
|
| 218 |
-
# SQuAD answers might need context
|
| 219 |
-
return answer
|
| 220 |
-
else:
|
| 221 |
-
# Default formatting for other datasets
|
| 222 |
-
return answer
|
| 223 |
|
| 224 |
def check_answer(self, user_answer: str) -> Tuple[bool, str]:
|
| 225 |
"""Check if the user's answer is correct"""
|
|
@@ -242,7 +246,7 @@ class QuizApp:
|
|
| 242 |
is_correct = user_letter == correct_letter
|
| 243 |
|
| 244 |
if is_correct:
|
| 245 |
-
return True,
|
| 246 |
else:
|
| 247 |
choices = question_data[config["choices_field"]]
|
| 248 |
correct_choice = (
|
|
@@ -256,7 +260,7 @@ class QuizApp:
|
|
| 256 |
)
|
| 257 |
return (
|
| 258 |
False,
|
| 259 |
-
f
|
| 260 |
)
|
| 261 |
|
| 262 |
elif question_type == "true_false":
|
|
@@ -265,11 +269,11 @@ class QuizApp:
|
|
| 265 |
is_correct = user_bool == correct_answer
|
| 266 |
|
| 267 |
if is_correct:
|
| 268 |
-
return True,
|
| 269 |
else:
|
| 270 |
return (
|
| 271 |
False,
|
| 272 |
-
f
|
| 273 |
)
|
| 274 |
|
| 275 |
elif question_type == "binary_choice":
|
|
@@ -278,7 +282,7 @@ class QuizApp:
|
|
| 278 |
is_correct = user_idx == correct_answer_idx
|
| 279 |
|
| 280 |
if is_correct:
|
| 281 |
-
return True,
|
| 282 |
else:
|
| 283 |
correct_letter = "A" if correct_answer_idx == 0 else "B"
|
| 284 |
option_field = (
|
|
@@ -293,7 +297,7 @@ class QuizApp:
|
|
| 293 |
)
|
| 294 |
return (
|
| 295 |
False,
|
| 296 |
-
f
|
| 297 |
)
|
| 298 |
|
| 299 |
elif question_type in ["qa", "extractive_qa"]:
|
|
@@ -325,36 +329,44 @@ class QuizApp:
|
|
| 325 |
else:
|
| 326 |
final_answer = correct_answer
|
| 327 |
|
| 328 |
-
#
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
# Check if answers match
|
| 333 |
-
is_correct = False
|
| 334 |
-
|
| 335 |
-
# If both have numbers, compare the numbers
|
| 336 |
-
if correct_numbers and user_numbers:
|
| 337 |
-
# Convert to float for comparison to handle decimals
|
| 338 |
-
try:
|
| 339 |
-
correct_num = float(
|
| 340 |
-
correct_numbers[-1]
|
| 341 |
-
) # Take the last number as final answer
|
| 342 |
-
user_num = float(user_numbers[-1]) # Take the last number from user
|
| 343 |
-
is_correct = (
|
| 344 |
-
abs(correct_num - user_num) < 0.0001
|
| 345 |
-
) # Small tolerance for float comparison
|
| 346 |
-
except ValueError:
|
| 347 |
-
# Fall back to string comparison
|
| 348 |
-
is_correct = correct_numbers[-1] == user_numbers[-1]
|
| 349 |
else:
|
| 350 |
-
#
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
if is_correct:
|
| 357 |
-
return True,
|
| 358 |
else:
|
| 359 |
logging.info(f"Raw answer (QA): {repr(correct_answer)}")
|
| 360 |
logging.info(f"Extracted final answer: {repr(final_answer)}")
|
|
@@ -364,9 +376,11 @@ class QuizApp:
|
|
| 364 |
formatted_answer = self.format_answer(
|
| 365 |
correct_answer, self.current_dataset_name
|
| 366 |
)
|
|
|
|
|
|
|
| 367 |
return (
|
| 368 |
False,
|
| 369 |
-
f
|
| 370 |
)
|
| 371 |
|
| 372 |
return False, "Unknown question type"
|
|
@@ -399,11 +413,11 @@ def start_quiz(dataset_choice: str, num_questions: int):
|
|
| 399 |
if not dataset_id:
|
| 400 |
return (
|
| 401 |
"Please select a dataset",
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
gr.update(visible=False),
|
| 405 |
-
gr.update(visible=False),
|
| 406 |
-
|
| 407 |
)
|
| 408 |
|
| 409 |
success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
|
|
@@ -414,29 +428,29 @@ def start_quiz(dataset_choice: str, num_questions: int):
|
|
| 414 |
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
|
| 415 |
return (
|
| 416 |
message,
|
| 417 |
-
question,
|
| 418 |
-
gr.update(choices=choices, visible=True, value=None),
|
| 419 |
-
gr.update(visible=False),
|
| 420 |
-
gr.update(visible=True),
|
| 421 |
-
f"Question 1/{quiz_app.total_questions}",
|
| 422 |
)
|
| 423 |
else:
|
| 424 |
return (
|
| 425 |
message,
|
| 426 |
-
question,
|
| 427 |
-
gr.update(visible=False),
|
| 428 |
-
gr.update(visible=True, value=""),
|
| 429 |
-
gr.update(visible=True),
|
| 430 |
-
f"Question 1/{quiz_app.total_questions}",
|
| 431 |
)
|
| 432 |
else:
|
| 433 |
return (
|
| 434 |
message,
|
| 435 |
-
|
| 436 |
-
gr.update(visible=False),
|
| 437 |
-
gr.update(visible=False),
|
| 438 |
-
gr.update(visible=False),
|
| 439 |
-
|
| 440 |
)
|
| 441 |
|
| 442 |
|
|
@@ -462,7 +476,7 @@ def next_question():
|
|
| 462 |
|
| 463 |
if quiz_app.current_question_idx >= quiz_app.total_questions:
|
| 464 |
# Quiz complete
|
| 465 |
-
final_score = f
|
| 466 |
return (
|
| 467 |
gr.update(value=final_score, visible=True),
|
| 468 |
"",
|
|
@@ -478,22 +492,22 @@ def next_question():
|
|
| 478 |
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
|
| 479 |
return (
|
| 480 |
gr.update(value="", visible=False), # Clear feedback
|
| 481 |
-
question,
|
| 482 |
gr.update(choices=choices, visible=True, value=None),
|
| 483 |
gr.update(visible=False),
|
| 484 |
gr.update(visible=True),
|
| 485 |
gr.update(visible=False),
|
| 486 |
-
f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}",
|
| 487 |
)
|
| 488 |
else:
|
| 489 |
return (
|
| 490 |
gr.update(value="", visible=False), # Clear feedback
|
| 491 |
-
question,
|
| 492 |
gr.update(visible=False),
|
| 493 |
gr.update(visible=True, value=""),
|
| 494 |
gr.update(visible=True),
|
| 495 |
gr.update(visible=False),
|
| 496 |
-
f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}",
|
| 497 |
)
|
| 498 |
|
| 499 |
|
|
@@ -504,33 +518,40 @@ with gr.Blocks(title="HuggingFace Evaluation Dataset Quiz") as demo:
|
|
| 504 |
"Test yourself with questions from popular HuggingFace evaluation datasets!"
|
| 505 |
)
|
| 506 |
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
)
|
| 518 |
-
|
| 519 |
-
start_button = gr.Button("Start Quiz", variant="primary")
|
| 520 |
-
status_message = gr.Textbox(label="Status", interactive=False)
|
| 521 |
-
|
| 522 |
-
with gr.Tab("Quiz"):
|
| 523 |
-
progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False)
|
| 524 |
-
question_display = gr.Textbox(label="Question", lines=5, interactive=False)
|
| 525 |
-
|
| 526 |
-
# Answer inputs (one will be visible at a time)
|
| 527 |
-
answer_radio = gr.Radio(label="Select your answer", visible=False)
|
| 528 |
-
answer_textbox = gr.Textbox(label="Type your answer", visible=False)
|
| 529 |
-
|
| 530 |
-
submit_button = gr.Button("Submit Answer", variant="primary", visible=False)
|
| 531 |
|
| 532 |
-
|
| 533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
|
| 535 |
# Connect events
|
| 536 |
start_button.click(
|
|
|
|
| 199 |
|
| 200 |
def format_answer(self, answer: str, dataset_name: str) -> str:
    """Format a reference answer for display, based on the dataset type.

    Calculator annotations of the form ``<<calculation=result>>`` are
    rewritten as ``result (=calculation)``. Annotations that do not split
    into exactly two parts on ``=`` are shown as ``[annotation]``.

    GSM8K solutions repeat the result right after the annotation
    (``<<48/2=24>>24 clips``). That repeated copy is dropped so the output
    reads ``24 (=48/2) clips`` instead of ``24 (=48/2)24 clips``.

    Args:
        answer: Raw answer text taken from the dataset.
        dataset_name: Dataset identifier; ``"openai/gsm8k"`` enables the
            GSM8K-specific final-answer and line-break formatting.

    Returns:
        The answer text, reformatted for readability.
    """
    import re

    def format_equation(match):
        equation = match.group(1)
        # Text glued directly onto the closing ">>" (no whitespace between).
        trailing = match.group(2)

        parts = equation.split("=")
        if len(parts) != 2:
            # Not a plain "calculation=result" annotation; show it verbatim.
            return f"[{equation}]{trailing}"

        calculation, result = parts
        if result and trailing.startswith(result):
            remainder = trailing[len(result):]
            # Only drop the copy when it is the whole number, i.e. it is not
            # merely a prefix of a longer number such as "240" or "24.5".
            if not re.match(r"\d|[.,]\d", remainder):
                trailing = remainder
        return f"{result} (={calculation}){trailing}"

    # The second group captures whatever follows the annotation up to the
    # next whitespace or "<", so adjacent annotations are still matched
    # individually.
    answer = re.sub(r"<<([^>]+)>>([^\s<]*)", format_equation, answer)

    # Dataset-specific formatting
    if dataset_name == "openai/gsm8k":
        # Format the final answer line
        answer = answer.replace("####", "\n\nFinal Answer:")
        # Ensure proper line breaks after periods for readability
        answer = re.sub(r'\. (?=[A-Z])', '.\n', answer)

    return answer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
def check_answer(self, user_answer: str) -> Tuple[bool, str]:
|
| 229 |
"""Check if the user's answer is correct"""
|
|
|
|
| 246 |
is_correct = user_letter == correct_letter
|
| 247 |
|
| 248 |
if is_correct:
|
| 249 |
+
return True, '✅ Correct!'
|
| 250 |
else:
|
| 251 |
choices = question_data[config["choices_field"]]
|
| 252 |
correct_choice = (
|
|
|
|
| 260 |
)
|
| 261 |
return (
|
| 262 |
False,
|
| 263 |
+
f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
|
| 264 |
)
|
| 265 |
|
| 266 |
elif question_type == "true_false":
|
|
|
|
| 269 |
is_correct = user_bool == correct_answer
|
| 270 |
|
| 271 |
if is_correct:
|
| 272 |
+
return True, '✅ Correct!'
|
| 273 |
else:
|
| 274 |
return (
|
| 275 |
False,
|
| 276 |
+
f'❌ Incorrect\n\nThe correct answer was {correct_answer}',
|
| 277 |
)
|
| 278 |
|
| 279 |
elif question_type == "binary_choice":
|
|
|
|
| 282 |
is_correct = user_idx == correct_answer_idx
|
| 283 |
|
| 284 |
if is_correct:
|
| 285 |
+
return True, '✅ Correct!'
|
| 286 |
else:
|
| 287 |
correct_letter = "A" if correct_answer_idx == 0 else "B"
|
| 288 |
option_field = (
|
|
|
|
| 297 |
)
|
| 298 |
return (
|
| 299 |
False,
|
| 300 |
+
f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
|
| 301 |
)
|
| 302 |
|
| 303 |
elif question_type in ["qa", "extractive_qa"]:
|
|
|
|
| 329 |
else:
|
| 330 |
final_answer = correct_answer
|
| 331 |
|
| 332 |
+
# First check if user answer is empty
|
| 333 |
+
if not user_answer or not user_answer.strip():
|
| 334 |
+
is_correct = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
else:
|
| 336 |
+
# Extract numbers from both answers for comparison
|
| 337 |
+
correct_numbers = re.findall(r"-?\d+\.?\d*", final_answer)
|
| 338 |
+
user_numbers = re.findall(r"-?\d+\.?\d*", user_answer)
|
| 339 |
+
|
| 340 |
+
# Check if answers match
|
| 341 |
+
is_correct = False
|
| 342 |
+
|
| 343 |
+
# If both have numbers, compare the numbers
|
| 344 |
+
if correct_numbers and user_numbers:
|
| 345 |
+
# Convert to float for comparison to handle decimals
|
| 346 |
+
try:
|
| 347 |
+
correct_num = float(
|
| 348 |
+
correct_numbers[-1]
|
| 349 |
+
) # Take the last number as final answer
|
| 350 |
+
user_num = float(user_numbers[-1]) # Take the last number from user
|
| 351 |
+
is_correct = (
|
| 352 |
+
abs(correct_num - user_num) < 0.0001
|
| 353 |
+
) # Small tolerance for float comparison
|
| 354 |
+
except ValueError:
|
| 355 |
+
# Fall back to string comparison
|
| 356 |
+
is_correct = correct_numbers[-1] == user_numbers[-1]
|
| 357 |
+
elif correct_numbers and not user_numbers:
|
| 358 |
+
# If correct answer has numbers but user answer doesn't, it's wrong
|
| 359 |
+
is_correct = False
|
| 360 |
+
else:
|
| 361 |
+
# Fall back to substring matching for non-numeric answers
|
| 362 |
+
# But ensure both strings are non-empty
|
| 363 |
+
is_correct = (
|
| 364 |
+
user_answer.lower().strip() in correct_answer.lower()
|
| 365 |
+
or correct_answer.lower() in user_answer.lower().strip()
|
| 366 |
+
) and len(user_answer.strip()) > 0
|
| 367 |
|
| 368 |
if is_correct:
|
| 369 |
+
return True, '✅ Correct!'
|
| 370 |
else:
|
| 371 |
logging.info(f"Raw answer (QA): {repr(correct_answer)}")
|
| 372 |
logging.info(f"Extracted final answer: {repr(final_answer)}")
|
|
|
|
| 376 |
formatted_answer = self.format_answer(
|
| 377 |
correct_answer, self.current_dataset_name
|
| 378 |
)
|
| 379 |
+
# Debug: log the formatted answer
|
| 380 |
+
logging.info(f"Formatted answer with LaTeX: {repr(formatted_answer)}")
|
| 381 |
return (
|
| 382 |
False,
|
| 383 |
+
f'❌ Incorrect\n\nThe correct answer was:\n\n{formatted_answer}',
|
| 384 |
)
|
| 385 |
|
| 386 |
return False, "Unknown question type"
|
|
|
|
| 413 |
if not dataset_id:
|
| 414 |
return (
|
| 415 |
"Please select a dataset",
|
| 416 |
+
gr.update(visible=False), # question_display
|
| 417 |
+
gr.update(visible=False), # answer_radio
|
| 418 |
+
gr.update(visible=False), # answer_textbox
|
| 419 |
+
gr.update(visible=False), # submit_button
|
| 420 |
+
gr.update(visible=False), # progress_text
|
| 421 |
)
|
| 422 |
|
| 423 |
success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
|
|
|
|
| 428 |
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
|
| 429 |
return (
|
| 430 |
message,
|
| 431 |
+
gr.update(value=question, visible=True), # question_display
|
| 432 |
+
gr.update(choices=choices, visible=True, value=None), # answer_radio
|
| 433 |
+
gr.update(visible=False), # answer_textbox
|
| 434 |
+
gr.update(visible=True), # submit_button
|
| 435 |
+
gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True), # progress_text
|
| 436 |
)
|
| 437 |
else:
|
| 438 |
return (
|
| 439 |
message,
|
| 440 |
+
gr.update(value=question, visible=True), # question_display
|
| 441 |
+
gr.update(visible=False), # answer_radio
|
| 442 |
+
gr.update(visible=True, value=""), # answer_textbox
|
| 443 |
+
gr.update(visible=True), # submit_button
|
| 444 |
+
gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True), # progress_text
|
| 445 |
)
|
| 446 |
else:
|
| 447 |
return (
|
| 448 |
message,
|
| 449 |
+
gr.update(visible=False), # question_display
|
| 450 |
+
gr.update(visible=False), # answer_radio
|
| 451 |
+
gr.update(visible=False), # answer_textbox
|
| 452 |
+
gr.update(visible=False), # submit_button
|
| 453 |
+
gr.update(visible=False), # progress_text
|
| 454 |
)
|
| 455 |
|
| 456 |
|
|
|
|
| 476 |
|
| 477 |
if quiz_app.current_question_idx >= quiz_app.total_questions:
|
| 478 |
# Quiz complete
|
| 479 |
+
final_score = f'🎉 Quiz Complete!\n\nYour score: {quiz_app.score}/{quiz_app.total_questions} ({quiz_app.score / quiz_app.total_questions * 100:.1f}%)'
|
| 480 |
return (
|
| 481 |
gr.update(value=final_score, visible=True),
|
| 482 |
"",
|
|
|
|
| 492 |
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
|
| 493 |
return (
|
| 494 |
gr.update(value="", visible=False), # Clear feedback
|
| 495 |
+
gr.update(value=question), # question_display
|
| 496 |
gr.update(choices=choices, visible=True, value=None),
|
| 497 |
gr.update(visible=False),
|
| 498 |
gr.update(visible=True),
|
| 499 |
gr.update(visible=False),
|
| 500 |
+
gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
|
| 501 |
)
|
| 502 |
else:
|
| 503 |
return (
|
| 504 |
gr.update(value="", visible=False), # Clear feedback
|
| 505 |
+
gr.update(value=question), # question_display
|
| 506 |
gr.update(visible=False),
|
| 507 |
gr.update(visible=True, value=""),
|
| 508 |
gr.update(visible=True),
|
| 509 |
gr.update(visible=False),
|
| 510 |
+
gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
|
| 511 |
)
|
| 512 |
|
| 513 |
|
|
|
|
| 518 |
"Test yourself with questions from popular HuggingFace evaluation datasets!"
|
| 519 |
)
|
| 520 |
|
| 521 |
+
# Dataset Selection Section
|
| 522 |
+
with gr.Row():
|
| 523 |
+
dataset_dropdown = gr.Dropdown(
|
| 524 |
+
choices=[config["name"] for config in EVAL_DATASETS.values()],
|
| 525 |
+
label="Select Dataset",
|
| 526 |
+
value=list(EVAL_DATASETS.values())[0]["name"],
|
| 527 |
+
)
|
| 528 |
+
num_questions_slider = gr.Slider(
|
| 529 |
+
minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
|
| 530 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
|
| 532 |
+
start_button = gr.Button("Start Quiz", variant="primary")
|
| 533 |
+
status_message = gr.Textbox(label="Status", interactive=False)
|
| 534 |
+
|
| 535 |
+
# Quiz Section - shown when quiz starts
|
| 536 |
+
gr.Markdown("---") # Separator
|
| 537 |
+
|
| 538 |
+
progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False, visible=False)
|
| 539 |
+
question_display = gr.Textbox(label="Question", lines=5, interactive=False, visible=False)
|
| 540 |
+
|
| 541 |
+
# Answer inputs (one will be visible at a time)
|
| 542 |
+
answer_radio = gr.Radio(label="Select your answer", visible=False)
|
| 543 |
+
answer_textbox = gr.Textbox(label="Type your answer (Raw number)", visible=False)
|
| 544 |
+
|
| 545 |
+
submit_button = gr.Button("Submit Answer", variant="primary", visible=False)
|
| 546 |
+
|
| 547 |
+
feedback_display = gr.Textbox(
|
| 548 |
+
label="Feedback",
|
| 549 |
+
visible=False,
|
| 550 |
+
lines=10,
|
| 551 |
+
max_lines=20,
|
| 552 |
+
interactive=False
|
| 553 |
+
)
|
| 554 |
+
next_button = gr.Button("Next Question", visible=False)
|
| 555 |
|
| 556 |
# Connect events
|
| 557 |
start_button.click(
|