Spaces:

Narsil
/

eval_playground

Sleeping

App Files Files Community

Narsil committed on Jul 23

Commit

5e4987c

unverified ·

1 Parent(s): e2152af

Cleaning up UI.

Browse files

Files changed (1) hide show

app.py +125 -104

app.py CHANGED Viewed

@@ -199,27 +199,31 @@ class QuizApp:
     def format_answer(self, answer: str, dataset_name: str) -> str:
         """Format answer based on dataset type for better readability"""
         if dataset_name == "openai/gsm8k":
-            # GSM8K has specific formatting with equations and final answer
-            # Replace <<...>> with proper math formatting
-            import re
-            # Convert <<equation>> to LaTeX
-            answer = re.sub(r"<<([^>]+)>>", r"$\\1$", answer)
             # Format the final answer line
-            answer = answer.replace("####", "\n\n**Final Answer:**")
-            # Ensure proper line breaks
-            answer = answer.replace(". ", ".\n")
-            return answer
-        elif dataset_name == "cais/mmlu":
-            # MMLU answers are usually single letters or short phrases
-            return answer
-        elif dataset_name == "rajpurkar/squad":
-            # SQuAD answers might need context
-            return answer
-        else:
-            # Default formatting for other datasets
-            return answer
     def check_answer(self, user_answer: str) -> Tuple[bool, str]:
         """Check if the user's answer is correct"""
@@ -242,7 +246,7 @@ class QuizApp:
             is_correct = user_letter == correct_letter
             if is_correct:
-                return True, "✅ **Correct!**"
             else:
                 choices = question_data[config["choices_field"]]
                 correct_choice = (
@@ -256,7 +260,7 @@ class QuizApp:
                 )
                 return (
                     False,
-                    f"❌ **Incorrect**\n\nThe correct answer was **{correct_letter}**:\n\n{formatted_answer}",
                 )
         elif question_type == "true_false":
@@ -265,11 +269,11 @@ class QuizApp:
             is_correct = user_bool == correct_answer
             if is_correct:
-                return True, "✅ **Correct!**"
             else:
                 return (
                     False,
-                    f"❌ **Incorrect**\n\nThe correct answer was **{correct_answer}**",
                 )
         elif question_type == "binary_choice":
@@ -278,7 +282,7 @@ class QuizApp:
             is_correct = user_idx == correct_answer_idx
             if is_correct:
-                return True, "✅ **Correct!**"
             else:
                 correct_letter = "A" if correct_answer_idx == 0 else "B"
                 option_field = (
@@ -293,7 +297,7 @@ class QuizApp:
                 )
                 return (
                     False,
-                    f"❌ **Incorrect**\n\nThe correct answer was **{correct_letter}**:\n\n{formatted_answer}",
                 )
         elif question_type in ["qa", "extractive_qa"]:
@@ -325,36 +329,44 @@ class QuizApp:
             else:
                 final_answer = correct_answer
-            # Extract numbers from both answers for comparison
-            correct_numbers = re.findall(r"-?\d+\.?\d*", final_answer)
-            user_numbers = re.findall(r"-?\d+\.?\d*", user_answer)
-            # Check if answers match
-            is_correct = False
-            # If both have numbers, compare the numbers
-            if correct_numbers and user_numbers:
-                # Convert to float for comparison to handle decimals
-                try:
-                    correct_num = float(
-                        correct_numbers[-1]
-                    )  # Take the last number as final answer
-                    user_num = float(user_numbers[-1])  # Take the last number from user
-                    is_correct = (
-                        abs(correct_num - user_num) < 0.0001
-                    )  # Small tolerance for float comparison
-                except ValueError:
-                    # Fall back to string comparison
-                    is_correct = correct_numbers[-1] == user_numbers[-1]
             else:
-                # Fall back to substring matching for non-numeric answers
-                is_correct = (
-                    user_answer.lower().strip() in correct_answer.lower()
-                    or correct_answer.lower() in user_answer.lower().strip()
-                )
             if is_correct:
-                return True, "✅ **Correct!**"
             else:
                 logging.info(f"Raw answer (QA): {repr(correct_answer)}")
                 logging.info(f"Extracted final answer: {repr(final_answer)}")
@@ -364,9 +376,11 @@ class QuizApp:
                 formatted_answer = self.format_answer(
                     correct_answer, self.current_dataset_name
                 )
                 return (
                     False,
-                    f"❌ **Incorrect**\n\n**The correct answer was:**\n\n{formatted_answer}",
                 )
         return False, "Unknown question type"
@@ -399,11 +413,11 @@ def start_quiz(dataset_choice: str, num_questions: int):
     if not dataset_id:
         return (
             "Please select a dataset",
-            "",
-            "",
-            gr.update(visible=False),
-            gr.update(visible=False),
-            "0/0",
         )
     success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
@@ -414,29 +428,29 @@ def start_quiz(dataset_choice: str, num_questions: int):
         if q_type in ["multiple_choice", "true_false", "binary_choice"]:
             return (
                 message,
-                question,
-                gr.update(choices=choices, visible=True, value=None),
-                gr.update(visible=False),
-                gr.update(visible=True),
-                f"Question 1/{quiz_app.total_questions}",
             )
         else:
             return (
                 message,
-                question,
-                gr.update(visible=False),
-                gr.update(visible=True, value=""),
-                gr.update(visible=True),
-                f"Question 1/{quiz_app.total_questions}",
             )
     else:
         return (
             message,
-            "",
-            gr.update(visible=False),
-            gr.update(visible=False),
-            gr.update(visible=False),
-            "0/0",
         )
@@ -462,7 +476,7 @@ def next_question():
     if quiz_app.current_question_idx >= quiz_app.total_questions:
         # Quiz complete
-        final_score = f"## 🎉 Quiz Complete!\n\n**Your score:** {quiz_app.score}/{quiz_app.total_questions} ({quiz_app.score / quiz_app.total_questions * 100:.1f}%)"
         return (
             gr.update(value=final_score, visible=True),
             "",
@@ -478,22 +492,22 @@ def next_question():
     if q_type in ["multiple_choice", "true_false", "binary_choice"]:
         return (
             gr.update(value="", visible=False),  # Clear feedback
-            question,
             gr.update(choices=choices, visible=True, value=None),
             gr.update(visible=False),
             gr.update(visible=True),
             gr.update(visible=False),
-            f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}",
         )
     else:
         return (
             gr.update(value="", visible=False),  # Clear feedback
-            question,
             gr.update(visible=False),
             gr.update(visible=True, value=""),
             gr.update(visible=True),
             gr.update(visible=False),
-            f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}",
         )
@@ -504,33 +518,40 @@ with gr.Blocks(title="HuggingFace Evaluation Dataset Quiz") as demo:
         "Test yourself with questions from popular HuggingFace evaluation datasets!"
     )
-    with gr.Tabs():
-        with gr.Tab("Dataset Selection"):
-            with gr.Row():
-                dataset_dropdown = gr.Dropdown(
-                    choices=[config["name"] for config in EVAL_DATASETS.values()],
-                    label="Select Dataset",
-                    value=list(EVAL_DATASETS.values())[0]["name"],
-                )
-                num_questions_slider = gr.Slider(
-                    minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
-                )
-            start_button = gr.Button("Start Quiz", variant="primary")
-            status_message = gr.Textbox(label="Status", interactive=False)
-        with gr.Tab("Quiz"):
-            progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False)
-            question_display = gr.Textbox(label="Question", lines=5, interactive=False)
-            # Answer inputs (one will be visible at a time)
-            answer_radio = gr.Radio(label="Select your answer", visible=False)
-            answer_textbox = gr.Textbox(label="Type your answer", visible=False)
-            submit_button = gr.Button("Submit Answer", variant="primary", visible=False)
-            feedback_display = gr.Markdown(label="Feedback", visible=True)
-            next_button = gr.Button("Next Question", visible=False)
     # Connect events
     start_button.click(

     def format_answer(self, answer: str, dataset_name: str) -> str:
         """Format answer based on dataset type for better readability"""
+        import re
+        # Convert <<equation>> to show the math clearly
+        # Extract the equation and its result, show just the result with equation in parentheses
+        def format_equation(match):
+            equation = match.group(1)
+            # Check if it's in format "calculation=result"
+            if '=' in equation:
+                parts = equation.split('=')
+                if len(parts) == 2:
+                    calculation, result = parts[0], parts[1]
+                    return f"{result} (={calculation})"
+            return f"[{equation}]"
+        answer = re.sub(r"<<([^>]+)>>", format_equation, answer)
+        # Dataset-specific formatting
         if dataset_name == "openai/gsm8k":
             # Format the final answer line
+            answer = answer.replace("####", "\n\nFinal Answer:")
+            # Ensure proper line breaks after periods for readability
+            answer = re.sub(r'\. (?=[A-Z])', '.\n', answer)
+        return answer
     def check_answer(self, user_answer: str) -> Tuple[bool, str]:
         """Check if the user's answer is correct"""
             is_correct = user_letter == correct_letter
             if is_correct:
+                return True, '✅ Correct!'
             else:
                 choices = question_data[config["choices_field"]]
                 correct_choice = (
                 )
                 return (
                     False,
+                    f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
                 )
         elif question_type == "true_false":
             is_correct = user_bool == correct_answer
             if is_correct:
+                return True, '✅ Correct!'
             else:
                 return (
                     False,
+                    f'❌ Incorrect\n\nThe correct answer was {correct_answer}',
                 )
         elif question_type == "binary_choice":
             is_correct = user_idx == correct_answer_idx
             if is_correct:
+                return True, '✅ Correct!'
             else:
                 correct_letter = "A" if correct_answer_idx == 0 else "B"
                 option_field = (
                 )
                 return (
                     False,
+                    f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
                 )
         elif question_type in ["qa", "extractive_qa"]:
             else:
                 final_answer = correct_answer
+            # First check if user answer is empty
+            if not user_answer or not user_answer.strip():
+                is_correct = False
             else:
+                # Extract numbers from both answers for comparison
+                correct_numbers = re.findall(r"-?\d+\.?\d*", final_answer)
+                user_numbers = re.findall(r"-?\d+\.?\d*", user_answer)
+                # Check if answers match
+                is_correct = False
+                # If both have numbers, compare the numbers
+                if correct_numbers and user_numbers:
+                    # Convert to float for comparison to handle decimals
+                    try:
+                        correct_num = float(
+                            correct_numbers[-1]
+                        )  # Take the last number as final answer
+                        user_num = float(user_numbers[-1])  # Take the last number from user
+                        is_correct = (
+                            abs(correct_num - user_num) < 0.0001
+                        )  # Small tolerance for float comparison
+                    except ValueError:
+                        # Fall back to string comparison
+                        is_correct = correct_numbers[-1] == user_numbers[-1]
+                elif correct_numbers and not user_numbers:
+                    # If correct answer has numbers but user answer doesn't, it's wrong
+                    is_correct = False
+                else:
+                    # Fall back to substring matching for non-numeric answers
+                    # But ensure both strings are non-empty
+                    is_correct = (
+                        user_answer.lower().strip() in correct_answer.lower()
+                        or correct_answer.lower() in user_answer.lower().strip()
+                    ) and len(user_answer.strip()) > 0
             if is_correct:
+                return True, '✅ Correct!'
             else:
                 logging.info(f"Raw answer (QA): {repr(correct_answer)}")
                 logging.info(f"Extracted final answer: {repr(final_answer)}")
                 formatted_answer = self.format_answer(
                     correct_answer, self.current_dataset_name
                 )
+                # Debug: log the formatted answer
+                logging.info(f"Formatted answer with LaTeX: {repr(formatted_answer)}")
                 return (
                     False,
+                    f'❌ Incorrect\n\nThe correct answer was:\n\n{formatted_answer}',
                 )
         return False, "Unknown question type"
     if not dataset_id:
         return (
             "Please select a dataset",
+            gr.update(visible=False),  # question_display
+            gr.update(visible=False),  # answer_radio
+            gr.update(visible=False),  # answer_textbox
+            gr.update(visible=False),  # submit_button
+            gr.update(visible=False),  # progress_text
         )
     success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
         if q_type in ["multiple_choice", "true_false", "binary_choice"]:
             return (
                 message,
+                gr.update(value=question, visible=True),  # question_display
+                gr.update(choices=choices, visible=True, value=None),  # answer_radio
+                gr.update(visible=False),  # answer_textbox
+                gr.update(visible=True),  # submit_button
+                gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True),  # progress_text
             )
         else:
             return (
                 message,
+                gr.update(value=question, visible=True),  # question_display
+                gr.update(visible=False),  # answer_radio
+                gr.update(visible=True, value=""),  # answer_textbox
+                gr.update(visible=True),  # submit_button
+                gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True),  # progress_text
             )
     else:
         return (
             message,
+            gr.update(visible=False),  # question_display
+            gr.update(visible=False),  # answer_radio
+            gr.update(visible=False),  # answer_textbox
+            gr.update(visible=False),  # submit_button
+            gr.update(visible=False),  # progress_text
         )
     if quiz_app.current_question_idx >= quiz_app.total_questions:
         # Quiz complete
+        final_score = f'🎉 Quiz Complete!\n\nYour score: {quiz_app.score}/{quiz_app.total_questions} ({quiz_app.score / quiz_app.total_questions * 100:.1f}%)'
         return (
             gr.update(value=final_score, visible=True),
             "",
     if q_type in ["multiple_choice", "true_false", "binary_choice"]:
         return (
             gr.update(value="", visible=False),  # Clear feedback
+            gr.update(value=question),  # question_display
             gr.update(choices=choices, visible=True, value=None),
             gr.update(visible=False),
             gr.update(visible=True),
             gr.update(visible=False),
+            gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
         )
     else:
         return (
             gr.update(value="", visible=False),  # Clear feedback
+            gr.update(value=question),  # question_display
             gr.update(visible=False),
             gr.update(visible=True, value=""),
             gr.update(visible=True),
             gr.update(visible=False),
+            gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
         )
         "Test yourself with questions from popular HuggingFace evaluation datasets!"
     )
+    # Dataset Selection Section
+    with gr.Row():
+        dataset_dropdown = gr.Dropdown(
+            choices=[config["name"] for config in EVAL_DATASETS.values()],
+            label="Select Dataset",
+            value=list(EVAL_DATASETS.values())[0]["name"],
+        )
+        num_questions_slider = gr.Slider(
+            minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
+        )
+    start_button = gr.Button("Start Quiz", variant="primary")
+    status_message = gr.Textbox(label="Status", interactive=False)
+    # Quiz Section - shown when quiz starts
+    gr.Markdown("---")  # Separator
+    progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False, visible=False)
+    question_display = gr.Textbox(label="Question", lines=5, interactive=False, visible=False)
+    # Answer inputs (one will be visible at a time)
+    answer_radio = gr.Radio(label="Select your answer", visible=False)
+    answer_textbox = gr.Textbox(label="Type your answer (Raw number)", visible=False)
+    submit_button = gr.Button("Submit Answer", variant="primary", visible=False)
+    feedback_display = gr.Textbox(
+        label="Feedback",
+        visible=False,
+        lines=10,
+        max_lines=20,
+        interactive=False
+    )
+    next_button = gr.Button("Next Question", visible=False)
     # Connect events
     start_button.click(