Merge branch 'feat/add-Flow-Judge-v0.1' into pr/8
- app.py +11 -1
- data/models.jsonl +2 -1
- gen_api_answer.py +99 -18
- prompts.py +54 -0
app.py
CHANGED
@@ -14,7 +14,8 @@ from gen_api_answer import (
     get_model_response,
     parse_model_response,
     prometheus_parse_model_response,
-    atla_parse_model_response
+    atla_parse_model_response,
+    flow_judge_parse_model_response,
 )
 
 from random_sample_generation import (
@@ -750,6 +751,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
         is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
         is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
+
+        is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
+        is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
 
         if is_prometheus_a:
             score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
@@ -757,6 +761,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         elif is_atla_a:
             score_a_val, critique_a_val = atla_parse_model_response(response_a)
             score_a_val = f"{score_a_val} / 5"
+        elif is_flow_judge_a:
+            score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
+            score_a_val = f"{score_a_val} / 5"
         else:
             score_a_val, critique_a_val = parse_model_response(response_a)
             score_a_val = f"{score_a_val} / 5"
@@ -767,6 +774,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         elif is_atla_b:
             score_b_val, critique_b_val = atla_parse_model_response(response_b)
             score_b_val = f"{score_b_val} / 5"
+        elif is_flow_judge_b:
+            score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
+            score_b_val = f"{score_b_val} / 5"
         else:
             score_b_val, critique_b_val = parse_model_response(response_b)
             score_b_val = f"{score_b_val} / 5"
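The app.py change adds a third parse branch for each of model A and model B. As a hedged aside, the repeated branching could also be written as a small dispatch table; the sketch below is illustrative only, and the names ORG_PARSERS and parse_by_org are assumptions, not code from the repository.

# Illustrative sketch only, not part of this diff: the per-organization parse
# branches added above (Prometheus, Atla, Flow AI, with a JSON default) could
# be expressed as a dispatch table. ORG_PARSERS and parse_by_org are assumed
# names for this example.
from gen_api_answer import (
    parse_model_response,
    prometheus_parse_model_response,
    atla_parse_model_response,
    flow_judge_parse_model_response,
)

ORG_PARSERS = {
    "Prometheus": prometheus_parse_model_response,
    "Atla": atla_parse_model_response,
    "Flow AI": flow_judge_parse_model_response,
}

def parse_by_org(organization, response):
    """Return (score, critique) using the parser registered for the organization."""
    parser = ORG_PARSERS.get(organization, parse_model_response)
    return parser(response)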
data/models.jsonl
CHANGED
@@ -21,4 +21,5 @@
 {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus", "active": true}
 {"name": "Atla-8B-preview-2024-01-08", "organization": "Atla", "license": "Open Source", "api_model": "Atla-8B-preview-2024-01-08", "active": true}
 {"name": "Meta Llama 3.3 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "active": true}
-{"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
+{"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
+{"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2"}
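data/models.jsonl holds one JSON object per line. The sketch below shows one way such a file can be loaded and filtered; it is not repository code, and the .get("active", False) default is an assumption, since the new Flow-Judge-v0.1 row, unlike the others, carries no "active" field.

# Illustrative sketch, not part of this diff: loading data/models.jsonl.
import json

with open("data/models.jsonl") as f:
    models = [json.loads(line) for line in f if line.strip()]

# Rows from the new Flow AI organization.
flow_judge_models = [m for m in models if m["organization"] == "Flow AI"]

# The Flow-Judge-v0.1 row omits "active", so a default is needed when filtering.
active_models = [m for m in models if m.get("active", False)]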
gen_api_answer.py
CHANGED
@@ -12,6 +12,7 @@ from prompts import (
     PROMETHEUS_PROMPT_WITH_REFERENCE,
     ATLA_PROMPT,
     ATLA_PROMPT_WITH_REFERENCE,
+    FLOW_JUDGE_PROMPT
 )
 
 # Initialize clients
@@ -22,6 +23,8 @@ hf_api_key = os.getenv("HF_API_KEY")
 cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
 
 
+flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
+
 
 def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
     """Get response from OpenAI API"""
@@ -145,6 +148,30 @@ def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, m
         return str(content_items)  # Fallback if it's not a list
     except Exception as e:
         return f"Error with Cohere model {model_name}: {str(e)}"
+
+def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
+    """Get response from Flow Judge"""
+    try:
+        response = requests.post(
+            "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {flow_judge_api_key}"
+            },
+            json={
+                "model": model_name,
+                "messages": [
+                    {"role": "user", "content": prompt}
+                ],
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+                "top_p": top_p
+            }
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]['message']['content']
+    except Exception as e:
+        return f"Error with Flow Judge completions model {model_name}: {str(e)}"
 
 def get_model_response(
     model_name,
@@ -164,38 +191,64 @@ def get_model_response(
     # Determine if model is Prometheus or Atla
     is_prometheus = (organization == "Prometheus")
     is_atla = (organization == "Atla")
-
+    is_flow_judge = (organization == "Flow AI")
     # For non-Prometheus/Atla models, use the Judge system prompt
-    system_prompt = None if (is_prometheus or is_atla) else JUDGE_SYSTEM_PROMPT
+    system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
 
     # Select the appropriate base prompt
     if is_atla:
         base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
-    elif use_reference:
-        base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE
+    elif is_flow_judge:
+        base_prompt = FLOW_JUDGE_PROMPT
     else:
-        base_prompt = PROMETHEUS_PROMPT
+        base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
 
     # For non-Prometheus/non-Atla models, replace the specific instruction
-    if not (is_prometheus or is_atla):
+    if not (is_prometheus or is_atla or is_flow_judge):
         base_prompt = base_prompt.replace(
             '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
             '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
         )
 
     try:
-        # Format the prompt with the provided data, only using available keys
-        final_prompt = base_prompt.format(
-            human_input=prompt_data['human_input'],
-            ai_response=prompt_data['ai_response'],
-            ground_truth_input=prompt_data.get('ground_truth_input', ''),
-            eval_criteria=prompt_data['eval_criteria'],
-            score1_desc=prompt_data['score1_desc'],
-            score2_desc=prompt_data['score2_desc'],
-            score3_desc=prompt_data['score3_desc'],
-            score4_desc=prompt_data['score4_desc'],
-            score5_desc=prompt_data['score5_desc']
-        )
+        if not is_flow_judge:
+            # Format the prompt with the provided data, only using available keys
+            final_prompt = base_prompt.format(
+                human_input=prompt_data['human_input'],
+                ai_response=prompt_data['ai_response'],
+                ground_truth_input=prompt_data.get('ground_truth_input', ''),
+                eval_criteria=prompt_data['eval_criteria'],
+                score1_desc=prompt_data['score1_desc'],
+                score2_desc=prompt_data['score2_desc'],
+                score3_desc=prompt_data['score3_desc'],
+                score4_desc=prompt_data['score4_desc'],
+                score5_desc=prompt_data['score5_desc']
+            )
+        else:
+            human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
+            ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
+            ground_truth = prompt_data.get('ground_truth_input', '')
+            if ground_truth:
+                response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
+            else:
+                response_reference = ""
+            eval_criteria = prompt_data['eval_criteria']
+            score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
+            score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
+            score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
+            score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
+            score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
+            rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
+            if response_reference:
+                inputs = human_input + "\n" + response_reference
+            else:
+                inputs = human_input
+            final_prompt = base_prompt.format(
+                INPUTS=inputs,
+                OUTPUT=ai_response,
+                EVALUATION_CRITERIA=eval_criteria,
+                RUBRIC=rubric
+            )
     except KeyError as e:
         return f"Error formatting prompt: Missing required field {str(e)}"
 
@@ -220,6 +273,10 @@ def get_model_response(
         return get_cohere_response(
             api_model, final_prompt, system_prompt, max_tokens, temperature
         )
+    elif organization == "Flow AI":
+        return get_flow_judge_response(
+            api_model, final_prompt, max_tokens, temperature
+        )
     else:
         # All other organizations use Together API
         return get_together_response(
@@ -306,7 +363,31 @@ def prometheus_parse_model_response(output):
     except Exception as e:
         print(f"Failed to parse response: {str(e)}")
        return "Error", f"Exception during parsing: {str(e)}"
+
+def flow_judge_parse_model_response(output):
+    try:
+        print(f"Raw model response: {output}")
+        # Convert multiple line breaks to single ones and strip whitespace
+        output = re.sub(r'\n{2,}', '\n', output.strip())
+
+        # Compile regex patterns
+        feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
+        score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
+
+        feedback_match = feedback_pattern.search(output)
+        score_match = score_pattern.search(output)
 
+        if feedback_match and score_match:
+            feedback = feedback_match.group(1).strip()
+            score = int(score_match.group(1).strip())
+            return str(score), feedback
+
+        return "Error", f"Failed to parse response: {output}"
+
+    except Exception as e:
+        print(f"Failed to parse response: {str(e)}")
+        return "Error", f"Exception during parsing: {str(e)}"
+
 def atla_parse_model_response(output):
     """Parse response from ATLA model"""
     try:
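For reference, here is a hedged usage sketch of the new parser with an invented model output shaped like the <feedback>/<score> format the Flow Judge prompt requests; it assumes gen_api_answer is importable from the project root and is not part of the diff.

# Illustrative sketch, not part of this diff; the sample output text is invented.
from gen_api_answer import flow_judge_parse_model_response

sample_output = """<feedback>
The response answers the question directly and stays grounded in the given input.
</feedback>
<score>
4
</score>"""

score, critique = flow_judge_parse_model_response(sample_output)
print(score)     # "4"
print(critique)  # the text between the <feedback> tags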
prompts.py
CHANGED
@@ -90,6 +90,60 @@ Score 5: {score5_desc}
 ###Feedback:
 """
 
+# Define the Flow Judge prompt
+FLOW_JUDGE_PROMPT = """# GOAL
+Your job is to evaluate a task carried out by an AI system powered by a large \
+language model.
+
+You will be provided with the inputs and output of the task, as well as the evaluation criteria \
+and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
+criteria and scoring rubric provided.
+
+# INPUT
+Below are the inputs required for performing the task:
+<inputs>
+{INPUTS}
+</inputs>
+
+# OUTPUT
+Below is the output of the task:
+<output>
+{OUTPUT}
+</output>
+
+# EVALUATION CRITERIA AND SCORING RUBRIC
+Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
+<evaluation_criteria>
+{EVALUATION_CRITERIA}
+</evaluation_criteria>
+
+<scoring_rubric>
+{RUBRIC}
+</scoring_rubric>
+
+# INSTRUCTIONS FOR THE EVALUATION
+1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
+Review the evaluation criteria and scoring rubric to understand the different levels of \
+performance and the descriptions for each score.
+2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
+generated from completing the task.
+3. Compare output to score descriptions: Compare the output against the criteria and score \
+descriptions in the scoring rubric. For each criterion, decide which description best matches the \
+output.
+4. After comparing the output to the score descriptions, pay attention to the small details that \
+might impact the final score that you assign. Sometimes a small difference can dictate the final \
+score.
+5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
+to specific aspects of the output and comparing them to the rubric.
+6. Assign a final score based on the scoring rubric.
+
+## FORMAT FOR THE EVALUATION
+- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
+- Write the numeric score inside <score> tags, without any additional surrounding text and always \
+after the feedback.
+
+Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
+
 # Judge system prompt for non-Prometheus models
 JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
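To show how the template's placeholders line up with the formatting code added in gen_api_answer.py, here is a hedged sketch that fills FLOW_JUDGE_PROMPT with invented values; the input, criteria, and rubric strings are placeholders, not repository data.

# Illustrative sketch, not part of this diff: filling the Flow Judge template.
from prompts import FLOW_JUDGE_PROMPT

inputs = "<user_input>\nSummarise the article in two sentences.\n</user_input>"
output = "<response>\nThe article argues that judge models need calibrated rubrics.\n</response>"
criteria = "Does the summary capture the main claims without adding new ones?"
rubric = (
    "- Score 1: The summary is unrelated to the article.\n"
    "- Score 2: The summary captures few of the main claims.\n"
    "- Score 3: The summary captures some claims but adds unsupported ones.\n"
    "- Score 4: The summary captures most claims with minor omissions.\n"
    "- Score 5: The summary captures all main claims faithfully."
)

final_prompt = FLOW_JUDGE_PROMPT.format(
    INPUTS=inputs,
    OUTPUT=output,
    EVALUATION_CRITERIA=criteria,
    RUBRIC=rubric,
)
print(final_prompt[:300])  # preview the rendered prompt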