Spaces:

AtlaAI
/

selene

Paused

App Files Files Community

kaikaidai commited on Feb 19

Commit

782dcd8

verified ·

1 Parent(s): 941d431

Delete gen_api_answer.py

Browse files

Files changed (1) hide show

gen_api_answer.py +0 -484

gen_api_answer.py DELETED Viewed

@@ -1,484 +0,0 @@
-from openai import OpenAI
-import anthropic
-from together import Together
-import cohere
-import json
-import re
-import os
-import requests
-from prompts import (
-    JUDGE_SYSTEM_PROMPT,
-    PROMETHEUS_PROMPT,
-    PROMETHEUS_PROMPT_WITH_REFERENCE,
-    ATLA_PROMPT,
-    ATLA_PROMPT_WITH_REFERENCE,
-     FLOW_JUDGE_PROMPT
-)
-from transformers import AutoTokenizer
-# Initialize clients
-anthropic_client = anthropic.Anthropic()
-openai_client = OpenAI()
-together_client = Together()
-hf_api_key = os.getenv("HF_API_KEY")
-flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
-cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
-salesforce_api_key = os.getenv("SALESFORCE_API_KEY")
-def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
-    """Get response from OpenAI API"""
-    try:
-        response = openai_client.chat.completions.create(
-            model=model_name,
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt},
-            ],
-            max_completion_tokens=max_tokens,
-            temperature=temperature,
-        )
-        return response.choices[0].message.content
-    except Exception as e:
-        return f"Error with OpenAI model {model_name}: {str(e)}"
-def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
-    """Get response from Anthropic API"""
-    try:
-        response = anthropic_client.messages.create(
-            model=model_name,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            system=system_prompt,
-            messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
-        )
-        return response.content[0].text
-    except Exception as e:
-        return f"Error with Anthropic model {model_name}: {str(e)}"
-def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
-    """Get response from Together API"""
-    try:
-        response = together_client.chat.completions.create(
-            model=model_name,
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt},
-            ],
-            max_tokens=max_tokens,
-            temperature=temperature,
-            stream=False,
-        )
-        return response.choices[0].message.content
-    except Exception as e:
-        return f"Error with Together model {model_name}: {str(e)}"
-def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
-    """Get response from Hugging Face model"""
-    try:
-        headers = {
-            "Accept": "application/json",
-            "Authorization": f"Bearer {hf_api_key}",
-            "Content-Type": "application/json"
-        }
-        # Create messages list for chat template
-        messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": prompt})
-        # Apply chat template
-        model_id = "prometheus-eval/prometheus-7b-v2.0"
-        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
-        formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        payload = {
-            "inputs": formatted_prompt,
-            "parameters": {
-                "max_new_tokens": max_tokens,
-                "return_full_text": False,
-                "temperature": temperature
-            }
-        }
-        response = requests.post(
-            "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
-            headers=headers,
-            json=payload
-        )
-        return response.json()[0]["generated_text"]
-    except Exception as e:
-        return f"Error with Hugging Face model {model_name}: {str(e)}"
-def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
-    """Get response from HF endpoint for Atla model"""
-    try:
-        headers = {
-            "Accept": "application/json",
-            "Authorization": f"Bearer {hf_api_key}",
-            "Content-Type": "application/json"
-        }
-        # Create messages list for chat template
-        messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": prompt})
-        # Apply chat template
-        model_id = "AtlaAI/Selene-1-Mini-Llama-3.1-8B"
-        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
-        formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        payload = {
-            "inputs": formatted_prompt,
-            "parameters": {
-                "max_new_tokens": max_tokens,
-                "return_full_text": False,
-                "temperature": temperature,
-                "seed": 42,
-                "add_generation_prompt": True
-            }
-        }
-        response = requests.post(
-            "https://bkp9p28gri93egqh.us-east-1.aws.endpoints.huggingface.cloud",
-            headers=headers,
-            json=payload
-        )
-        return response.json()[0]["generated_text"]
-    except Exception as e:
-        return f"Error with Atla model {model_name}: {str(e)}"
-def get_flow_judge_response(model_name, prompt, max_tokens=2048, temperature=0.1, top_p=0.95) -> str:
-    """Get response from Flow Judge"""
-    try:
-        response = requests.post(
-            "https://arena.flow-ai.io/v1/chat/completions",
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {flow_judge_api_key}"
-            },
-            json={
-                "model": model_name,
-                "messages": [
-                    {"role": "user", "content": prompt}
-                ],
-                "max_tokens": max_tokens,
-                "temperature": temperature,
-                "top_p": top_p,
-                "stop": None
-            }
-        )
-        response.raise_for_status()
-        return response.json()["choices"][0]['message']['content']
-    except Exception as e:
-        return f"Error with Flow Judge completions model {model_name}: {str(e)}"
-def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
-    """Get response from Cohere API"""
-    try:
-        response = cohere_client.chat(
-            model=model_name,
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt}
-            ],
-            max_tokens=max_tokens,
-            temperature=temperature
-        )
-        # Extract the text from the content items
-        content_items = response.message.content
-        if isinstance(content_items, list):
-            # Get the text from the first content item
-            return content_items[0].text
-        return str(content_items)  # Fallback if it's not a list
-    except Exception as e:
-        return f"Error with Cohere model {model_name}: {str(e)}"
-def get_salesforce_response(model_name, prompt, system_prompt=None, max_tokens=2048, temperature=0):
-    """Get response from Salesforce Research API"""
-    try:
-        headers = {
-            'accept': 'application/json',
-            "content-type": "application/json",
-            "X-Api-Key": salesforce_api_key,
-        }
-        # Create messages list
-        messages = []
-        messages.append({"role": "user", "content": prompt})
-        json_data = {
-            "prompts": messages,
-            "temperature": temperature,
-            "top_p": 1,
-            "max_tokens": max_tokens,
-        }
-        response = requests.post(
-            'https://gateway.salesforceresearch.ai/sfr-judge/process',
-            headers=headers,
-            json=json_data
-        )
-        response.raise_for_status()
-        return response.json()['result'][0]
-    except Exception as e:
-        return f"Error with Salesforce model {model_name}: {str(e)}"
-def get_model_response(
-    model_name,
-    model_info,
-    prompt_data,
-    use_reference=False,
-    max_tokens=500,
-    temperature=0
-):
-    """Get response from appropriate API based on model organization"""
-    if not model_info:
-        return "Model not found or unsupported."
-    api_model = model_info["api_model"]
-    organization = model_info["organization"]
-    # Determine if model is Prometheus, Atla, Flow Judge, or Salesforce
-    is_prometheus = (organization == "Prometheus")
-    is_atla = (organization == "Atla")
-    is_flow_judge = (organization == "Flow AI")
-    is_salesforce = (organization == "Salesforce")
-    # For non-Prometheus/Atla/Flow Judge/Salesforce models, use the Judge system prompt
-    system_prompt = None if (is_prometheus or is_atla or is_flow_judge or is_salesforce) else JUDGE_SYSTEM_PROMPT
-    # Select the appropriate base prompt
-    if is_atla or is_salesforce:  # Use same prompt for Atla and Salesforce
-        base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
-    elif is_flow_judge:
-        base_prompt = FLOW_JUDGE_PROMPT
-    else:
-        base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
-    # For non-Prometheus/non-Atla/non-Salesforce models, use Prometheus but replace the output format with JSON
-    if not (is_prometheus or is_atla or is_flow_judge or is_salesforce):
-        base_prompt = base_prompt.replace(
-            '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
-            '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
-        )
-    try:
-        if not is_flow_judge:
-            # Format the prompt with the provided data
-            final_prompt = base_prompt.format(
-                human_input=prompt_data['human_input'],
-                ai_response=prompt_data['ai_response'],
-                ground_truth_input=prompt_data.get('ground_truth_input', ''),
-                eval_criteria=prompt_data['eval_criteria']
-            )
-        else:
-            human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
-            ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
-            ground_truth = prompt_data.get('ground_truth_input', '')
-            if ground_truth:
-                response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
-            else:
-                response_reference = ""
-            # For Flow Judge, parse the scoring rubric from eval_criteria
-            eval_criteria_lines = prompt_data['eval_criteria'].split('\n')
-            rubric_lines = [line for line in eval_criteria_lines if line.startswith('Score ')]
-            rubric = '\n'.join(f"- {line}" for line in rubric_lines)
-            if response_reference:
-                inputs = human_input + "\n" + response_reference
-            else:
-                inputs = human_input
-            final_prompt = base_prompt.format(
-                INPUTS=inputs,
-                OUTPUT=ai_response,
-                EVALUATION_CRITERIA=prompt_data['eval_criteria'],
-                RUBRIC=rubric
-            )
-    except KeyError as e:
-        return f"Error formatting prompt: Missing required field {str(e)}"
-    try:
-        if organization == "OpenAI":
-            return get_openai_response(
-                api_model, final_prompt, system_prompt, max_tokens, temperature
-            )
-        elif organization == "Anthropic":
-            return get_anthropic_response(
-                api_model, final_prompt, system_prompt, max_tokens, temperature
-            )
-        elif organization == "Prometheus":
-            return get_prometheus_response(
-                api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
-            )
-        elif organization == "Atla":
-            return get_atla_response(
-                api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
-            )
-        elif organization == "Cohere":
-            return get_cohere_response(
-                api_model, final_prompt, system_prompt, max_tokens, temperature
-            )
-        elif organization == "Flow AI":
-            return get_flow_judge_response(
-                api_model, final_prompt
-            )
-        elif organization == "Salesforce":
-            response = get_salesforce_response(
-                api_model, final_prompt, system_prompt, max_tokens, temperature
-            )
-            return response
-        else:
-            # All other organizations use Together API
-            return get_together_response(
-                api_model, final_prompt, system_prompt, max_tokens, temperature
-            )
-    except Exception as e:
-        return f"Error with {organization} model {model_name}: {str(e)}"
-def parse_model_response(response):
-    try:
-        # Debug print
-        print(f"Raw model response: {response}")
-        # If response is already a dictionary, use it directly
-        if isinstance(response, dict):
-            return str(response.get("result", "N/A")), response.get("feedback", "N/A")
-        # First try to parse the entire response as JSON
-        try:
-            data = json.loads(response)
-            return str(data.get("result", "N/A")), data.get("feedback", "N/A")
-        except json.JSONDecodeError:
-            # If that fails, check if this is a Salesforce response (which uses ATLA format)
-            if "**Reasoning:**" in response or "**Result:**" in response:
-                # Use ATLA parser for Salesforce responses
-                return atla_parse_model_response(response)
-            # Otherwise try to find JSON within the response
-            json_match = re.search(r"{.*}", response, re.DOTALL)
-            if json_match:
-                data = json.loads(json_match.group(0))
-                return str(data.get("result", "N/A")), data.get("feedback", "N/A")
-            else:
-                return "Error", f"Invalid response format returned - here is the raw model response: {response}"
-    except Exception as e:
-        # Debug print for error case
-        print(f"Failed to parse response: {str(e)}")
-        # If the error message itself contains valid JSON, try to parse that
-        try:
-            error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
-            if error_json_match:
-                data = json.loads(error_json_match.group(0))
-                return str(data.get("result", "N/A")), data.get("feedback", "N/A")
-        except:
-            pass
-        return "Error", f"Failed to parse response: {response}"
-def prometheus_parse_model_response(output):
-    try:
-        print(f"Raw model response: {output}")
-        output = output.strip()
-        # Remove "Feedback:" prefix if present (case insensitive)
-        output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
-        # New pattern to match [RESULT] X at the beginning
-        begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
-        begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
-        if begin_match:
-            score = int(begin_match.group(1))
-            feedback = begin_match.group(2).strip()
-            return str(score), feedback
-        # Existing patterns for end-of-string results...
-        pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
-        match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
-        if match:
-            feedback = match.group(1).strip()
-            score = int(match.group(2))
-            return str(score), feedback
-        # If no match, try to match "... Score: X"
-        pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
-        match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
-        if match:
-            feedback = match.group(1).strip()
-            score = int(match.group(2))
-            return str(score), feedback
-        # Pattern to handle [Score X] at the end
-        pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
-        match = re.search(pattern, output, re.DOTALL)
-        if match:
-            feedback = match.group(1).strip()
-            score = int(match.group(2))
-            return str(score), feedback
-        # Final fallback attempt
-        pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
-        match = re.search(pattern, output)
-        if match:
-            score = int(match.group(1))
-            feedback = output[:match.start()].rstrip()
-            # Remove any trailing brackets from feedback
-            feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
-            return str(score), feedback
-        return "Error", f"Failed to parse response: {output}"
-    except Exception as e:
-        print(f"Failed to parse response: {str(e)}")
-        return "Error", f"Exception during parsing: {str(e)}"
-def atla_parse_model_response(output):
-    """Parse response from ATLA model"""
-    try:
-        print(f"Raw Atla model response: {output}")
-        output = output.strip()
-        # Look for the Reasoning and Result sections
-        reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
-        result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
-        if reasoning_match and result_match:
-            feedback = reasoning_match.group(1).strip()
-            score = result_match.group(1)
-            return str(score), feedback
-        return "Error", f"Failed to parse ATLA response format: {output}"
-    except Exception as e:
-        print(f"Failed to parse ATLA response: {str(e)}")
-        return "Error", f"Exception during parsing: {str(e)}"
-def flow_judge_parse_model_response(output):
-    try:
-        print(f"Raw model response: {output}")
-        # Convert multiple line breaks to single ones and strip whitespace
-        output = re.sub(r'\n{2,}', '\n', output.strip())
-        # Compile regex patterns
-        feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
-        score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
-        feedback_match = feedback_pattern.search(output)
-        score_match = score_pattern.search(output)
-        if feedback_match or not score_match:
-            feedback = feedback_match.group(1).strip()
-            score = int(score_match.group(1).strip())
-            return str(score), feedback
-        return "Error", f"Failed to parse response: {output}"
-    except Exception as e:
-        print(f"Failed to parse response: {str(e)}")
-        return "Error", f"Exception during parsing: {str(e)}"