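"""Gradio app for evaluating a user-written prompt on a sentence-unshuffling task.

The app loads a train or test JSONL split containing shuffled and original tokenized
sentences, fills the prompt template (placeholder: {% shuffled_sentence %}) for each item,
queries gpt-4o-mini in parallel batches, and scores every response against the original
tokenization with evaluate_response, streaming one result row at a time.
"""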
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import jsonlines
from openai import OpenAI
from dotenv import load_dotenv

from evaluation_utils import evaluate_response

def get_split():
    load_dotenv()
    split = os.getenv("SPLIT")
    if split == "train":
        return "evaluation on development set"
    elif split == "test":
        return "evaluation on test set"

# Utility function to chunk a list into batches
def chunk_list(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]
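# For example, chunk_list yields consecutive slices of at most chunk_size items, with the
# last batch possibly shorter: list(chunk_list([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]].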

# Function to send an individual request to the OpenAI API
def send_request(client, prompt, index):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        seed=42,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=1024,
    )
    return index, response.choices[0].message.content
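
# send_request is synchronous and returns (index, completion_text), so callers that fan
# requests out across threads can write each result back to its original slot. A minimal
# standalone sketch (the prompt string is illustrative; assumes OPENAI_API_KEY is set):
#
#     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
#     idx, text = send_request(client, "Reorder the tokens into a sentence: barked dog the", 0)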

def evaluate_prompt(prompt: str, num_samples: int = None, split: str = None, batch_size: int = 5, progress=gr.Progress()):
    progress(0, desc="Starting...")
    load_dotenv()
    if num_samples is None:
        num_samples = int(os.getenv("NUM_SAMPLES"))
    if split is None:
        split = os.getenv("SPLIT")
    assert split in ["train", "test"]

    # Define the path to the {split}.jsonl file
    test_file_path = Path(__file__).parent / f"{split}.jsonl"

    # Load the data from the jsonl file
    test_data = []
    with jsonlines.open(test_file_path) as reader:
        for item in reader:
            test_data.append(item)

    # Skip items whose shuffled text contains an apostrophe or a "β" character
    test_data = [item for item in test_data if "'" not in item["shuffled_tokenized"] and "β" not in item["shuffled_tokenized"]]

    # Limit to the first num_samples items for faster evaluation
    test_data = test_data[:num_samples]

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Pre-allocate one slot per item so responses can be stored in their original order
    responses = [None] * len(test_data)
    instantiated_prompts = []

    # Create and process batches
    for batch_data in chunk_list(test_data, batch_size):
        # Prepare the prompts for this batch by filling in the template placeholder
        batch_prompts = [
            prompt.replace("{% shuffled_sentence %}", test_item["shuffled_tokenized"])
            for test_item in batch_data
        ]
        instantiated_prompts.extend(batch_prompts)

        # Send this batch's requests in parallel; each future is keyed by the item's
        # global index so results land in the right slot of `responses`
        with ThreadPoolExecutor() as executor:
            batch_start = len(instantiated_prompts) - len(batch_prompts)
            futures = {
                executor.submit(send_request, client, item_prompt, i): i
                for i, item_prompt in enumerate(batch_prompts, start=batch_start)
            }
            for future in as_completed(futures):
                index = futures[future]  # Known even if the request raised
                try:
                    _, response = future.result()
                    responses[index] = response  # Store the response at the correct index
                except Exception as e:
                    print(f"Request failed: {e}")
                    responses[index] = "Error: Request failed"

        # Update progress after each batch
        progress(len(instantiated_prompts) / len(test_data), desc="Processing batches...")

    # Evaluate responses
    scores = []
    for test_item, instantiated_prompt, response in zip(test_data, instantiated_prompts, responses):
        score = evaluate_response(test_item["original_tokenized"], response)
        scores.append(score)
        yield (test_item["original_sentence"], instantiated_prompt, response, score)
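

# The Space's actual UI definition is not shown in this listing. Below is a minimal,
# illustrative sketch of how evaluate_prompt could be wired into a Gradio Blocks app;
# the component names and layout are assumptions, not the original interface. Because
# evaluate_prompt is a generator, the wrapper accumulates the yielded rows so the results
# table fills in as each batch completes.
def run_evaluation(prompt, progress=gr.Progress()):
    rows = []
    for original, instantiated, response, score in evaluate_prompt(prompt, progress=progress):
        rows.append([original, instantiated, response, score])
        yield rows


with gr.Blocks() as demo:
    gr.Markdown(f"Prompt {get_split()}")
    prompt_box = gr.Textbox(label="Prompt template (use {% shuffled_sentence %} as the placeholder)")
    results = gr.Dataframe(headers=["original sentence", "instantiated prompt", "response", "score"])
    gr.Button("Evaluate").click(run_evaluation, inputs=prompt_box, outputs=results)

demo.launch()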