Initial commit
- app.py +82 -0
- app_utils.py +97 -0
- evaluation_utils.py +69 -0
- requirements.txt +5 -0
- test.jsonl +0 -0
- train.jsonl +0 -0
app.py
ADDED
@@ -0,0 +1,82 @@
+import gradio as gr
+
+from app_utils import evaluate_prompt, get_split
+
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+
+with gr.Blocks(title=f"Prompting Challenge ({get_split()})") as demo:
+    gr.Markdown(
+        f"""
+        # Prompting Challenge
+        ### ({get_split()})
+        """ + """
+        The goal of this challenge is to prompt GPT-4 to "unscramble" a sentence.
+
+        The input is a sentence with scrambled word order, e.g.: *"are How ? you"*
+
+        GPT-4 should identify the original sentence, e.g.: *"How are you?"*
+
+        Enter your prompt template here. Use `{% shuffled_sentence %}` at the place where you want the shuffled sentence to be inserted.
+        """
+    )
+
+    input_text = gr.Textbox(
+        lines=10,
+        label="Input Text",
+        value="Unscramble the following sentence: {% shuffled_sentence %}"
+    )
+    submit_button = gr.Button("Submit")
+    results_output = gr.HTML(label="Results")
+
+    def update_results(prompt):
+        result_tuples = list(evaluate_prompt(prompt))
+        if result_tuples:
+            total_score = sum(item_score for _, _, _, item_score in result_tuples)
+            score = total_score / len(result_tuples)
+        else:
+            score = 0
+        html_output = "<dl style='font-family: Arial, sans-serif;'>"
+        html_output += f"<h2 style='color: #333; margin-top: 20px; margin-bottom: 20px;'>Accuracy: {100 * score:.1f}%</h2>"
+        newline = '\n'
+        for index, (original, prompt, response, item_score) in enumerate(result_tuples, 1):
+            background_color = "#fff4ea" if item_score < 0.5 else "#e4ffe4" if item_score > 0.9 else "whitesmoke"
+            html_output += f"""
+            <div style='background-color: {background_color}; padding: 10px; margin-bottom: 20px;'>
+                <h3 style='color: #333; margin-top: 0;'>Test item #{index}</h3>
+                <dt style='padding: 5px;'>
+                    <span style='font-weight: 600;'>Original Sentence:</span>
+                </dt>
+                <dd style='margin-left: 20px; padding: 5px;'>{original.replace(newline, "<br>")}</dd>
+
+                <dt style='padding: 5px;'>
+                    <span style='font-weight: 600;'>Prompt:</span>
+                </dt>
+                <dd style='margin-left: 20px; padding: 5px;'>{prompt.replace(newline, "<br>")}</dd>
+
+                <dt style='padding: 5px;'>
+                    <span style='font-weight: 600;'>Response by GPT-4:</span>
+                </dt>
+                <dd style='margin-left: 20px; padding: 5px;font-style: italic;'>{response.replace(newline, "<br>")}</dd>
+                <dt style='padding: 5px;'>
+                    <span style='font-weight: 600;'>Score:</span>
+                </dt>
+                <dd style='margin-left: 20px; padding: 5px;'>
+                    <span style='color: #333;'>{100 * item_score:.1f}%</span>
+                </dd>
+            </div>
+            """
+        html_output += "</dl>"
+        return html_output
+
+    submit_button.click(
+        fn=update_results,
+        inputs=[input_text],
+        outputs=[results_output]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
+    # demo.launch(share=True)
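Note: the "{% shuffled_sentence %}" placeholder is not Jinja syntax; evaluate_prompt in app_utils.py (below) fills it in with a plain string replacement. A minimal sketch of that substitution, using the example sentence from the instructions above:

    # Illustration only; this mirrors the str.replace call in app_utils.evaluate_prompt.
    template = "Unscramble the following sentence: {% shuffled_sentence %}"
    instantiated = template.replace("{% shuffled_sentence %}", "are How ? you")
    # instantiated == "Unscramble the following sentence: are How ? you"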
app_utils.py
ADDED
@@ -0,0 +1,97 @@
+import os
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import gradio as gr
+import jsonlines
+from openai import OpenAI
+from dotenv import load_dotenv
+from evaluation_utils import evaluate_response
+
+
+def get_split():
+    load_dotenv()
+    split = os.getenv("SPLIT")
+    if split == "train":
+        return "evaluation on development set"
+    elif split == "test":
+        return "evaluation on test set"
+
+
+# Utility function to chunk a list into batches
+def chunk_list(data, chunk_size):
+    for i in range(0, len(data), chunk_size):
+        yield data[i:i + chunk_size]
+
+# Function to send an individual request to the OpenAI API
+def send_request(client, prompt, index):
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        temperature=0,
+        seed=42,
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt},
+        ],
+        max_tokens=1024,
+    )
+    return index, response.choices[0].message.content
+
+def evaluate_prompt(prompt: str, num_samples: int = None, split: str = None, batch_size: int = 5, progress=gr.Progress()):
+    progress(0, desc="Starting...")
+    load_dotenv()
+
+    if num_samples is None:
+        num_samples = int(os.getenv("NUM_SAMPLES"))
+
+    if split is None:
+        split = os.getenv("SPLIT")
+    assert split in ["train", "test"]
+
+    # Define the path to the data file for the selected split
+    test_file_path = Path(__file__).parent / "out" / f"{split}.jsonl"
+
+    # Load the data from the jsonl file
+    test_data = []
+    with jsonlines.open(test_file_path) as reader:
+        for item in reader:
+            test_data.append(item)
+
+    # Limit to first num_samples items for faster evaluation
+    test_data = test_data[:num_samples]
+
+    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+
+    responses = [None] * num_samples  # Pre-allocate a list to store responses in order
+    instantiated_prompts = []
+
+    # Create and process batches
+    for batch_data in chunk_list(test_data, batch_size):
+        # Prepare the prompts for this batch
+        batch_prompts = [
+            prompt.replace("{% shuffled_sentence %}", test_item["shuffled_tokenized"])
+            for test_item in batch_data
+        ]
+        instantiated_prompts.extend(batch_prompts)
+
+        # Send requests in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor() as executor:
+            futures = {executor.submit(send_request, client, item_prompt, i): i for i, item_prompt in enumerate(batch_prompts, start=len(instantiated_prompts) - len(batch_prompts))}
+
+            for future in as_completed(futures):
+                try:
+                    index, response = future.result()
+                    responses[index] = response  # Store the response at the correct index
+                except Exception as e:
+                    print(f"Request failed: {e}")
+                    responses[futures[future]] = "Error: Request failed"
+
+        # Update progress after each batch
+        progress(len(instantiated_prompts) / len(test_data), desc="Processing batches...")
+
+    # Evaluate responses
+    scores = []
+    for test_item, instantiated_prompt, response in zip(test_data, instantiated_prompts, responses):
+        score = evaluate_response(test_item["original_tokenized"], response)
+        scores.append(score)
+        yield (test_item["original_sentence"], instantiated_prompt, response, score)
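Note: evaluate_prompt is a generator that yields one (original_sentence, instantiated_prompt, response, score) tuple per test item, and it reads OPENAI_API_KEY, SPLIT and NUM_SAMPLES from the environment (via python-dotenv) when the corresponding arguments are left as None. A minimal consumption sketch, assuming those variables are set and the data file for the chosen split exists; the template text and sample count here are arbitrary choices:

    from app_utils import evaluate_prompt

    template = "Unscramble the following sentence: {% shuffled_sentence %}"
    for original, prompt, response, score in evaluate_prompt(template, num_samples=3, split="train"):
        print(f"{score:.2f}  {original}  ->  {response}")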
evaluation_utils.py
ADDED
@@ -0,0 +1,69 @@
+
+import spacy
+from fast_sentence_tokenize import tokenize_text
+
+
+def evaluate_response(original_tokenized: str, response: str) -> float:
+    """
+    - Tokenize the response string
+    - Create a list of response tokens
+    - Assign every original token a rank:
+        - Only look at the last mention of a token in the response
+        - Rank the tokens by how early they appear in the response (last mention only)
+    - Calculate ranking accuracy
+
+    Returns a value between 0 and 1
+    """
+    original_tokenized = original_tokenized.strip().lower()
+    response = response.strip().lower()
+
+    # Tokenize the response string
+    response_tokens = tokenize_text(response)
+
+    # Create a list of original tokens
+    original_tokens = original_tokenized.split()
+
+    # Check that every original token appears in the response
+    response_token_ranks = {}
+    for token in original_tokens:
+        if token not in response_tokens:
+            return 0  # If any original token is missing from the response, return 0 immediately
+
+    # Create ranks for original tokens
+    original_token_ranks = {}
+    for i, token in enumerate(original_tokens):
+        original_token_ranks[token] = i
+
+    # Create ranks for response tokens
+    for token in original_tokens:
+        # Assign index of last occurrence of token in response
+        response_token_ranks[token] = len(response_tokens) - 1 - response_tokens[::-1].index(token)
+
+    # Normalize the response token ranks
+    sorted_ranks = sorted(set(response_token_ranks.values()))
+    rank_mapping = {old_rank: new_rank for new_rank, old_rank in enumerate(sorted_ranks)}
+    for token, rank in response_token_ranks.items():
+        response_token_ranks[token] = rank_mapping[rank]
+
+    # Calculate Kendall's tau
+    n = len(original_tokens)
+    concordant_pairs = 0
+    discordant_pairs = 0
+
+    for i in range(n):
+        for j in range(i + 1, n):
+            original_diff = original_token_ranks[original_tokens[i]] - original_token_ranks[original_tokens[j]]
+            response_diff = response_token_ranks[original_tokens[i]] - response_token_ranks[original_tokens[j]]
+
+            if original_diff * response_diff > 0:
+                concordant_pairs += 1
+            elif original_diff * response_diff < 0:
+                discordant_pairs += 1
+
+    total_pairs = n * (n - 1) // 2
+    kendall_tau = (concordant_pairs - discordant_pairs) / total_pairs
+
+    # Normalize Kendall's tau to be between 0 and 1
+    normalized_kendall_tau = (kendall_tau + 1) / 2
+
+    return normalized_kendall_tau
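Note: the score is a Kendall's-tau-style rank correlation, rescaled to [0, 1], between the order of the original tokens and the order of their last occurrences in the response; any missing original token forces a score of 0. A rough feel for the behaviour (illustrative calls; the sentences are invented and the expected values assume tokenize_text splits on whitespace):

    from evaluation_utils import evaluate_response

    evaluate_response("how are you", "I think it is: how are you")   # 1.0 -- original order preserved
    evaluate_response("how are you", "you are how")                  # 0.0 -- order fully reversed
    evaluate_response("how are you", "hello there")                  # 0   -- an original token is missing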
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+spacy
+jsonlines
+openai
+python-dotenv
+fast-sentence-tokenize
test.jsonl
ADDED
The diff for this file is too large to render.
train.jsonl
ADDED
The diff for this file is too large to render.
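The two data files are not rendered above. From the keys accessed in app_utils.py, each line of train.jsonl / test.jsonl is presumably a JSON object with at least original_sentence, original_tokenized and shuffled_tokenized fields; a hypothetical record (field values invented, only the key names come from the code):

    {
        "original_sentence": "How are you?",
        "original_tokenized": "How are you ?",
        "shuffled_tokenized": "are How ? you"
    }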