| import json | |
| import random | |
| import gradio as gr | |
| from difflib import SequenceMatcher | |
def _load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Blank lines are skipped so that a trailing newline or an accidental empty
    line does not make ``json.loads`` raise.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]


# One list of records per model. The functions in this file read the
# "prompt", "original", "output", "similarity_ratio" and "seed" keys of each
# record.
qwen_dict = _load_jsonl("qwen_gsm8k_output.jsonl")
phi4_dict = _load_jsonl("phi4_gsm8k_output.jsonl")

# Model name -> records displayed for that model.
models_data = {
    "microsoft/phi-4": phi4_dict,
    "Qwen/Qwen2.5-14B": qwen_dict,
}

# Model name -> count quoted in the description text.
# NOTE(review): hard-coded; presumably the number of GSM8K examples at or
# above the 0.9 similarity threshold for each model -- confirm against the
# data files.
models_no = {
    "microsoft/phi-4": 172,
    "Qwen/Qwen2.5-14B": 729,
}

# Record index and model shown before the user interacts with the page.
starting_index = 0
starting_model = next(iter(models_data))

# Markdown shown at the top of the page; filled in with ``model_name`` and
# ``number`` via ``str.format``.
description_template = """
This Space is inspired by [Louis Hunt's](https://www.linkedin.com/posts/louiswhunt_see-below-for-6882-pages-of-mmlu-and-gsm8k-activity-7281011488692047872-fWCE?utm_source=share&utm_medium=member_desktop) post.
He highlights how current top performing models from major vendors are contaminated with benchmark data that is supposed to be used to assess their performance.
This space aims to partially reproduce this work.
I chose to look at the contamination of **Qwen/Qwen2.5-14B** and **microsoft/phi-4** by **GSM8K** dataset.
For **{model_name}**, I found **{number}** GSM8K examples that had at least a 0.9 text similarity ratio between generated and original.
"""
def find_similar_chunks(original, output):
    """Split ``output`` into chunks flagged by whether they also occur in ``original``.

    The two strings are aligned with ``difflib.SequenceMatcher``. The return
    value is a list of ``(text, label)`` tuples that, concatenated in order,
    reproduce ``output``: ``label`` is ``1`` for a chunk that matches part of
    ``original`` and ``None`` for a chunk that does not. An empty ``output``
    yields an empty list.
    """
    matcher = SequenceMatcher(None, original, output)
    highlighted_sequence = []
    cursor = 0  # end of the part of ``output`` already emitted
    for _, start, size in matcher.get_matching_blocks():
        # Text between the previous match and this one is unmatched.
        if cursor < start:
            highlighted_sequence.append((output[cursor:start], None))
        # get_matching_blocks() always ends with a zero-length sentinel at
        # len(output): the gap check above flushes any unmatched tail, and
        # skipping empty blocks avoids emitting an empty highlighted chunk.
        if size:
            highlighted_sequence.append((output[start:start + size], 1))
        cursor = start + size
    return highlighted_sequence
def next_example(selected_model):
    """Return the display fields of a randomly chosen record for ``selected_model``.

    The result is a list ordered as: prompt, original text, highlighted output
    chunks (from ``find_similar_chunks``), similarity ratio, seed.
    """
    record = random.choice(models_data[selected_model])
    reference_text = record["original"]
    chunks = find_similar_chunks(reference_text, record["output"])
    return [
        record["prompt"],
        reference_text,
        chunks,
        record["similarity_ratio"],
        record["seed"],
    ]
def change_model(selected_model):
    """Return the display fields for the starting record of ``selected_model``.

    The result is a list ordered as: prompt, original text, highlighted output
    chunks (from ``find_similar_chunks``), similarity ratio, seed, and the
    description text formatted for the selected model.
    """
    record = models_data[selected_model][starting_index]
    chunks = find_similar_chunks(record["original"], record["output"])
    fields = [
        record["prompt"],
        record["original"],
        chunks,
        record["similarity_ratio"],
        record["seed"],
    ]
    # The description quotes the model name and its count from models_no.
    fields.append(
        description_template.format(
            model_name=selected_model,
            number=models_no[selected_model],
        )
    )
    return fields
# Record displayed before the user interacts; looked up once and reused for
# the initial value of every component below.
_initial_example = models_data[starting_model][starting_index]

with gr.Blocks() as demo:
    # Row 1: description text next to an intentionally empty column.
    with gr.Row():
        with gr.Column(scale=1):
            description_text = gr.Markdown(
                description_template.format(
                    model_name=starting_model,
                    number=models_no[starting_model],
                )
            )
        with gr.Column(scale=1):
            pass

    # Row 2: model selector and the prompt of the current example.
    with gr.Row():
        with gr.Column(scale=1):
            selected_model = gr.Dropdown(
                list(models_data),
                value=starting_model,
                interactive=True,
                label="Model",
            )
        with gr.Column(scale=4):
            prompt = gr.Textbox(
                label="Prompt",
                interactive=False,
                value=_initial_example["prompt"],
            )

    # Row 3: original text next to the output with matching chunks marked.
    with gr.Row():
        with gr.Column(scale=4):
            original = gr.Textbox(
                label="Original",
                interactive=False,
                value=_initial_example["original"],
            )
        with gr.Column(scale=4):
            # NOTE(review): find_similar_chunks labels matches with the int 1
            # while color_map is keyed by the string "1" -- confirm Gradio
            # applies this mapping as intended.
            output = gr.HighlightedText(
                label="Output",
                color_map={"1": "yellow"},
                value=find_similar_chunks(
                    _initial_example["original"],
                    _initial_example["output"],
                ),
            )

    # Row 4: similarity ratio and seed of the current example.
    with gr.Row():
        with gr.Column(scale=1):
            similarity = gr.Textbox(
                label="Similarity ratio",
                interactive=False,
                value=_initial_example["similarity_ratio"],
            )
        with gr.Column(scale=1):
            seed = gr.Textbox(
                label="Seed",
                interactive=False,
                value=_initial_example["seed"],
            )

    next_btn = gr.Button("Another example")

    # The button swaps in a random example for the selected model.
    next_btn.click(
        fn=next_example,
        inputs=[selected_model],
        outputs=[prompt, original, output, similarity, seed],
    )
    # Changing the model shows its starting example and refreshes the
    # description text.
    selected_model.change(
        fn=change_model,
        inputs=[selected_model],
        outputs=[prompt, original, output, similarity, seed, description_text],
    )

demo.launch()