Spaces:

acmc
/

whatsapp-chats-finetuning-formatter

Running

App Files Files Community

acmc committed on Apr 22, 2024

Commit

bf9e30f

1 Parent(s): 7e73556

Adapting to GCP

Browse files

Files changed (3) hide show

app.py +107 -32
utils.py +79 -27
validation.py +10 -10

app.py CHANGED Viewed

@@ -10,7 +10,6 @@ from utils import (
 )
 from validation import (
     check_format_errors,
-    check_token_counts,
     estimate_cost,
     get_distributions,
 )
@@ -22,44 +21,79 @@ def convert_to_dataset(files, do_spelling_correction, progress):
     for file in progress.tqdm(files, desc="Processing files"):
         if modified_dataset is None:
             # First file
-            modified_dataset = process_chat_file(file, do_spelling_correction=do_spelling_correction)
         else:
             # Concatenate the datasets
-            this_file_dataset = process_chat_file(file, do_spelling_correction=do_spelling_correction)
             modified_dataset = datasets.concatenate_datasets(
                 [modified_dataset, this_file_dataset]
             )
     return modified_dataset
-def file_upload_callback(files, system_prompt, do_spelling_correction, validation_split, progress=gr.Progress()):
     print(f"Processing {files}")
     full_system_prompt = f"""You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
 # Task
-A participant can send multiple messages in a row, delimited by '\"', in the following schema:
-{{string}}[]. Your answer always needs to be JSON compliant. Always start your answer with [\"
 # Information about me
 You should use the following information about me to answer:
-{system_prompt}
-# Example
-[{{\"role\":\"user\",\"content\":\"[\"Hello!\",\"How are you?\"]\"}},{{\"role\":\"assistant\",\"content\":\"[\"Hi!\",\"I'm doing great.\",\"What about you?\"]\"}},{{\"role\":\"user\",\"content\":\"[\"I'm doing well.\",\"Have you been travelling?\"]\"}}]
-Response:
-[{{\"role\":\"assistant\",\"content\":\"[\"Yes, I've been to many places.\",\"I love travelling.\"]\"}}]"""
-    # Avoid using the full system prompt for now, as it is too long and increases the cost of the training
-    full_system_prompt = system_prompt
-    dataset = convert_to_dataset(files=files, progress=progress, do_spelling_correction=do_spelling_correction)
     training_examples_ds = transform_conversations_dataset_into_training_examples(
-        conversations_ds=dataset, system_prompt=full_system_prompt
     )
     # Split into training and validation datasets (80% and 20%)
-    training_examples_ds = training_examples_ds.train_test_split(test_size=validation_split, seed=42)
-    training_examples_ds, validation_examples_ds = training_examples_ds["train"], training_examples_ds["test"]
-    format_errors = check_format_errors(training_examples_ds)
-    distributions = get_distributions(training_examples_ds)
-    cost_stats = estimate_cost(training_examples_ds)
     stats = {
         "Format Errors": format_errors,
@@ -76,8 +110,7 @@ Response:
     fig_num_assistant_tokens_per_example_plot = plt.figure()
     num_assistant_tokens_per_example_plot = plt.hist(
-        distributions["assistant_message_lens"],
-        bins=20
     )
     # The DownloadFile component requires a path to the file, it can't accept a buffer to keep the file in memory.
@@ -99,7 +132,7 @@ Response:
         stats,
         fig_num_messages_distribution_plot,
         fig_num_total_tokens_per_example_plot,
-        fig_num_assistant_tokens_per_example_plot
     )
@@ -151,6 +184,24 @@ with gr.Blocks(theme=theme) as demo:
         value="""Aldan is an AI researcher who loves to play around with AI systems, travelling and learning new things.""",
     )
     do_spelling_correction = gr.Checkbox(
         label="Do Spelling Correction (English)",
         info="Check this box if you want to perform spelling correction on the chat messages before generating the training examples.",
@@ -168,23 +219,41 @@ with gr.Blocks(theme=theme) as demo:
     submit = gr.Button(value="Submit", variant="primary")
-    output_file = gr.DownloadButton(label="Download Generated Training Examples", visible=False, variant="primary")
-    output_file_validation = gr.DownloadButton(label="Download Generated Validation Examples", visible=False, variant="secondary")
     # output_example = gr.JSON(label="Example Training Example")
     with gr.Group():
         # Statistics about the dataset
         gr.Markdown("## Statistics")
         written_stats = gr.JSON()
-        num_messages_distribution_plot = gr.Plot(label="Number of Messages Distribution")
-        num_total_tokens_per_example_plot = gr.Plot(label="Total Number of Tokens per Example")
         num_assistant_tokens_per_example_plot = gr.Plot(
             label="Number of Assistant Tokens per Example"
         )
     submit.click(
         file_upload_callback,
-        inputs=[input_files, system_prompt, do_spelling_correction, validation_split],
         outputs=[
             output_file,
             output_file,
@@ -194,11 +263,17 @@ with gr.Blocks(theme=theme) as demo:
             num_messages_distribution_plot,
             num_total_tokens_per_example_plot,
             num_assistant_tokens_per_example_plot,
-        ]
     )
-    output_file.click(remove_file_and_hide_button, inputs=[output_file], outputs=[output_file])
-    output_file_validation.click(remove_file_and_hide_button, inputs=[output_file_validation], outputs=[output_file_validation])
 if __name__ == "__main__":
     demo.launch()

 )
 from validation import (
     check_format_errors,
     estimate_cost,
     get_distributions,
 )
     for file in progress.tqdm(files, desc="Processing files"):
         if modified_dataset is None:
             # First file
+            modified_dataset = process_chat_file(
+                file, do_spelling_correction=do_spelling_correction
+            )
         else:
             # Concatenate the datasets
+            this_file_dataset = process_chat_file(
+                file, do_spelling_correction=do_spelling_correction
+            )
             modified_dataset = datasets.concatenate_datasets(
                 [modified_dataset, this_file_dataset]
             )
     return modified_dataset
+def file_upload_callback(
+    files,
+    system_prompt,
+    do_spelling_correction,
+    validation_split,
+    user_role,
+    model_role,
+    whatsapp_name,
+    progress=gr.Progress(),
+):
     print(f"Processing {files}")
     full_system_prompt = f"""You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
 # Task
+The {model_role} and the {user_role} can send multiple messages in a row, as a JSON list of strings. Your answer always needs to be JSON compliant. The strings are delimited by double quotes ("). The strings are separated by a comma (,). The list is delimited by square brackets ([, ]). Always start your answer with [", and close it with "]. Do not write anything else in your answer after "].
 # Information about me
 You should use the following information about me to answer:
+{system_prompt}"""
+    # Example
+    # [{{\"role\":\"user\",\"content\":\"[\"Hello!\",\"How are you?\"]\"}},{{\"role\":\"assistant\",\"content\":\"[\"Hi!\",\"I'm doing great.\",\"What about you?\"]\"}},{{\"role\":\"user\",\"content\":\"[\"I'm doing well.\",\"Have you been travelling?\"]\"}}]
+    # Response:
+    # [{{\"role\":\"assistant\",\"content\":\"[\"Yes, I've been to many places.\",\"I love travelling.\"]\"}}]"""
+    # # Avoid using the full system prompt for now, as it is too long and increases the cost of the training
+    # full_system_prompt = system_prompt
+    dataset = convert_to_dataset(
+        files=files, progress=progress, do_spelling_correction=do_spelling_correction
+    )
     training_examples_ds = transform_conversations_dataset_into_training_examples(
+        conversations_ds=dataset,
+        system_prompt=full_system_prompt,
+        user_role=user_role,
+        model_role=model_role,
+        whatsapp_name=whatsapp_name,
     )
     # Split into training and validation datasets (validation fraction set by validation_split)
+    training_examples_ds = training_examples_ds.train_test_split(
+        test_size=validation_split, seed=42
+    )
+    training_examples_ds, validation_examples_ds = (
+        training_examples_ds["train"],
+        training_examples_ds["test"],
+    )
+    training_examples_ds = training_examples_ds#.select(
+    #    range(min(250, len(training_examples_ds)))
+    #)
+    validation_examples_ds = validation_examples_ds.select(
+        range(min(200, len(validation_examples_ds)))
+    )
+    format_errors = check_format_errors(
+        training_examples_ds, user_role=user_role, model_role=model_role
+    )
+    distributions = get_distributions(
+        training_examples_ds, user_role=user_role, model_role=model_role
+    )
+    cost_stats = estimate_cost(
+        training_examples_ds, user_role=user_role, model_role=model_role
+    )
     stats = {
         "Format Errors": format_errors,
     fig_num_assistant_tokens_per_example_plot = plt.figure()
     num_assistant_tokens_per_example_plot = plt.hist(
+        distributions["assistant_message_lens"], bins=20
     )
     # The DownloadFile component requires a path to the file, it can't accept a buffer to keep the file in memory.
         stats,
         fig_num_messages_distribution_plot,
         fig_num_total_tokens_per_example_plot,
+        fig_num_assistant_tokens_per_example_plot,
     )
         value="""Aldan is an AI researcher who loves to play around with AI systems, travelling and learning new things.""",
     )
+    whatsapp_name = gr.Textbox(
+        label="Your WhatsApp Name",
+        placeholder="Your WhatsApp Name",
+        info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
+    )
+    user_role = gr.Textbox(
+        label="Role for User",
+        info="This is a technical parameter. If you don't know what to write, just type 'user'.",
+        value="user",
+    )
+    model_role = gr.Textbox(
+        label="Role for Model",
+        info="This is a technical parameter. If you don't know what to write, just type 'model'.",
+        value="model",
+    )
     do_spelling_correction = gr.Checkbox(
         label="Do Spelling Correction (English)",
         info="Check this box if you want to perform spelling correction on the chat messages before generating the training examples.",
     submit = gr.Button(value="Submit", variant="primary")
+    output_file = gr.DownloadButton(
+        label="Download Generated Training Examples", visible=False, variant="primary"
+    )
+    output_file_validation = gr.DownloadButton(
+        label="Download Generated Validation Examples",
+        visible=False,
+        variant="secondary",
+    )
     # output_example = gr.JSON(label="Example Training Example")
     with gr.Group():
         # Statistics about the dataset
         gr.Markdown("## Statistics")
         written_stats = gr.JSON()
+        num_messages_distribution_plot = gr.Plot(
+            label="Number of Messages Distribution"
+        )
+        num_total_tokens_per_example_plot = gr.Plot(
+            label="Total Number of Tokens per Example"
+        )
         num_assistant_tokens_per_example_plot = gr.Plot(
             label="Number of Assistant Tokens per Example"
         )
     submit.click(
         file_upload_callback,
+        inputs=[
+            input_files,
+            system_prompt,
+            do_spelling_correction,
+            validation_split,
+            user_role,
+            model_role,
+            whatsapp_name,
+        ],
         outputs=[
             output_file,
             output_file,
             num_messages_distribution_plot,
             num_total_tokens_per_example_plot,
             num_assistant_tokens_per_example_plot,
+        ],
     )
+    output_file.click(
+        remove_file_and_hide_button, inputs=[output_file], outputs=[output_file]
+    )
+    output_file_validation.click(
+        remove_file_and_hide_button,
+        inputs=[output_file_validation],
+        outputs=[output_file_validation],
+    )
 if __name__ == "__main__":
     demo.launch()

utils.py CHANGED Viewed

@@ -35,8 +35,9 @@ def process_line(example):
 # %%
 # Now, create message groups ('conversations')
 # The idea is to group messages that are close in time
-# We'll use a 240 minute threshold
-MINUTES_THRESHOLD = 240
 def group_messages(messages_iterable):
@@ -67,8 +68,9 @@ def printable_conversation(conversation):
 import spacy
 import contextualSpellCheck
 from spellchecker import SpellChecker
 spell = SpellChecker()
-#nlp = spacy.load("es_core_news_sm")
 nlp = spacy.load("en_core_web_sm")
@@ -262,8 +264,10 @@ def process_chat_file(file, do_spelling_correction, do_reordering=False):
     # Generate the dataset
     conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
-    # Filter out conversations with less than 10 messages
-    conversations_ds = conversations_ds.filter(lambda x: len(x["conversations"]) >= 10)
     conversations_ds_without_whatsapp_annotations = conversations_ds.map(
         remove_whatapp_annotations,
@@ -296,8 +300,12 @@ def process_chat_file(file, do_spelling_correction, do_reordering=False):
     return changed_contact_name_ds
 def transform_conversations_dataset_into_training_examples(
-    conversations_ds, system_prompt
 ):
     """
     Takes in a dataset with conversations and returns a dataset with training examples.
@@ -317,26 +325,70 @@ def transform_conversations_dataset_into_training_examples(
     ```
     """
-    def process_one_example(example):
-        messages = [{"role": "system", "content": [system_prompt]}]
-        for msg in example["conversations"]:
-            converted_role = "assistant" if msg["contact_name"] == "Aldi" else "user"
-            if converted_role == messages[-1]["role"]:
-                messages[-1]["content"] += [msg["message"]]
-            else:
-                messages.append({"role": converted_role, "content": [msg["message"]]})
-        return {
-            "messages": [
-                {
-                    "role": m["role"],
-                    "content": json.dumps(m["content"], ensure_ascii=False),
-                }
-                for m in messages
-            ]
-        }
-    return conversations_ds.map(
-        process_one_example,
         remove_columns=["conversations"],
-        num_proc=os.cpu_count() - 1,
     )

 # %%
 # Now, create message groups ('conversations')
 # The idea is to group messages that are close in time
+# We'll use a 180 minute threshold
+MINUTES_THRESHOLD = 180
+MIN_MESSAGES_THRESHOLD = 5
 def group_messages(messages_iterable):
 import spacy
 import contextualSpellCheck
 from spellchecker import SpellChecker
 spell = SpellChecker()
+# nlp = spacy.load("es_core_news_sm")
 nlp = spacy.load("en_core_web_sm")
     # Generate the dataset
     conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
+    # Filter out conversations with fewer than 5 messages
+    conversations_ds = conversations_ds.filter(
+        lambda x: len(x["conversations"]) >= MIN_MESSAGES_THRESHOLD
+    )
     conversations_ds_without_whatsapp_annotations = conversations_ds.map(
         remove_whatapp_annotations,
     return changed_contact_name_ds
+SPLIT_CONVERSATION_THRESHOLD = 40
+MAX_CHARACTERS_PER_MESSAGE = 10000  # Max is 8,192 tokens (https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-about#sample-datasets)
 def transform_conversations_dataset_into_training_examples(
+    conversations_ds, system_prompt, user_role, model_role, whatsapp_name
 ):
     """
     Takes in a dataset with conversations and returns a dataset with training examples.
     ```
     """
+    def process_examples(examples):
+        processed_examples = []
+        for conversation in examples["conversations"]:
+            messages = [{"role": "system", "content": [system_prompt]}]
+            counter = 0
+            for msg in conversation:
+                converted_role = (
+                    model_role if msg["contact_name"] == whatsapp_name else user_role
+                )
+                if (
+                    counter > SPLIT_CONVERSATION_THRESHOLD
+                    and converted_role == user_role
+                ):
+                    processed_examples.append(
+                        {
+                            "messages": [
+                                {
+                                    "role": m["role"],
+                                    "content": json.dumps(
+                                        m["content"], ensure_ascii=False
+                                    ),
+                                }
+                                for m in messages
+                            ]
+                        }
+                    )
+                    messages = [{"role": "system", "content": [system_prompt]}]
+                    counter = 0
+                if converted_role == messages[-1]["role"]:
+                    messages[-1]["content"] += [msg["message"]]
+                else:
+                    messages.append(
+                        {"role": converted_role, "content": [msg["message"]]}
+                    )
+                counter += 1
+            if len(messages) >= MIN_MESSAGES_THRESHOLD:
+                processed_examples.append(
+                    {
+                        "messages": [
+                            {
+                                "role": m["role"],
+                                "content": json.dumps(m["content"], ensure_ascii=False),
+                            }
+                            for m in messages
+                        ]
+                    }
+                )
+        # Before returning, flatten the list of dictionaries into a dictionary of lists
+        flattened_examples = {}
+        for key in processed_examples[0].keys():
+            flattened_examples[key] = [d[key] for d in processed_examples]
+        return flattened_examples
+    processed_examples = conversations_ds.map(
+        process_examples,
         remove_columns=["conversations"],
+        # num_proc=os.cpu_count() - 1,
+        batched=True,
+    )
+    examples_filtered_by_length = processed_examples.filter(
+        lambda x: all(
+            [len(m["content"]) < MAX_CHARACTERS_PER_MESSAGE for m in x["messages"]]
+        )
     )
+    return examples_filtered_by_length

validation.py CHANGED Viewed

@@ -3,7 +3,7 @@ from collections import defaultdict
 import tiktoken
-def check_format_errors(train_dataset):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
@@ -27,7 +27,7 @@ def check_format_errors(train_dataset):
             if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                 format_errors["message_unrecognized_key"] += 1
-            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                 format_errors["unrecognized_role"] += 1
             content = message.get("content", None)
@@ -36,7 +36,7 @@ def check_format_errors(train_dataset):
             if (not content and not function_call) or not isinstance(content, str):
                 format_errors["missing_content"] += 1
-        if not any(message.get("role", None) == "assistant" for message in messages):
             format_errors["example_missing_assistant_message"] += 1
     if format_errors:
@@ -48,7 +48,7 @@ def check_format_errors(train_dataset):
     return format_errors if format_errors else {}
-def get_distributions(train_dataset):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
@@ -72,7 +72,7 @@ def get_distributions(train_dataset):
     def num_assistant_tokens_from_messages(messages):
         num_tokens = 0
         for message in messages:
-            if message["role"] == "assistant":
                 num_tokens += len(encoding.encode(message["content"]))
         return num_tokens
@@ -87,7 +87,7 @@ def get_distributions(train_dataset):
         messages = ex["messages"]
         if not any(message["role"] == "system" for message in messages):
             n_missing_system += 1
-        if not any(message["role"] == "user" for message in messages):
             n_missing_user += 1
         n_messages.append(len(messages))
         convo_lens.append(num_tokens_from_messages(messages))
@@ -102,7 +102,7 @@ def get_distributions(train_dataset):
     }
-def check_token_counts(train_dataset):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
@@ -115,7 +115,7 @@ def check_token_counts(train_dataset):
     # Warnings and tokens counts
-    distributions = get_distributions(train_dataset)
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]
@@ -135,11 +135,11 @@ def check_token_counts(train_dataset):
     return
-def estimate_cost(train_dataset):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
-    distributions = get_distributions(train_dataset)
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]

 import tiktoken
+def check_format_errors(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
             if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                 format_errors["message_unrecognized_key"] += 1
+            if message.get("role", None) not in ["system", user_role, model_role]:
                 format_errors["unrecognized_role"] += 1
             content = message.get("content", None)
             if (not content and not function_call) or not isinstance(content, str):
                 format_errors["missing_content"] += 1
+        if not any(message.get("role", None) == model_role for message in messages):
             format_errors["example_missing_assistant_message"] += 1
     if format_errors:
     return format_errors if format_errors else {}
+def get_distributions(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     def num_assistant_tokens_from_messages(messages):
         num_tokens = 0
         for message in messages:
+            if message["role"] == model_role:
                 num_tokens += len(encoding.encode(message["content"]))
         return num_tokens
         messages = ex["messages"]
         if not any(message["role"] == "system" for message in messages):
             n_missing_system += 1
+        if not any(message["role"] == user_role for message in messages):
             n_missing_user += 1
         n_messages.append(len(messages))
         convo_lens.append(num_tokens_from_messages(messages))
     }
+def check_token_counts(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
     # Warnings and tokens counts
+    distributions = get_distributions(train_dataset, user_role=user_role, model_role=model_role)
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]
     return
+def estimate_cost(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
+    distributions = get_distributions(train_dataset, user_role=user_role, model_role=model_role)
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]