ACMCMC committed on
Commit
4aad28a
·
1 Parent(s): e79532a
Files changed (2) hide show
  1. app.py +32 -14
  2. utils.py +33 -42
app.py CHANGED
@@ -11,7 +11,7 @@ import matplotlib.pyplot as plt
11
  from utils import (
12
  process_chat_file,
13
  transform_conversations_dataset_into_training_examples,
14
- convert_gpt_to_gemini_format, # Add this import
15
  )
16
  from validation import check_format_errors, estimate_cost, get_distributions
17
 
@@ -71,6 +71,7 @@ def file_upload_callback(
71
  user_role,
72
  model_role,
73
  whatsapp_name,
 
74
  datetime_dayfirst,
75
  message_line_format,
76
  minutes_threshold,
@@ -79,7 +80,6 @@ def file_upload_callback(
79
  split_conversation_threshold,
80
  progress=gr.Progress(),
81
  ):
82
- output_format = "GPT"
83
  logger.info(f"Processing {files}")
84
  full_system_prompt = f"""# Task
85
  You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
@@ -131,6 +131,30 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
131
  f"Total number of generated examples: {total_number_of_generated_examples}"
132
  )
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  # Split into training and validation datasets (80% and 20%)
135
  try:
136
  split_examples_ds = full_examples_ds.train_test_split(
@@ -194,18 +218,12 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
194
  training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
195
  validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
196
 
197
- # Extract the gemini_format column and save as JSON files
198
- training_gemini_list = training_examples_gemini["gemini_format"]
199
- validation_gemini_list = validation_examples_gemini["gemini_format"]
200
-
201
- # Save as JSON files with Gemini format
202
- file_path = f"training_examples_gemini_{uuid}.json"
203
- with open(file_path, 'w', encoding='utf-8') as f:
204
- json.dump(training_gemini_list, f, ensure_ascii=False, indent=2)
205
 
206
- file_path_validation = f"validation_examples_gemini_{uuid}.json"
207
- with open(file_path_validation, 'w', encoding='utf-8') as f:
208
- json.dump(validation_gemini_list, f, ensure_ascii=False, indent=2)
209
  else:
210
  # Original GPT format - JSONL
211
  file_path = f"training_examples_{uuid}.jsonl"
@@ -413,7 +431,7 @@ with gr.Blocks(theme=theme) as demo:
413
  user_role,
414
  model_role,
415
  whatsapp_name,
416
- # output_format,
417
  datetime_dayfirst,
418
  message_line_format,
419
  minutes_threshold,
 
11
  from utils import (
12
  process_chat_file,
13
  transform_conversations_dataset_into_training_examples,
14
+ convert_gpt_to_gemini_format, # Use the fixed version
15
  )
16
  from validation import check_format_errors, estimate_cost, get_distributions
17
 
 
71
  user_role,
72
  model_role,
73
  whatsapp_name,
74
+ output_format,
75
  datetime_dayfirst,
76
  message_line_format,
77
  minutes_threshold,
 
80
  split_conversation_threshold,
81
  progress=gr.Progress(),
82
  ):
 
83
  logger.info(f"Processing {files}")
84
  full_system_prompt = f"""# Task
85
  You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
 
131
  f"Total number of generated examples: {total_number_of_generated_examples}"
132
  )
133
 
134
+ # Remove messages where we don't have the pattern user -> model -> user -> model (there should be at least one user message and one model message after every user message)
135
+ def has_valid_message_pattern(example):
136
+ messages = example["messages"]
137
+ if not messages:
138
+ return False
139
+ # The first message should be a system message, so we can skip it
140
+ if messages and messages[0]["role"] == "system":
141
+ messages = messages[1:]
142
+ # If there are less than 2 messages, we can't have a valid pattern
143
+ if len(messages) < 2:
144
+ return False
145
+ for i in range(0, len(messages) - 1, 2):
146
+ # Check if the current message is from the user and the next one is from the model
147
+ if messages[i]["role"] == user_role and messages[i + 1]["role"] == model_role:
148
+ continue
149
+ # If we reach here, it means the pattern is broken
150
+ return False
151
+ return True
152
+
153
+ full_examples_ds = full_examples_ds.filter(has_valid_message_pattern)
154
+ logger.info(
155
+ f"Number of examples after filtering for valid message pattern: {len(full_examples_ds)}"
156
+ )
157
+
158
  # Split into training and validation datasets (80% and 20%)
159
  try:
160
  split_examples_ds = full_examples_ds.train_test_split(
 
218
  training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
219
  validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
220
 
221
+ # Save as JSONL files with Gemini format - one JSON object per line
222
+ file_path = f"training_examples_gemini_{uuid}.jsonl"
223
+ training_examples_gemini.to_json(path_or_buf=file_path, force_ascii=False)
 
 
 
 
 
224
 
225
+ file_path_validation = f"validation_examples_gemini_{uuid}.jsonl"
226
+ validation_examples_gemini.to_json(path_or_buf=file_path_validation, force_ascii=False)
 
227
  else:
228
  # Original GPT format - JSONL
229
  file_path = f"training_examples_{uuid}.jsonl"
 
431
  user_role,
432
  model_role,
433
  whatsapp_name,
434
+ output_format,
435
  datetime_dayfirst,
436
  message_line_format,
437
  minutes_threshold,
utils.py CHANGED
@@ -525,52 +525,43 @@ def transform_conversations_dataset_into_training_examples(
525
 
526
  def convert_gpt_to_gemini_format(gpt_dataset):
527
  """
528
- Convert GPT format Dataset to Gemini 2.0 format
529
 
530
  GPT format: {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
531
  Gemini format: {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]}, "contents": [{"role": "user", "parts": [{"text": "..."}]}, {"role": "model", "parts": [{"text": "..."}]}]}
532
  """
533
- def process_examples(examples):
534
- gemini_examples = []
535
 
536
- for messages in examples["messages"]:
537
- # Parse JSON messages if they're strings
538
- if isinstance(messages, str):
539
- import json
540
- messages = json.loads(messages)
541
-
542
- # Find system message and other contents
543
- system_instruction = None
544
- contents = []
545
-
546
- for msg in messages:
547
- if msg["role"] == "system":
548
- system_instruction = {
549
- "role": "system",
550
- "parts": [{"text": msg["content"]}]
551
- }
552
- elif msg["role"] in ["user", "assistant", "model"]:
553
- # Convert assistant to model for Gemini
554
- role = "model" if msg["role"] == "assistant" else msg["role"]
555
- contents.append({
556
- "role": role,
557
- "parts": [{"text": msg["content"]}]
558
- })
559
-
560
- gemini_example = {"contents": contents}
561
- if system_instruction:
562
- gemini_example["systemInstruction"] = system_instruction
563
-
564
- gemini_examples.append(gemini_example)
565
 
566
- # Return in the format expected by Dataset.map
567
- return {"gemini_format": gemini_examples}
568
-
569
- # Use Dataset.map to process the data
570
- processed_dataset = gpt_dataset.map(
571
- process_examples,
572
- remove_columns=["messages"],
573
- batched=True,
574
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
- return processed_dataset
 
 
525
 
526
  def convert_gpt_to_gemini_format(gpt_dataset):
527
  """
528
+ Convert GPT format training examples to Gemini 2.0 format using Dataset
529
 
530
  GPT format: {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
531
  Gemini format: {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]}, "contents": [{"role": "user", "parts": [{"text": "..."}]}, {"role": "model", "parts": [{"text": "..."}]}]}
532
  """
533
+ def convert_example(example):
534
+ messages = example["messages"]
535
 
536
+ # Parse JSON messages if they're strings
537
+ if isinstance(messages, str):
538
+ import json
539
+ messages = json.loads(messages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
 
541
+ # Find system message and other contents
542
+ system_instruction = None
543
+ contents = []
544
+
545
+ for msg in messages:
546
+ if msg["role"] == "system":
547
+ system_instruction = {
548
+ "role": "system",
549
+ "parts": [{"text": msg["content"]}]
550
+ }
551
+ elif msg["role"] in ["user", "assistant", "model"]:
552
+ # Convert assistant to model for Gemini
553
+ role = "model" if msg["role"] == "assistant" else msg["role"]
554
+ contents.append({
555
+ "role": role,
556
+ "parts": [{"text": msg["content"]}]
557
+ })
558
+
559
+ # Build Gemini example - always include contents, optionally include systemInstruction
560
+ gemini_example = {"contents": contents}
561
+ if system_instruction:
562
+ gemini_example["systemInstruction"] = system_instruction
563
+
564
+ return gemini_example
565
 
566
+ # Use Dataset.map to convert each example individually
567
+ return gpt_dataset.map(convert_example, remove_columns=["messages"])