ACMCMC
		
	committed on
		
		
					Commit 
							
							·
						
						4aad28a
	
1
								Parent(s):
							
							e79532a
								
Bugfix
Browse files
    	
        app.py
    CHANGED
    
    | @@ -11,7 +11,7 @@ import matplotlib.pyplot as plt | |
| 11 | 
             
            from utils import (
         | 
| 12 | 
             
                process_chat_file,
         | 
| 13 | 
             
                transform_conversations_dataset_into_training_examples,
         | 
| 14 | 
            -
                convert_gpt_to_gemini_format,  #  | 
| 15 | 
             
            )
         | 
| 16 | 
             
            from validation import check_format_errors, estimate_cost, get_distributions
         | 
| 17 |  | 
| @@ -71,6 +71,7 @@ def file_upload_callback( | |
| 71 | 
             
                user_role,
         | 
| 72 | 
             
                model_role,
         | 
| 73 | 
             
                whatsapp_name,
         | 
|  | |
| 74 | 
             
                datetime_dayfirst,
         | 
| 75 | 
             
                message_line_format,
         | 
| 76 | 
             
                minutes_threshold,
         | 
| @@ -79,7 +80,6 @@ def file_upload_callback( | |
| 79 | 
             
                split_conversation_threshold,
         | 
| 80 | 
             
                progress=gr.Progress(),
         | 
| 81 | 
             
            ):
         | 
| 82 | 
            -
                output_format = "GPT"
         | 
| 83 | 
             
                logger.info(f"Processing {files}")
         | 
| 84 | 
             
                full_system_prompt = f"""# Task
         | 
| 85 | 
             
            You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
         | 
| @@ -131,6 +131,30 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J | |
| 131 | 
             
                    f"Total number of generated examples: {total_number_of_generated_examples}"
         | 
| 132 | 
             
                )
         | 
| 133 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 134 | 
             
                # Split into training and validation datasets (80% and 20%)
         | 
| 135 | 
             
                try:
         | 
| 136 | 
             
                    split_examples_ds = full_examples_ds.train_test_split(
         | 
| @@ -194,18 +218,12 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J | |
| 194 | 
             
                    training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
         | 
| 195 | 
             
                    validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
         | 
| 196 |  | 
| 197 | 
            -
                    #  | 
| 198 | 
            -
                     | 
| 199 | 
            -
                     | 
| 200 | 
            -
                    
         | 
| 201 | 
            -
                    # Save as JSON files with Gemini format
         | 
| 202 | 
            -
                    file_path = f"training_examples_gemini_{uuid}.json"
         | 
| 203 | 
            -
                    with open(file_path, 'w', encoding='utf-8') as f:
         | 
| 204 | 
            -
                        json.dump(training_gemini_list, f, ensure_ascii=False, indent=2)
         | 
| 205 |  | 
| 206 | 
            -
                    file_path_validation = f"validation_examples_gemini_{uuid}. | 
| 207 | 
            -
                     | 
| 208 | 
            -
                        json.dump(validation_gemini_list, f, ensure_ascii=False, indent=2)
         | 
| 209 | 
             
                else:
         | 
| 210 | 
             
                    # Original GPT format - JSONL
         | 
| 211 | 
             
                    file_path = f"training_examples_{uuid}.jsonl"
         | 
| @@ -413,7 +431,7 @@ with gr.Blocks(theme=theme) as demo: | |
| 413 | 
             
                        user_role,
         | 
| 414 | 
             
                        model_role,
         | 
| 415 | 
             
                        whatsapp_name,
         | 
| 416 | 
            -
                         | 
| 417 | 
             
                        datetime_dayfirst,
         | 
| 418 | 
             
                        message_line_format,
         | 
| 419 | 
             
                        minutes_threshold,
         | 
|  | |
| 11 | 
             
            from utils import (
         | 
| 12 | 
             
                process_chat_file,
         | 
| 13 | 
             
                transform_conversations_dataset_into_training_examples,
         | 
| 14 | 
            +
                convert_gpt_to_gemini_format,  # Use the fixed version
         | 
| 15 | 
             
            )
         | 
| 16 | 
             
            from validation import check_format_errors, estimate_cost, get_distributions
         | 
| 17 |  | 
|  | |
| 71 | 
             
                user_role,
         | 
| 72 | 
             
                model_role,
         | 
| 73 | 
             
                whatsapp_name,
         | 
| 74 | 
            +
                output_format,
         | 
| 75 | 
             
                datetime_dayfirst,
         | 
| 76 | 
             
                message_line_format,
         | 
| 77 | 
             
                minutes_threshold,
         | 
|  | |
| 80 | 
             
                split_conversation_threshold,
         | 
| 81 | 
             
                progress=gr.Progress(),
         | 
| 82 | 
             
            ):
         | 
|  | |
| 83 | 
             
                logger.info(f"Processing {files}")
         | 
| 84 | 
             
                full_system_prompt = f"""# Task
         | 
| 85 | 
             
            You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
         | 
|  | |
| 131 | 
             
                    f"Total number of generated examples: {total_number_of_generated_examples}"
         | 
| 132 | 
             
                )
         | 
| 133 |  | 
| 134 | 
            +
                # Drop examples whose messages do not follow the pattern user -> model -> user -> model (after the optional leading system message, every user message should be followed by a model message)
         | 
| 135 | 
            +
                def has_valid_message_pattern(example):
                    """Return True when an example's turns strictly alternate user -> model.

                    A leading message whose role is "system" is ignored. The remaining
                    messages must form one or more complete (user, model) pairs, using
                    the enclosing scope's ``user_role`` and ``model_role`` as the
                    expected role names.

                    Args:
                        example: Mapping with a "messages" sequence of dicts, each
                            carrying a "role" key.

                    Returns:
                        False for empty conversations, for conversations with fewer
                        than two non-system messages, for an odd number of non-system
                        messages (some turn would have no partner), or when any pair
                        is not (user_role, model_role). True otherwise.
                    """
                    messages = example["messages"]
                    if not messages:
                        return False
                    # The first message should be a system message, so we can skip it
                    if messages[0]["role"] == "system":
                        messages = messages[1:]
                    # At least one full pair is required. An odd count means the
                    # final message has no partner, which a stride-2 pairing that
                    # stops at len - 1 would never inspect.
                    if len(messages) < 2 or len(messages) % 2 != 0:
                        return False
                    # Every even position must be the user and the following one the model
                    return all(
                        messages[i]["role"] == user_role
                        and messages[i + 1]["role"] == model_role
                        for i in range(0, len(messages), 2)
                    )
         | 
| 152 | 
            +
                
         | 
| 153 | 
            +
                full_examples_ds = full_examples_ds.filter(has_valid_message_pattern)
         | 
| 154 | 
            +
                logger.info(
         | 
| 155 | 
            +
                    f"Number of examples after filtering for valid message pattern: {len(full_examples_ds)}"
         | 
| 156 | 
            +
                )
         | 
| 157 | 
            +
             | 
| 158 | 
             
                # Split into training and validation datasets (80% and 20%)
         | 
| 159 | 
             
                try:
         | 
| 160 | 
             
                    split_examples_ds = full_examples_ds.train_test_split(
         | 
|  | |
| 218 | 
             
                    training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
         | 
| 219 | 
             
                    validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
         | 
| 220 |  | 
| 221 | 
            +
                    # Save as JSONL files with Gemini format - one JSON object per line
         | 
| 222 | 
            +
                    file_path = f"training_examples_gemini_{uuid}.jsonl"
         | 
| 223 | 
            +
                    training_examples_gemini.to_json(path_or_buf=file_path, force_ascii=False)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 224 |  | 
| 225 | 
            +
                    file_path_validation = f"validation_examples_gemini_{uuid}.jsonl"
         | 
| 226 | 
            +
                    validation_examples_gemini.to_json(path_or_buf=file_path_validation, force_ascii=False)
         | 
|  | |
| 227 | 
             
                else:
         | 
| 228 | 
             
                    # Original GPT format - JSONL
         | 
| 229 | 
             
                    file_path = f"training_examples_{uuid}.jsonl"
         | 
|  | |
| 431 | 
             
                        user_role,
         | 
| 432 | 
             
                        model_role,
         | 
| 433 | 
             
                        whatsapp_name,
         | 
| 434 | 
            +
                        output_format,
         | 
| 435 | 
             
                        datetime_dayfirst,
         | 
| 436 | 
             
                        message_line_format,
         | 
| 437 | 
             
                        minutes_threshold,
         | 
    	
        utils.py
    CHANGED
    
    | @@ -525,52 +525,43 @@ def transform_conversations_dataset_into_training_examples( | |
| 525 |  | 
| 526 | 
             
            def convert_gpt_to_gemini_format(gpt_dataset):
         | 
| 527 | 
             
                """
         | 
| 528 | 
            -
                Convert GPT format  | 
| 529 |  | 
| 530 | 
             
                GPT format: {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
         | 
| 531 | 
             
                Gemini format: {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]}, "contents": [{"role": "user", "parts": [{"text": "..."}]}, {"role": "model", "parts": [{"text": "..."}]}]}
         | 
| 532 | 
             
                """
         | 
| 533 | 
            -
                def  | 
| 534 | 
            -
                     | 
| 535 |  | 
| 536 | 
            -
                     | 
| 537 | 
            -
             | 
| 538 | 
            -
                         | 
| 539 | 
            -
             | 
| 540 | 
            -
                            messages = json.loads(messages)
         | 
| 541 | 
            -
                        
         | 
| 542 | 
            -
                        # Find system message and other contents
         | 
| 543 | 
            -
                        system_instruction = None
         | 
| 544 | 
            -
                        contents = []
         | 
| 545 | 
            -
                        
         | 
| 546 | 
            -
                        for msg in messages:
         | 
| 547 | 
            -
                            if msg["role"] == "system":
         | 
| 548 | 
            -
                                system_instruction = {
         | 
| 549 | 
            -
                                    "role": "system", 
         | 
| 550 | 
            -
                                    "parts": [{"text": msg["content"]}]
         | 
| 551 | 
            -
                                }
         | 
| 552 | 
            -
                            elif msg["role"] in ["user", "assistant", "model"]:
         | 
| 553 | 
            -
                                # Convert assistant to model for Gemini
         | 
| 554 | 
            -
                                role = "model" if msg["role"] == "assistant" else msg["role"]
         | 
| 555 | 
            -
                                contents.append({
         | 
| 556 | 
            -
                                    "role": role,
         | 
| 557 | 
            -
                                    "parts": [{"text": msg["content"]}]
         | 
| 558 | 
            -
                                })
         | 
| 559 | 
            -
                        
         | 
| 560 | 
            -
                        gemini_example = {"contents": contents}
         | 
| 561 | 
            -
                        if system_instruction:
         | 
| 562 | 
            -
                            gemini_example["systemInstruction"] = system_instruction
         | 
| 563 | 
            -
                        
         | 
| 564 | 
            -
                        gemini_examples.append(gemini_example)
         | 
| 565 |  | 
| 566 | 
            -
                    #  | 
| 567 | 
            -
                     | 
| 568 | 
            -
             | 
| 569 | 
            -
             | 
| 570 | 
            -
             | 
| 571 | 
            -
             | 
| 572 | 
            -
             | 
| 573 | 
            -
             | 
| 574 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 575 |  | 
| 576 | 
            -
                 | 
|  | 
|  | |
| 525 |  | 
| 526 | 
             
            def convert_gpt_to_gemini_format(gpt_dataset):
                """
                Re-shape GPT-style chat examples into Gemini-style examples.

                Each input example carries a "messages" list of {"role", "content"}
                dicts (or a JSON string encoding such a list). Each output example has:

                - "contents": the user/assistant/model turns, in order, as
                  {"role": ..., "parts": [{"text": ...}]}, with "assistant" renamed to
                  "model";
                - "systemInstruction": present only when a system message was seen,
                  built from the last system message encountered.

                Messages with any other role are dropped. The conversion is applied via
                ``gpt_dataset.map(..., remove_columns=["messages"])`` and that call's
                result is returned (presumably a Hugging Face ``Dataset`` - confirm
                against the caller).
                """
                def _to_gemini(record):
                    turns = record["messages"]

                    # The column may hold a JSON-encoded string instead of a list
                    if isinstance(turns, str):
                        import json
                        turns = json.loads(turns)

                    system_block = None
                    chat_turns = []

                    for turn in turns:
                        speaker = turn["role"]
                        if speaker == "system":
                            system_block = {
                                "role": "system",
                                "parts": [{"text": turn["content"]}],
                            }
                            continue
                        if speaker not in ("user", "assistant", "model"):
                            # Unknown roles are not carried over
                            continue
                        # Gemini names the assistant side "model"
                        gemini_role = "model" if speaker == "assistant" else speaker
                        chat_turns.append(
                            {"role": gemini_role, "parts": [{"text": turn["content"]}]}
                        )

                    # "contents" is always emitted; "systemInstruction" only when found
                    converted = {"contents": chat_turns}
                    if system_block:
                        converted["systemInstruction"] = system_block
                    return converted

                # Convert row by row and drop the original GPT-style column
                return gpt_dataset.map(_to_gemini, remove_columns=["messages"])
         | 
