ACMCMC committed on
Commit
4aad28a
·
1 Parent(s): e79532a
Files changed (2) hide show
  1. app.py +32 -14
  2. utils.py +33 -42
app.py CHANGED
@@ -11,7 +11,7 @@ import matplotlib.pyplot as plt
11
  from utils import (
12
  process_chat_file,
13
  transform_conversations_dataset_into_training_examples,
14
- convert_gpt_to_gemini_format, # Add this import
15
  )
16
  from validation import check_format_errors, estimate_cost, get_distributions
17
 
@@ -71,6 +71,7 @@ def file_upload_callback(
71
  user_role,
72
  model_role,
73
  whatsapp_name,
 
74
  datetime_dayfirst,
75
  message_line_format,
76
  minutes_threshold,
@@ -79,7 +80,6 @@ def file_upload_callback(
79
  split_conversation_threshold,
80
  progress=gr.Progress(),
81
  ):
82
- output_format = "GPT"
83
  logger.info(f"Processing {files}")
84
  full_system_prompt = f"""# Task
85
  You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
@@ -131,6 +131,30 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
131
  f"Total number of generated examples: {total_number_of_generated_examples}"
132
  )
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  # Split into training and validation datasets (80% and 20%)
135
  try:
136
  split_examples_ds = full_examples_ds.train_test_split(
@@ -194,18 +218,12 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
194
  training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
195
  validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
196
 
197
- # Extract the gemini_format column and save as JSON files
198
- training_gemini_list = training_examples_gemini["gemini_format"]
199
- validation_gemini_list = validation_examples_gemini["gemini_format"]
200
-
201
- # Save as JSON files with Gemini format
202
- file_path = f"training_examples_gemini_{uuid}.json"
203
- with open(file_path, 'w', encoding='utf-8') as f:
204
- json.dump(training_gemini_list, f, ensure_ascii=False, indent=2)
205
 
206
- file_path_validation = f"validation_examples_gemini_{uuid}.json"
207
- with open(file_path_validation, 'w', encoding='utf-8') as f:
208
- json.dump(validation_gemini_list, f, ensure_ascii=False, indent=2)
209
  else:
210
  # Original GPT format - JSONL
211
  file_path = f"training_examples_{uuid}.jsonl"
@@ -413,7 +431,7 @@ with gr.Blocks(theme=theme) as demo:
413
  user_role,
414
  model_role,
415
  whatsapp_name,
416
- # output_format,
417
  datetime_dayfirst,
418
  message_line_format,
419
  minutes_threshold,
 
11
  from utils import (
12
  process_chat_file,
13
  transform_conversations_dataset_into_training_examples,
14
+ convert_gpt_to_gemini_format, # Use the fixed version
15
  )
16
  from validation import check_format_errors, estimate_cost, get_distributions
17
 
 
71
  user_role,
72
  model_role,
73
  whatsapp_name,
74
+ output_format,
75
  datetime_dayfirst,
76
  message_line_format,
77
  minutes_threshold,
 
80
  split_conversation_threshold,
81
  progress=gr.Progress(),
82
  ):
 
83
  logger.info(f"Processing {files}")
84
  full_system_prompt = f"""# Task
85
  You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
 
131
  f"Total number of generated examples: {total_number_of_generated_examples}"
132
  )
133
 
134
+ # Remove messages where we don't have the pattern user -> model -> user -> model (there should be at least one user message and one model message after every user message)
135
+ def has_valid_message_pattern(example):
136
+ messages = example["messages"]
137
+ if not messages:
138
+ return False
139
+ # The first message should be a system message, so we can skip it
140
+ if messages and messages[0]["role"] == "system":
141
+ messages = messages[1:]
142
+ # If there are less than 2 messages, we can't have a valid pattern
143
+ if len(messages) < 2:
144
+ return False
145
+ for i in range(0, len(messages) - 1, 2):
146
+ # Check if the current message is from the user and the next one is from the model
147
+ if messages[i]["role"] == user_role and messages[i + 1]["role"] == model_role:
148
+ continue
149
+ # If we reach here, it means the pattern is broken
150
+ return False
151
+ return True
152
+
153
+ full_examples_ds = full_examples_ds.filter(has_valid_message_pattern)
154
+ logger.info(
155
+ f"Number of examples after filtering for valid message pattern: {len(full_examples_ds)}"
156
+ )
157
+
158
  # Split into training and validation datasets (80% and 20%)
159
  try:
160
  split_examples_ds = full_examples_ds.train_test_split(
 
218
  training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
219
  validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
220
 
221
+ # Save as JSONL files with Gemini format - one JSON object per line
222
+ file_path = f"training_examples_gemini_{uuid}.jsonl"
223
+ training_examples_gemini.to_json(path_or_buf=file_path, force_ascii=False)
 
 
 
 
 
224
 
225
+ file_path_validation = f"validation_examples_gemini_{uuid}.jsonl"
226
+ validation_examples_gemini.to_json(path_or_buf=file_path_validation, force_ascii=False)
 
227
  else:
228
  # Original GPT format - JSONL
229
  file_path = f"training_examples_{uuid}.jsonl"
 
431
  user_role,
432
  model_role,
433
  whatsapp_name,
434
+ output_format,
435
  datetime_dayfirst,
436
  message_line_format,
437
  minutes_threshold,
utils.py CHANGED
@@ -525,52 +525,43 @@ def transform_conversations_dataset_into_training_examples(
525
 
526
  def convert_gpt_to_gemini_format(gpt_dataset):
527
  """
528
- Convert GPT format Dataset to Gemini 2.0 format
529
 
530
  GPT format: {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
531
  Gemini format: {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]}, "contents": [{"role": "user", "parts": [{"text": "..."}]}, {"role": "model", "parts": [{"text": "..."}]}]}
532
  """
533
- def process_examples(examples):
534
- gemini_examples = []
535
 
536
- for messages in examples["messages"]:
537
- # Parse JSON messages if they're strings
538
- if isinstance(messages, str):
539
- import json
540
- messages = json.loads(messages)
541
-
542
- # Find system message and other contents
543
- system_instruction = None
544
- contents = []
545
-
546
- for msg in messages:
547
- if msg["role"] == "system":
548
- system_instruction = {
549
- "role": "system",
550
- "parts": [{"text": msg["content"]}]
551
- }
552
- elif msg["role"] in ["user", "assistant", "model"]:
553
- # Convert assistant to model for Gemini
554
- role = "model" if msg["role"] == "assistant" else msg["role"]
555
- contents.append({
556
- "role": role,
557
- "parts": [{"text": msg["content"]}]
558
- })
559
-
560
- gemini_example = {"contents": contents}
561
- if system_instruction:
562
- gemini_example["systemInstruction"] = system_instruction
563
-
564
- gemini_examples.append(gemini_example)
565
 
566
- # Return in the format expected by Dataset.map
567
- return {"gemini_format": gemini_examples}
568
-
569
- # Use Dataset.map to process the data
570
- processed_dataset = gpt_dataset.map(
571
- process_examples,
572
- remove_columns=["messages"],
573
- batched=True,
574
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
- return processed_dataset
 
 
525
 
526
  def convert_gpt_to_gemini_format(gpt_dataset):
527
  """
528
+ Convert GPT format training examples to Gemini 2.0 format using Dataset
529
 
530
  GPT format: {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
531
  Gemini format: {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]}, "contents": [{"role": "user", "parts": [{"text": "..."}]}, {"role": "model", "parts": [{"text": "..."}]}]}
532
  """
533
+ def convert_example(example):
534
+ messages = example["messages"]
535
 
536
+ # Parse JSON messages if they're strings
537
+ if isinstance(messages, str):
538
+ import json
539
+ messages = json.loads(messages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
 
541
+ # Find system message and other contents
542
+ system_instruction = None
543
+ contents = []
544
+
545
+ for msg in messages:
546
+ if msg["role"] == "system":
547
+ system_instruction = {
548
+ "role": "system",
549
+ "parts": [{"text": msg["content"]}]
550
+ }
551
+ elif msg["role"] in ["user", "assistant", "model"]:
552
+ # Convert assistant to model for Gemini
553
+ role = "model" if msg["role"] == "assistant" else msg["role"]
554
+ contents.append({
555
+ "role": role,
556
+ "parts": [{"text": msg["content"]}]
557
+ })
558
+
559
+ # Build Gemini example - always include contents, optionally include systemInstruction
560
+ gemini_example = {"contents": contents}
561
+ if system_instruction:
562
+ gemini_example["systemInstruction"] = system_instruction
563
+
564
+ return gemini_example
565
 
566
+ # Use Dataset.map to convert each example individually
567
+ return gpt_dataset.map(convert_example, remove_columns=["messages"])