|  | import logging | 
					
						
						|  | from collections import defaultdict | 
					
						
						|  |  | 
					
						
						|  | import numpy as np | 
					
						
						|  | import tiktoken | 
					
						
						|  |  | 
					
						
						|  | logger = logging.getLogger(__name__) | 
					
						
						|  | logger.setLevel(logging.INFO) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def check_format_errors(train_dataset, user_role, model_role): | 
					
						
						|  | """ | 
					
						
						|  | Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | format_errors = defaultdict(int) | 
					
						
						|  |  | 
					
						
						|  | for ex in train_dataset: | 
					
						
						|  | if not isinstance(ex, dict): | 
					
						
						|  | format_errors["data_type"] += 1 | 
					
						
						|  | continue | 
					
						
						|  |  | 
					
						
						|  | messages = ex.get("messages", None) | 
					
						
						|  | if not messages: | 
					
						
						|  | format_errors["missing_messages_list"] += 1 | 
					
						
						|  | continue | 
					
						
						|  |  | 
					
						
						|  | for message in messages: | 
					
						
						|  | if "role" not in message or "content" not in message: | 
					
						
						|  | format_errors["message_missing_key"] += 1 | 
					
						
						|  |  | 
					
						
						|  | if any( | 
					
						
						|  | k not in ("role", "content", "name", "function_call", "weight") | 
					
						
						|  | for k in message | 
					
						
						|  | ): | 
					
						
						|  | format_errors["message_unrecognized_key"] += 1 | 
					
						
						|  |  | 
					
						
						|  | if message.get("role", None) not in ["system", user_role, model_role]: | 
					
						
						|  | format_errors["unrecognized_role"] += 1 | 
					
						
						|  |  | 
					
						
						|  | content = message.get("content", None) | 
					
						
						|  | function_call = message.get("function_call", None) | 
					
						
						|  |  | 
					
						
						|  | if (not content and not function_call) or not isinstance(content, str): | 
					
						
						|  | format_errors["missing_content"] += 1 | 
					
						
						|  |  | 
					
						
						|  | if not any(message.get("role", None) == model_role for message in messages): | 
					
						
						|  | format_errors["example_missing_assistant_message"] += 1 | 
					
						
						|  |  | 
					
						
						|  | if format_errors: | 
					
						
						|  | logger.warning("Found errors:") | 
					
						
						|  | for k, v in format_errors.items(): | 
					
						
						|  | logger.warning(f"{k}: {v}") | 
					
						
						|  | else: | 
					
						
						|  | logger.info("No errors found") | 
					
						
						|  |  | 
					
						
						|  | return format_errors if format_errors else {} | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_distributions(train_dataset, user_role, model_role): | 
					
						
						|  | """ | 
					
						
						|  | Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep | 
					
						
						|  |  | 
					
						
						|  | Gets the distributions of the number of messages per example, the total number of tokens per example, and the number of assistant tokens per example. | 
					
						
						|  | """ | 
					
						
						|  | encoding = tiktoken.get_encoding("cl100k_base") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1): | 
					
						
						|  | num_tokens = 0 | 
					
						
						|  | for message in messages: | 
					
						
						|  | num_tokens += tokens_per_message | 
					
						
						|  | for key, value in message.items(): | 
					
						
						|  | num_tokens += len(encoding.encode(value)) | 
					
						
						|  | if key == "name": | 
					
						
						|  | num_tokens += tokens_per_name | 
					
						
						|  | num_tokens += 3 | 
					
						
						|  | return num_tokens | 
					
						
						|  |  | 
					
						
						|  | def num_assistant_tokens_from_messages(messages): | 
					
						
						|  | num_tokens = 0 | 
					
						
						|  | for message in messages: | 
					
						
						|  | if message["role"] == model_role: | 
					
						
						|  | num_tokens += len(encoding.encode(message["content"])) | 
					
						
						|  | return num_tokens | 
					
						
						|  |  | 
					
						
						|  | n_missing_system = 0 | 
					
						
						|  | n_missing_user = 0 | 
					
						
						|  | n_messages = [] | 
					
						
						|  | convo_lens = [] | 
					
						
						|  | assistant_message_lens = [] | 
					
						
						|  |  | 
					
						
						|  | for ex in train_dataset: | 
					
						
						|  | messages = ex["messages"] | 
					
						
						|  | if not any(message["role"] == "system" for message in messages): | 
					
						
						|  | n_missing_system += 1 | 
					
						
						|  | if not any(message["role"] == user_role for message in messages): | 
					
						
						|  | n_missing_user += 1 | 
					
						
						|  | n_messages.append(len(messages)) | 
					
						
						|  | convo_lens.append(num_tokens_from_messages(messages)) | 
					
						
						|  | assistant_message_lens.append(num_assistant_tokens_from_messages(messages)) | 
					
						
						|  |  | 
					
						
						|  | return { | 
					
						
						|  | "n_missing_system": n_missing_system, | 
					
						
						|  | "n_missing_user": n_missing_user, | 
					
						
						|  | "n_messages": n_messages, | 
					
						
						|  | "convo_lens": convo_lens, | 
					
						
						|  | "assistant_message_lens": assistant_message_lens, | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def check_token_counts(train_dataset, user_role, model_role): | 
					
						
						|  | """ | 
					
						
						|  | Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | def print_distribution(values, name): | 
					
						
						|  | logger.info(f"\n#### Distribution of {name}:") | 
					
						
						|  | logger.info(f"min / max: {min(values)}, {max(values)}") | 
					
						
						|  | logger.info(f"mean / median: {np.mean(values)}, {np.median(values)}") | 
					
						
						|  | logger.info(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | distributions = get_distributions( | 
					
						
						|  | train_dataset, user_role=user_role, model_role=model_role | 
					
						
						|  | ) | 
					
						
						|  | n_missing_system = distributions["n_missing_system"] | 
					
						
						|  | n_missing_user = distributions["n_missing_user"] | 
					
						
						|  | n_messages = distributions["n_messages"] | 
					
						
						|  | convo_lens = distributions["convo_lens"] | 
					
						
						|  | assistant_message_lens = distributions["assistant_message_lens"] | 
					
						
						|  |  | 
					
						
						|  | logger.info("Num examples missing system message:", n_missing_system) | 
					
						
						|  | logger.info("Num examples missing user message:", n_missing_user) | 
					
						
						|  | print_distribution(n_messages, "num_messages_per_example") | 
					
						
						|  | print_distribution(convo_lens, "num_total_tokens_per_example") | 
					
						
						|  | print_distribution(assistant_message_lens, "num_assistant_tokens_per_example") | 
					
						
						|  | n_too_long = sum(l > 4096 for l in convo_lens) | 
					
						
						|  | logger.info( | 
					
						
						|  | f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | return | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def estimate_cost(train_dataset, user_role, model_role): | 
					
						
						|  | """ | 
					
						
						|  | Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep | 
					
						
						|  | """ | 
					
						
						|  | distributions = get_distributions( | 
					
						
						|  | train_dataset, user_role=user_role, model_role=model_role | 
					
						
						|  | ) | 
					
						
						|  | n_missing_system = distributions["n_missing_system"] | 
					
						
						|  | n_missing_user = distributions["n_missing_user"] | 
					
						
						|  | n_messages = distributions["n_messages"] | 
					
						
						|  | convo_lens = distributions["convo_lens"] | 
					
						
						|  | assistant_message_lens = distributions["assistant_message_lens"] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | MAX_TOKENS_PER_EXAMPLE = 4096 | 
					
						
						|  |  | 
					
						
						|  | TARGET_EPOCHS = 3 | 
					
						
						|  | MIN_TARGET_EXAMPLES = 100 | 
					
						
						|  | MAX_TARGET_EXAMPLES = 25000 | 
					
						
						|  | MIN_DEFAULT_EPOCHS = 1 | 
					
						
						|  | MAX_DEFAULT_EPOCHS = 25 | 
					
						
						|  |  | 
					
						
						|  | n_epochs = TARGET_EPOCHS | 
					
						
						|  | n_train_examples = len(train_dataset) | 
					
						
						|  | try: | 
					
						
						|  | if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES: | 
					
						
						|  | n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples) | 
					
						
						|  | elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES: | 
					
						
						|  | n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples) | 
					
						
						|  | except: | 
					
						
						|  | n_epochs = TARGET_EPOCHS | 
					
						
						|  |  | 
					
						
						|  | n_billing_tokens_in_dataset = sum( | 
					
						
						|  | min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | return { | 
					
						
						|  | "Estimated number of tokens in dataset": n_billing_tokens_in_dataset, | 
					
						
						|  | f"Estimated number of tokens that will be billed (assuming {n_epochs} training epochs)": n_epochs | 
					
						
						|  | * n_billing_tokens_in_dataset, | 
					
						
						|  | } | 
					
						
						|  |  |