# Resize the vocab-dependent tensors of a sharded safetensors checkpoint.
import glob
import json
import os
import shutil
from collections import OrderedDict

import torch
from safetensors.torch import save_file, load_file
def fix_vocab_size(model_dir, output_dir, new_vocab_size=131072):
    """
    Resize the vocabulary-dependent tensors of a sharded safetensors model.

    Trims ``model.embed_tokens.weight`` and ``lm_head.weight`` to
    ``new_vocab_size`` rows, copies every shard (modified or not) into
    ``output_dir``, rewrites ``vocab_size`` in ``config.json``, and copies
    the remaining auxiliary files (tokenizer configs, READMEs, ...).

    Args:
        model_dir (str): Directory containing the original model shards.
        output_dir (str): Directory where the fixed model is written.
        new_vocab_size (int): Target vocabulary size (rows to keep).
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created output directory: {output_dir}")

        # --- Step 1: Find all safetensor shards ---
        search_pattern = os.path.join(model_dir, '*.safetensors')
        shard_paths = sorted(glob.glob(search_pattern))
        if not shard_paths:
            print(f"Error: No '.safetensors' files found in {model_dir}")
            return
        print(f"Found {len(shard_paths)} shards to process.")

        # --- Step 2: Identify which shards contain the vocab tensors ---
        # Only the safetensors header is parsed here (8-byte little-endian
        # length prefix followed by a JSON blob), so we never read a whole
        # multi-GB shard just to list its keys.
        vocab_tensor_keys = {"model.embed_tokens.weight", "lm_head.weight"}
        keys_per_shard = {}  # {filename: [tensor keys to resize]}
        for shard_path in shard_paths:
            with open(shard_path, "rb") as f:
                header_size = int.from_bytes(f.read(8), 'little')
                header = json.loads(f.read(header_size).decode('utf-8'))
            filename = os.path.basename(shard_path)
            for key in header:
                if key in vocab_tensor_keys:
                    keys_per_shard.setdefault(filename, []).append(key)
                    print(f"Found '{key}' in shard: {filename}")
        if not keys_per_shard:
            print("Error: Could not find 'embed_tokens' or 'lm_head' tensors in any shard.")
            return

        # --- Step 3: Process all shards, modifying the ones with vocab tensors ---
        # Each shard is loaded exactly once; the resize happens in place on
        # the loaded dict before re-saving.
        for shard_path in shard_paths:
            filename = os.path.basename(shard_path)
            output_shard_path = os.path.join(output_dir, filename)
            tensors = load_file(shard_path)
            if filename in keys_per_shard:
                print(f"Resizing tensors in {filename}...")
                for key in keys_per_shard[filename]:
                    original_size = tensors[key].shape[0]
                    if original_size < new_vocab_size:
                        # Slicing cannot grow a tensor; flag it instead of
                        # silently writing an undersized vocabulary.
                        print(f"  - Warning: '{key}' has only {original_size} rows; "
                              f"cannot grow to {new_vocab_size}, leaving unchanged.")
                        continue
                    print(f"  - Resizing '{key}' from {original_size} to {new_vocab_size}")
                    # Trim the tensor to the new vocabulary size.
                    tensors[key] = tensors[key][:new_vocab_size, :]
            save_file(tensors, output_shard_path)
            print(f"Saved new shard: {output_shard_path}")

        # --- Step 4: Modify and save the config.json ---
        config_path = os.path.join(model_dir, 'config.json')
        new_config_path = os.path.join(output_dir, 'config.json')
        if os.path.exists(config_path):
            with open(config_path, 'r') as f:
                config = json.load(f)
            print(f"\nUpdating config.json: 'vocab_size' from {config.get('vocab_size')} to {new_vocab_size}")
            config['vocab_size'] = new_vocab_size
            with open(new_config_path, 'w') as f:
                json.dump(config, f, indent=2)
            print(f"Saved new config.json to {new_config_path}")
        else:
            print("Warning: config.json not found. Please create it manually.")

        # --- Step 5: Copy other essential files ---
        for filename in os.listdir(model_dir):
            if filename.endswith(('.json', '.py', '.md', '.txt')) and filename != 'config.json':
                if not os.path.exists(os.path.join(output_dir, filename)):
                    shutil.copy2(os.path.join(model_dir, filename), output_dir)
                    print(f"Copied {filename} to output directory.")

        print("\nVocabulary resizing complete. The model is now ready for merging.")
    except Exception as e:
        # Best-effort CLI tool: report and return rather than crash.
        print(f"An error occurred: {e}")
| if __name__ == "__main__": | |
| # --- Configuration --- | |
| # Directory of the original DeepHermes-24B model | |
| input_model_directory = r"path/to/your/DeepHermes-24B" | |
| # Directory to save the fixed, merge-ready model | |
| output_model_directory = r"path/to/your/DeepHermes-24B/fixed" | |
| # The standard vocab size you are targeting for the merge | |
| target_vocab_size = 131072 | |
| # --- Run the script --- | |
| fix_vocab_size(input_model_directory, output_model_directory, target_vocab_size) |