import os
import argparse
import json
from gguf import GGUFReader
from typing import List, Dict, Any
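# GGUFReader comes from the `gguf` Python package (pip install gguf); it exposes the
# GGUF key/value metadata fields, such as the tokenizer.* keys read below.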


def extract_and_save_tokenizer_files(gguf_path: str, output_dir: str) -> None:
    """
    Extracts tokenizer metadata from a GGUF file and saves it as
    tokenizer.json, tokenizer_config.json, and special_tokens_map.json.
    """
    print(f"Loading GGUF file for tokenizer metadata: {gguf_path}")
    reader = GGUFReader(gguf_path, 'r')

    # --- Extract raw metadata from GGUF ---
    try:
        vocab_list_raw = reader.get_field("tokenizer.ggml.tokens").parts[0]
        merges_list = reader.get_field("tokenizer.ggml.merges").parts[0]
        bos_token_id = int(reader.get_field("tokenizer.ggml.bos_token_id").parts[0])
        eos_token_id = int(reader.get_field("tokenizer.ggml.eos_token_id").parts[0])
        unk_token_id = int(reader.get_field("tokenizer.ggml.unknown_token_id").parts[0])
        padding_token_id = int(reader.get_field("tokenizer.ggml.padding_token_id").parts[0])
        model_max_length = int(reader.get_field("llama.context_length").parts[0])

        # Optional: chat template
        chat_template = None
        try:
            chat_template = reader.get_field("tokenizer.chat_template").parts[0]
        except (KeyError, AttributeError):
            pass  # Chat template may be absent; get_field returns None for missing keys

        # Convert raw vocab bytes to strings
        vocab_list = [token.decode('utf-8', errors='ignore') for token in vocab_list_raw]
    except Exception as e:
        print(f"Fatal Error: Could not extract essential tokenizer metadata from GGUF. Error: {e}")
        return

    # --- 1. Create tokenizer.json ---
    try:
        # The vocab for tokenizer.json needs to be a dict of {token_string: id}
        vocab_dict = {token: i for i, token in enumerate(vocab_list)}
        tokenizer_json_data = {
            "version": "1.0",
            "truncation": None,
            "padding": None,
            "added_tokens": [],  # GGUF doesn't typically store this separately
            "normalizer": {
                "type": "Sequence",
                "normalizers": [
                    {"type": "NFC"},
                    {"type": "Replace", "pattern": " ", "content": " "},  # Example, adjust if needed
                ]
            },
            "pre_tokenizer": {
                "type": "ByteLevel",  # Common for BPE models like GPT2/Llama
                "add_prefix_space": False,  # Based on tokenizer.ggml.add_space_prefix = 0
                "splits_by_unicode_script": False,
                "trim_offsets": True
            },
            "post_processor": {
                "type": "ByteLevel",
                "truncation": None,
                "padding": None,
                "add_prefix_space": False,
                "trim_offsets": True
            },
            "decoder": {
                "type": "ByteLevel",
                "add_prefix_space": False,
                "trim_offsets": True
            },
            "model": {
                "type": "BPE",
                "vocab": vocab_dict,
                "merges": merges_list,
                "dropout": None,
                "unk_token": vocab_list[unk_token_id] if 0 <= unk_token_id < len(vocab_list) else "<unk>"
            }
        }
        tokenizer_json_path = os.path.join(output_dir, "tokenizer.json")
        with open(tokenizer_json_path, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_json_data, f, indent=None, separators=(',', ':'))  # Compact format
        print(f"Created tokenizer.json at {tokenizer_json_path}")
    except Exception as e:
        print(f"Warning: Could not create tokenizer.json. Error: {e}")
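    # Optional sanity check (assumes the `tokenizers` package is installed; illustrative only):
    #     from tokenizers import Tokenizer
    #     Tokenizer.from_file(tokenizer_json_path).encode("hello world")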

    # --- 2. Create tokenizer_config.json ---
    try:
        tokenizer_config_data = {
            "model_max_length": model_max_length,
            "padding_side": "left",  # Common default for causal models
            "tokenizer_class": "LlamaTokenizer",  # Mistral uses LlamaTokenizer
            "clean_up_tokenization_spaces": False,
            "add_bos_token": bool(reader.get_field("tokenizer.ggml.add_bos_token").parts[0]),
            "add_eos_token": bool(reader.get_field("tokenizer.ggml.add_eos_token").parts[0]),
        }
        if chat_template:
            tokenizer_config_data["chat_template"] = chat_template
        tokenizer_config_path = os.path.join(output_dir, "tokenizer_config.json")
        with open(tokenizer_config_path, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config_data, f, indent=2)
        print(f"Created tokenizer_config.json at {tokenizer_config_path}")
    except Exception as e:
        print(f"Warning: Could not create tokenizer_config.json. Error: {e}")

    # --- 3. Create special_tokens_map.json ---
    try:
        special_tokens_map_data = {}

        def get_token_string(token_id, default_str):
            if 0 <= token_id < len(vocab_list):
                return vocab_list[token_id]
            return default_str

        special_tokens_map_data["bos_token"] = get_token_string(bos_token_id, "<|begin_of_text|>")
        special_tokens_map_data["eos_token"] = get_token_string(eos_token_id, "<|end_of_text|>")
        special_tokens_map_data["unk_token"] = get_token_string(unk_token_id, "<unk>")
        special_tokens_map_data["pad_token"] = get_token_string(padding_token_id, "<pad>")

        special_tokens_map_path = os.path.join(output_dir, "special_tokens_map.json")
        with open(special_tokens_map_path, 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map_data, f, indent=2)
        print(f"Created special_tokens_map.json at {special_tokens_map_path}")
    except Exception as e:
        print(f"Warning: Could not create special_tokens_map.json. Error: {e}")
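    # The three files written above are what transformers.AutoTokenizer typically expects;
    # loading them with AutoTokenizer.from_pretrained(output_dir) is a reasonable way to
    # sanity-check the result (requires the `transformers` package).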


def main():
    parser = argparse.ArgumentParser(
        description="Extracts tokenizer metadata from a GGUF file and saves it as Hugging Face tokenizer files."
    )
    parser.add_argument("--gguf-file", required=True, help="Path to the original GGUF file to read metadata from.")
    parser.add_argument("--output-dir", required=True, help="Path to the directory where the tokenizer files will be saved.")
    args = parser.parse_args()

    if not os.path.isfile(args.gguf_file):
        print(f"Error: GGUF file not found at {args.gguf_file}")
        return

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
        print(f"Created output directory: {args.output_dir}")

    extract_and_save_tokenizer_files(args.gguf_file, args.output_dir)
    print("\nTokenizer file generation complete.")

if __name__ == "__main__":
    main()
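
# Example invocation (the script name and paths here are illustrative, not fixed by the project):
#   python extract_tokenizer_from_gguf.py --gguf-file ./model.gguf --output-dir ./hf_tokenizer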