Upload folder using huggingface_hub
modules/SD15/SDToken.py  CHANGED  (+7 -35)
@@ -236,54 +236,32 @@ class SDTokenizer:
             - `pad_to_max_length` (bool, optional): Whether to pad to the maximum length. Defaults to True.
             - `min_length` (int, optional): The minimum length of the input. Defaults to None.
         """
-        # Ensure tokenizer path exists
         if tokenizer_path is None:
-            tokenizer_path = "
-
-        # Verify path exists
-        if not os.path.exists(tokenizer_path):
-            raise ValueError(f"Tokenizer path does not exist: {tokenizer_path}")
-
-        try:
-            if tokenizer_path is None:
-                # Use pre-bundled tokenizer
-                self.tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
-            else:
-                # Try local tokenizer files
-                self.tokenizer = CLIPTokenizerFast.from_pretrained(tokenizer_path)
-        except Exception as e:
-            raise RuntimeError(f"Failed to load tokenizer from {tokenizer_path}: {str(e)}")
-
+            tokenizer_path = os.path.join("_internal/sd1_tokenizer/", "")
+        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
         self.max_length = max_length
         self.min_length = min_length
-
-        # Get tokens from empty string tokenization
+
         empty = self.tokenizer("")["input_ids"]
-
         if has_start_token:
             self.tokens_start = 1
             self.start_token = empty[0]
             self.end_token = empty[1]
         else:
             self.tokens_start = 0
-            self.start_token = None
+            self.start_token = None
             self.end_token = empty[0]
-
         self.pad_with_end = pad_with_end
         self.pad_to_max_length = pad_to_max_length
 
-        # Create vocab lookup
         vocab = self.tokenizer.get_vocab()
         self.inv_vocab = {v: k for k, v in vocab.items()}
-
-        # Set embedding properties
         self.embedding_directory = embedding_directory
         self.max_word_length = 8
         self.embedding_identifier = "embedding:"
         self.embedding_size = embedding_size
         self.embedding_key = embedding_key
 
-
     def _try_get_embedding(self, embedding_name: str) -> tuple:
         """#### Try to get an embedding.
 
@@ -432,7 +410,7 @@ class SD1Tokenizer:
 class SD1Tokenizer:
     """#### Class representing the SD1Tokenizer."""
 
-    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer
+    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer):
         """#### Initialize the SD1Tokenizer.
 
         #### Args:
@@ -441,14 +419,8 @@ class SD1Tokenizer:
            - `tokenizer` (type, optional): The tokenizer class. Defaults to SDTokenizer.
        """
        self.clip_name = clip_name
-        self.clip =
-
-        # Initialize tokenizer with proper arguments
-        kwargs = {"embedding_directory": embedding_directory}
-        if tokenizer_data:
-            kwargs.update(tokenizer_data)
-
-        setattr(self, self.clip, tokenizer(**kwargs))
+        self.clip = "clip_{}".format(self.clip_name)
+        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory))
 
     def tokenize_with_weights(self, text: str, return_word_ids: bool = False) -> dict:
         """#### Tokenize text with weights.
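For reference, the loading pattern the new code relies on can be exercised on its own. This is a minimal sketch, assuming `tokenizer_class` resolves to `transformers.CLIPTokenizerFast` (the class used in the removed branch); the hub id below also comes from the removed code and stands in for the bundled `_internal/sd1_tokenizer/` folder so the snippet runs without it.

# Sketch of the tokenizer loading and special-token probing used in __init__.
from transformers import CLIPTokenizerFast

tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
empty = tokenizer("")["input_ids"]
# Tokenizing the empty string yields only the special tokens, e.g. [49406, 49407]
# for CLIP, which is how SDTokenizer derives start_token and end_token.
print(empty)

vocab = tokenizer.get_vocab()                 # token string -> id
inv_vocab = {v: k for k, v in vocab.items()}  # id -> token string, as built in the diff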
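After this change, SD1Tokenizer builds its inner SDTokenizer directly from the embedding directory and stores it under an attribute named after the clip name ("clip_l" by default). A hedged usage sketch follows; the import path and the embeddings directory are assumptions for illustration, and tokenize_with_weights is only known from the diff to return a dict.

# Hypothetical usage of the simplified SD1Tokenizer constructor.
from modules.SD15.SDToken import SD1Tokenizer

sd1 = SD1Tokenizer(embedding_directory="models/embeddings")  # hypothetical path
inner = getattr(sd1, sd1.clip)  # the SDTokenizer instance stored as "clip_l"
tokens = sd1.tokenize_with_weights("a photo of a cat", return_word_ids=False)
print(type(tokens))             # a dict, per the annotated return type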