Spaces:

kinouchi
/

image_vector_search

Runtime error

image_vector_search / japanese_clip /tokenizer.py

kittchy

[ADD] image_vector_search

30099ac unverified about 1 year ago

2.2 kB

	# coding=utf-8
	# Copyright 2022 rinna Co., Ltd.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from typing import Union, List
	import torch
	from transformers import T5Tokenizer


	def load_tokenizer():
	    """Return the T5Tokenizer for the ``rinna/japanese-roberta-base`` checkpoint.

	    Model card: https://huggingface.co/rinna/japanese-roberta-base
	    """
	    tok = T5Tokenizer.from_pretrained("rinna/japanese-roberta-base")
	    # Forced by hand: per the original author, tokenizer config loading has
	    # a bug that leaves this flag unset.
	    tok.do_lower_case = True
	    return tok


	def tokenize(
	    texts: Union[str, List[str]],
	    tokenizer: Union["T5Tokenizer", None] = None,
	    max_seq_len: int = 77,
	    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
	):
	    """Encode text into fixed-length model inputs with a leading CLS token.

	    Counterpart of the ``tokenize`` helper in the original CLIP code:
	    https://github.com/openai/CLIP/blob/main/clip/clip.py#L195

	    Args:
	        texts: A single string or a sequence of strings to encode.
	        tokenizer: Tokenizer to use; when ``None`` one is created with
	            ``load_tokenizer()``.
	        max_seq_len: Length of every returned row, CLS token included.
	        device: Device the returned tensors are moved to. The default is
	            picked once, at import time, from ``torch.cuda.is_available()``.

	    Returns:
	        A dict with ``"input_ids"``, ``"attention_mask"`` and
	        ``"position_ids"``, each a ``torch.long`` tensor with one row per
	        text. An empty ``texts`` yields tensors of shape
	        ``(0, max_seq_len)``.
	    """
	    # Normalise to a concrete list so a lone string, a tuple or a generator
	    # all behave the same and len() is always available.
	    texts = [texts] if isinstance(texts, str) else list(texts)

	    if not texts:
	        # Nothing to encode: skip the tokenizer (and the cost of loading it)
	        # and return well-formed empty batches instead of raising.
	        return {
	            key: torch.zeros((0, max_seq_len), dtype=torch.long).to(device)
	            for key in ("input_ids", "attention_mask", "position_ids")
	        }

	    if tokenizer is None:
	        tokenizer = load_tokenizer()

	    # Reserve one slot (max_seq_len - 1) for the CLS token prepended below.
	    inputs = tokenizer(
	        texts,
	        max_length=max_seq_len - 1,
	        padding="max_length",
	        truncation=True,
	        add_special_tokens=False,
	    )

	    # Put the CLS token in first place and mark it as attended.
	    cls_id = tokenizer.cls_token_id
	    input_ids = torch.tensor(
	        [[cls_id] + ids for ids in inputs["input_ids"]], dtype=torch.long
	    )
	    attention_mask = torch.tensor(
	        [[1] + mask for mask in inputs["attention_mask"]], dtype=torch.long
	    )
	    # Positions 0..L-1, one independent row per text.
	    position_ids = torch.arange(input_ids.shape[1], dtype=torch.long).repeat(
	        len(texts), 1
	    )

	    return {
	        "input_ids": input_ids.to(device),
	        "attention_mask": attention_mask.to(device),
	        "position_ids": position_ids.to(device),
	    }