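"""Model-serving worker: loads the language models assigned to this process (GPT-2,
GPT, XLNet, Transfo-XL, CTRL and PPLM) onto its GPUs and exposes `generate_completion`,
which samples batched continuations for a prompt."""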
import time

from transformers import (GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          XLNetLMHeadModel, XLNetTokenizer,
                          TransfoXLLMHeadModel, TransfoXLTokenizer,
                          CTRLLMHeadModel, CTRLTokenizer)

from Utils import forward, create_context
import torch
import torch.nn.functional as F
from math import floor
import requests
import json
import os

from PPLM import run_model as run_pplm, DISCRIMINATOR_MODELS_PARAMS
from GPUHandler import GPUHandler
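# Priming text passed to `create_context` below; presumably used to give models that
# struggle with very short prompts (XLNet and Transfo-XL in particular) a longer context
# to condition on. The trailing <eod> </s> <eos> markers are special end-of-text tokens.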
PADDING_TEXT = """With eyes for the most part downcast and, if ever they lighted on a fellow creature, at once and
furtively averted, Bernard hastened across the roof. He was like a man pursued, but pursued by enemies he does not
wish to see, lest they should seem more hostile even than he had supposed, and he himself be made to feel guiltier
and even more helplessly alone. That horrible Benito Hoover!’ And yet the man had meant well enough. Which only made
it, in a way, much worse. Those who meant well behaved in the same way as those who meant badly. Even Lenina was making
him suffer. He remembered those weeks of timid indecision, during which he had looked and longed and despaired of ever
having the courage to ask her. Dared he face the risk of being humiliated by a contemptuous refusal? But if she were to
say yes, what rapture! Well, now she had said it and he was still wretched—wretched that she should have thought it
such a perfect afternoon for Obstacle Golf, that she should have trotted away to join Henry Foster, that she should
have found him funny for not wanting to talk of their most private affairs in public. Wretched, in a word, because she
had behaved as any healthy and virtuous English girl ought to behave and not in some other, abnormal, extraordinary
way. <eod> </s> <eos>"""
try:
    PID = int(requests.get(url="http://localhost:3000").json())
    N_GPU = torch.cuda.device_count()
    GPU_PER_WORKER = int(os.getenv("GPU_PER_WORKER"))
    GPU_IDS = list(range(PID * GPU_PER_WORKER, (PID + 1) * GPU_PER_WORKER))
    print("Successfully initialized thread with id {}. The assigned GPU ids are: {}".format(PID, GPU_IDS))

    with open(os.getenv("FILE")) as json_file:
        data = json.load(json_file)
        models = data["models_to_load"]
        cached_models = data.get("cached_models")
except (requests.exceptions.ConnectionError, TypeError):
    # ConnectionError: the PID server is unreachable; TypeError is raised e.g. when
    # GPU_PER_WORKER or FILE is not set in the environment.
    if __name__ == "__main__":
        # Development fallback when running this module directly.
        PID = 0
        N_GPU = torch.cuda.device_count()
        GPU_PER_WORKER = 1
        GPU_IDS = [0]
        print("Successfully initialized development thread with id {}. The assigned GPU ids are: {}".format(PID, GPU_IDS))
        models = ["pplm"]
        cached_models = None
    else:
        raise requests.exceptions.ConnectionError("The PID server is not running.")
handler = GPUHandler(int(), models, GPU_IDS, cached_models)

models = {}
for gpu in handler.gpus:
    for model in gpu.models:
        model_name = model["identifier"]
        print(f"Loading {model_name} model and tokenizer")
        models[model_name] = model

        if model.get("cached_path"):
            print("Loading {} from local path.".format(model_name))
            model_checkpoint_path = model["cached_path"]
        else:
            model_checkpoint_path = model["checkpoint"]

        if "configuration_options" in models[model_name]:
            configuration_options = models[model_name]["configuration_options"]
            print("Specific configuration options", configuration_options["options"])

            config = configuration_options["config"].from_pretrained(model_checkpoint_path)
            for option_key, option_value in configuration_options["options"].items():
                setattr(config, option_key, option_value)

            models[model_name]["model"] = models[model_name]["model"].from_pretrained(model_checkpoint_path, config=config).to(models[model_name]["device"])
        else:
            models[model_name]["model"] = models[model_name]["model"].from_pretrained(model_checkpoint_path).to(models[model_name]["device"])

        models[model_name]["tokenizer"] = models[model_name]["tokenizer"].from_pretrained(models[model_name]["checkpoint"])
        models[model_name]["model"].eval()

print("All models successfully loaded.")
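# At this point `models` maps each identifier (e.g. "gpt2/small" or "pplm") to a dict
# holding, among other entries, its instantiated "model", "tokenizer" and "device",
# which `generate_completion` looks up below.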
def top_k_top_p_filtering(batch_logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """
    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.
    :param batch_logits: logits output by the model, one row per sequence in the batch
    :param top_k: >0: keep only the top k tokens with highest probability (top-k filtering).
    :param top_p: >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
    :param filter_value: value assigned to the logits of filtered-out tokens (default: -inf)
    :return: a top_p/top_k filtered tensor of logits
    """
    filtered_rows = []
    for i in range(batch_logits.size(0)):
        logits = batch_logits[i]
        assert logits.dim() == 1  # Each row of the batch is filtered independently.
        top_k = min(top_k, logits.size(-1))  # Safety check
        if top_k and top_k > 0:
            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = filter_value
        if top_p and top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[indices_to_remove] = filter_value
        filtered_rows.append(logits.unsqueeze(0))
    return torch.cat(filtered_rows, dim=0)
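# Illustrative example (hypothetical values): for a row of logits [2.0, 1.0, 0.5, -1.0]
# and top_k=2, every entry except the two largest is set to -inf, so sampling can only
# pick those two tokens. With top_p=0.9, tokens are kept in order of decreasing
# probability until their cumulative softmax mass first exceeds 0.9 (at least one token
# is always kept).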
def check_tensor_for_eot(output, eot_token, dot_token):
    """Return True if every sequence in the batch contains either the end-of-text token or the dot token."""
    return all([(eot_token in output_item or dot_token in output_item) for output_item in output.tolist()])


def truncate_after_eot(output, eot_tokens):
    """Truncate each generated sequence after the first end-of-text (or dot) token it contains."""
    result = []
    for i in range(output.size(0)):
        if any([eot_token in output[i] for eot_token in eot_tokens]):
            item = output[i].tolist()
            index = find_min_value_in_array(item, eot_tokens)
            result.append(item[:index] + [eot_tokens[0]])
        else:
            result.append(output[i].tolist())
    return result


def find_min_value_in_array(array, values):
    """Return the index of the first occurrence in `array` of any of `values`."""
    indexes = []
    for value in values:
        try:
            indexes.append(array.index(value))
        except ValueError:
            pass  # Value not present in the array
    return min(indexes)
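# Example with made-up token ids: truncate_after_eot(torch.tensor([[5, 7, 13, 9]]), [13])
# returns [[5, 7, 13]]: everything after the first end-of-text token is dropped and the
# sequence is re-closed with eot_tokens[0].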
# @lru_cache()
def generate_completion(
        raw_text,
        length=-1,
        max_time=-1,
        model_name="small",
        temperature=1,
        max_tokens=256,
        top_p=0.0,
        top_k=0,
        batch_size=3,
        repetition_penalty=1.2,
        # PPLM
        bag_of_words_or_discrim=None,
        stepsize=0.02,
        gamma=1.5,
        num_iterations=3,
        window_length=5,
        kl_scale=0.01,
        gm_scale=0.95,
        use_sampling=False
):
    start = time.time()

    try:
        print("Running with model", model_name)
        model, tokenizer, device = models[model_name]["model"], models[model_name]["tokenizer"], models[model_name]["device"]
    except KeyError:
        print("Error. Defaulting to small model.")
        model, tokenizer, device = models["gpt2/small"]["model"], models["gpt2/small"]["tokenizer"], models["gpt2/small"]["device"]

    if "pplm" in model_name:
        if ":" in bag_of_words_or_discrim:
            discrim, discrim_label = bag_of_words_or_discrim.split(":")
            discrim_label = DISCRIMINATOR_MODELS_PARAMS[discrim]["class_id"][int(discrim_label)]
            bag_of_words = None

            # Hardcoded parameters for the discriminator
            gamma = 1.0

            print("Running PPLM with discriminator:", discrim, discrim_label)
        else:
            bag_of_words = bag_of_words_or_discrim
            discrim = None
            discrim_label = None

            # Hardcoded parameters for the BOW
            gamma = 1.5
            window_length = 5

            print("Running PPLM with bag of words:", bag_of_words)

        print("kl", kl_scale, "gm", gm_scale, "sampling", use_sampling, "window length", window_length, "gamma", gamma, "temperature", temperature)

        return run_pplm(
            model, tokenizer, device, raw_text,
            max_time=max_time,
            discrim=discrim,
            discrim_label=discrim_label,
            num_samples=batch_size,
            bag_of_words=bag_of_words,
            length=length,
            temperature=temperature,
            top_k=top_k,
            stepsize=stepsize,
            gamma=gamma,
            num_iterations=num_iterations,
            window_length=window_length,
            kl_scale=kl_scale,
            gm_scale=gm_scale,
            use_sampling=use_sampling
        )
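    # Everything below only runs for the non-PPLM models; the PPLM branch above returns early.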
    context_tokens, eot_token, dot_token = create_context(model_name, tokenizer, raw_text, PADDING_TEXT, max_tokens=max_tokens)

    if length == -1:
        length = 100

    context = torch.tensor(context_tokens, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
    prev = context
    past = None

    with torch.no_grad():
        for _ in range(length):
            try:
                output = forward(model_name, model, prev, past, device=device)
            except RuntimeError:
                return "ERROR 500: OOM. TransfoXL asked for too much memory."
            logits, past = output if len(output) > 2 else output[0], None
            logits = logits[:, -1, :] / max(temperature, 0.001)

            if "ctrl" in model_name:
                # CTRL repetition penalty: divide the logits of tokens already present in the sequence.
                for i in range(batch_size):
                    for j in set(prev[i].tolist()):
                        logits[i, j] /= repetition_penalty

            logits = top_k_top_p_filtering(logits, top_p=top_p, top_k=top_k)
            probs = F.softmax(logits, dim=-1)
            token = torch.multinomial(probs, num_samples=1)
            prev = torch.cat((prev, token), dim=1)

            # Stop early once every sequence contains an end-of-text (or dot) token,
            # or when the time budget is about to be exceeded.
            if check_tensor_for_eot(prev[:, len(context_tokens):], eot_token, dot_token) or (max_time != -1 and time.time() - start + 0.1 > max_time):
                break

    out = prev[:, len(context_tokens):]

    # Remove the tokens following the end-of-text tokens.
    out = truncate_after_eot(out, list(filter(lambda t: t is not None, [dot_token, eot_token])))

    end = time.time()

    # Remove empty sentences and duplicates
    generations = list(set(filter(lambda x: len(x) > 0, [" " + tokenizer.decode(single_generation).strip() for single_generation in out])))
    sentences = [
        {"value": generations[i], "time": end - start, "tokens": len(out[i])} for i in range(len(generations))
    ]
    # print(end - start, [len(out[i]) for i in range(len(generations))])
    return sentences
if __name__ == "__main__":
    print(generate_completion(
        "My dog died",
        length=30, model_name="pplm", batch_size=3, top_k=10, top_p=0.9,
        bag_of_words_or_discrim="sentiment:2",
        stepsize=0.03,
        gamma=1,
        num_iterations=3,
        window_length=5,
        kl_scale=0.01,
        gm_scale=0.95,
        max_time=-1,
        use_sampling=False
    ))
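    # A plain language-model call would look similar (illustrative only; it assumes a
    # "gpt2/small" entry was listed in this worker's model config):
    # print(generate_completion("My dog died", length=30, model_name="gpt2/small",
    #                           batch_size=3, top_k=40, top_p=0.9))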