'''
Simple single-turn chat demo for ExpertLLaMA, adapted from the gradio
chatbot guide (https://gradio.app/creating-a-chatbot/). The full model is
recovered by applying the released delta weights to the LLaMA-7B base
weights before serving.
'''
import time

import gradio as gr
import torch
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM


def apply_delta(base_model_path, target_model_path, delta_path):
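    """Recover the full model by adding the released delta weights to the
    base LLaMA weights; returns the patched model and the delta tokenizer."""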
| print(f"Loading the delta weights from {delta_path}") | |
| delta_tokenizer = LlamaTokenizer.from_pretrained(delta_path, use_fast=False) | |
| delta = LlamaForCausalLM.from_pretrained( | |
| delta_path, low_cpu_mem_usage=True, torch_dtype=torch.float16 | |
| ) | |
| print(f"Loading the base model from {base_model_path}") | |
| base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False) | |
| base = LlamaForCausalLM.from_pretrained( | |
| base_model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16 | |
| ) | |
    # Following the Alpaca training recipe, the fine-tuned model added newly
    # initialized special tokens, so the base vocabulary must be extended to
    # match the delta checkpoint before the weights can be summed.
    DEFAULT_PAD_TOKEN = "[PAD]"
    DEFAULT_EOS_TOKEN = "</s>"
    DEFAULT_BOS_TOKEN = "<s>"
    DEFAULT_UNK_TOKEN = "<unk>"
    special_tokens_dict = {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    num_new_tokens = base_tokenizer.add_special_tokens(special_tokens_dict)
    base.resize_token_embeddings(len(base_tokenizer))
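    # Zero-initialize the embedding rows for the new tokens; their final
    # values come entirely from the delta weights added below.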
    input_embeddings = base.get_input_embeddings().weight.data
    output_embeddings = base.get_output_embeddings().weight.data
    input_embeddings[-num_new_tokens:] = 0
    output_embeddings[-num_new_tokens:] = 0
| print("Applying the delta") | |
| target_weights = {} | |
| for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): | |
| assert name in delta.state_dict() | |
| param.data += delta.state_dict()[name] | |
| target_weights[name] = param.data | |
| print(f"Saving the target model to {target_model_path}") | |
| base.load_state_dict(target_weights) | |
    # Optionally persist the recovered model for future runs:
    # base.save_pretrained(target_model_path)
    # delta_tokenizer.save_pretrained(target_model_path)
    return base, delta_tokenizer


base_weights = 'decapoda-research/llama-7b-hf'
target_weights = 'expertllama'  # local path
delta_weights = 'OFA-Sys/expertllama-7b-delta'
model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)
# Alternatively, load a previously saved copy from the local path:
# tokenizer = LlamaTokenizer.from_pretrained(target_weights)
# model = LlamaForCausalLM.from_pretrained(target_weights, torch_dtype=torch.float16, low_cpu_mem_usage=True)
# The model must live on the GPU, since respond() moves its inputs to CUDA.
model.cuda()
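
# Minimal single-turn chat UI: a chat window, a textbox for the user
# message, and a button that clears the conversation.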
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        # Prompt wrapper; only single-turn exchanges are supported for now.
        prompt = f"### Human:\n{message}\n\n### Assistant:\n"
        batch = tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False,
        )
        batch = {k: v.cuda() for k, v in batch.items()}
        # Enable sampling so the temperature setting actually takes effect.
        generated = model.generate(
            batch["input_ids"], max_length=1024, do_sample=True, temperature=0.8
        )
        # Drop the final (EOS) token and keep only the assistant's reply.
        bot_message = tokenizer.decode(generated[0][:-1]).split("### Assistant:\n", 1)[1]
        chat_history.append((message, bot_message))
        time.sleep(1)
        return "", chat_history
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()