'''
CREDIT:
script adapted from [alpaca](https://huggingface.co/spaces/tloen/alpaca-lora/blob/main/app.py).
'''
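# Gradio demo for ExpertLLaMA: at startup it reconstructs the chat model by
# adding the released delta weights to the base LLaMA-7B weights, then serves
# a single-turn instruction-following UI.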
import torch
import gradio as gr
from tqdm import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
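
# The full ExpertLLaMA weights are not distributed directly; only a delta
# against the base LLaMA weights is released, so the target model has to be
# reconstructed locally before it can be served.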
def apply_delta(base_model_path, target_model_path, delta_path):
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = LlamaTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = LlamaForCausalLM.from_pretrained(
        delta_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    print(f"Loading the base model from {base_model_path}")
    base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False)
    base = LlamaForCausalLM.from_pretrained(
        base_model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    # Following the alpaca training recipe, the fine-tuned model added newly
    # initialized special tokens, so the base vocabulary must be extended to
    # match before the delta can be applied.
    DEFAULT_PAD_TOKEN = "[PAD]"
    DEFAULT_EOS_TOKEN = "</s>"
    DEFAULT_BOS_TOKEN = "<s>"
    DEFAULT_UNK_TOKEN = "<unk>"
    special_tokens_dict = {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    num_new_tokens = base_tokenizer.add_special_tokens(special_tokens_dict)
    base.resize_token_embeddings(len(base_tokenizer))

    # Zero the embedding rows for the new tokens: the delta already carries
    # their trained values, so base + delta reproduces them exactly.
    input_embeddings = base.get_input_embeddings().weight.data
    output_embeddings = base.get_output_embeddings().weight.data
    input_embeddings[-num_new_tokens:] = 0
    output_embeddings[-num_new_tokens:] = 0

    print("Applying the delta")
    target_weights = {}
    delta_state_dict = delta.state_dict()
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta_state_dict
        param.data += delta_state_dict[name]
        target_weights[name] = param.data

    # The reconstructed weights are kept in memory and served directly;
    # persisting them to target_model_path is left disabled in this Space.
    base.load_state_dict(target_weights)
    # base.save_pretrained(target_model_path)
    # delta_tokenizer.save_pretrained(target_model_path)

    del delta, delta_state_dict  # release the delta weights
    return base, delta_tokenizer
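
# Note: applying the delta holds two fp16 7B-parameter models in memory at
# once, which needs roughly 28 GB of RAM (2 bytes/param x ~7B params x 2).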
base_weights = 'decapoda-research/llama-7b-hf'
target_weights = 'expertllama'  # local path
delta_weights = 'OFA-Sys/expertllama-7b-delta'
model, tokenizer = apply_delta(base_weights, target_weights, delta_weights)

# Cast to full precision for inference (no GPU is requested here, and fp16
# ops are poorly supported on CPU).
model = model.to(torch.float)
if torch.__version__ >= "2":
    model = torch.compile(model)
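
# The string comparison above distinguishes "1.x" from "2.x" version tags but
# is fragile in general; packaging.version.parse would be a more robust check.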
def respond(
    instruction,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    # Prompt wrapper; only single-turn conversations are supported for now.
    prompt = f"### Human:\n{instruction}\n\n### Assistant:\n"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    )
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs["input_ids"],
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # Trim the last two tokens (trailing end-of-sequence markers) and keep
    # only the text after the assistant tag.
    response = tokenizer.decode(generation_output.sequences[0][:-2]).split("### Assistant:\n", 1)[1]
    return response
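
# Example call (a sketch, assuming the weights above loaded successfully):
#   print(respond("What is the difference between supervised and unsupervised learning?"))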
g = gr.Interface(
    fn=respond,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction"
        ),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=768, step=1, value=512, label="Max tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=8,
            label="Output",
        )
    ],
    title="ExpertLLaMA",
    description="ExpertLLaMA is an open-source chatbot trained on expert-like data produced with GPT-3.5; see our [project repo](https://github.com/OFA-Sys/ExpertLLaMA) for details.",
)
g.queue(concurrency_count=1)  # serialize requests; one generation runs at a time
g.launch()