Update app.py
app.py
CHANGED
@@ -18,6 +18,11 @@ import filelock
 import glob
 import json
 import time
+from gradio.routes import Request
+from gradio.utils import SyncToAsyncIterator, async_iteration
+from gradio.helpers import special_args
+import anyio
+from typing import AsyncGenerator, Callable, Literal, Union, cast

 from gradio_client.documentation import document, set_documentation_group

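The five added imports are gradio internals rather than public API; they are consumed by the patched `ChatInterface._stream_fn` added further down in this diff. A minimal defensive sketch (not part of this commit, assuming a gradio 3.x release where these modules exist) would pin the dependency at import time:

    # Hypothetical guard, not in this commit: fail early with a clear message if
    # the gradio internals used by the patched ChatInterface methods are missing.
    try:
        from gradio.routes import Request
        from gradio.utils import SyncToAsyncIterator, async_iteration
        from gradio.helpers import special_args
    except ImportError as exc:
        raise RuntimeError("app.py patches gradio.ChatInterface internals; "
                           "a gradio 3.x release that provides them is required") from exc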
@@ -278,455 +283,6 @@ path_markdown = """



-
-def custom_hf_model_weights_iterator(
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-    use_np_cache: bool = False,
-) -> Iterator[Tuple[str, torch.Tensor]]:
-    # ! if use vllm==0.1.4, use this to augment hf_model_weights_iterator loader
-    from vllm.model_executor.weight_utils import Disabledtqdm
-    # Prepare file lock directory to prevent multiple processes from
-    # downloading the same model weights at the same time.
-    lock_dir = cache_dir if cache_dir is not None else "/tmp"
-    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
-    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
-
-    # Download model weights from huggingface.
-    is_local = os.path.isdir(model_name_or_path)
-    if not is_local:
-        with lock:
-            hf_folder = snapshot_download(model_name_or_path,
-                                          allow_patterns="*.bin",
-                                          cache_dir=cache_dir,
-                                          local_files_only=True,
-                                          tqdm_class=Disabledtqdm)
-    else:
-        hf_folder = model_name_or_path
-
-    hf_bin_files = [
-        x for x in glob.glob(os.path.join(hf_folder, "*model*.bin"))
-        if not x.endswith("training_args.bin")
-    ]
-    hf_safetensors_files = [
-        x for x in glob.glob(os.path.join(hf_folder, "*model*.safetensors"))
-        if not x.endswith("training_args.bin")
-    ]
-
-    if use_np_cache:
-        # Convert the model weights from torch tensors to numpy arrays for
-        # faster loading.
-        np_folder = os.path.join(hf_folder, "np")
-        os.makedirs(np_folder, exist_ok=True)
-        weight_names_file = os.path.join(np_folder, "weight_names.json")
-        with lock:
-            if not os.path.exists(weight_names_file):
-                weight_names = []
-                for bin_file in hf_bin_files:
-                    state = torch.load(bin_file, map_location="cpu")
-                    for name, param in state.items():
-                        param_path = os.path.join(np_folder, name)
-                        with open(param_path, "wb") as f:
-                            np.save(f, param.cpu().detach().numpy())
-                        weight_names.append(name)
-                with open(weight_names_file, "w") as f:
-                    json.dump(weight_names, f)
-
-        with open(weight_names_file, "r") as f:
-            weight_names = json.load(f)
-
-        for name in weight_names:
-            param_path = os.path.join(np_folder, name)
-            with open(param_path, "rb") as f:
-                param = np.load(f)
-            yield name, torch.from_numpy(param)
-    else:
-        if len(hf_bin_files) > 0:
-            print(F'Load bin files: {hf_bin_files}')
-            for bin_file in hf_bin_files:
-                state = torch.load(bin_file, map_location="cpu")
-                for name, param in state.items():
-                    yield name, param
-                del state
-                torch.cuda.empty_cache()
-        elif len(hf_safetensors_files) > 0:
-            print(F'Load safetensor files: {hf_safetensors_files}')
-            from safetensors.torch import load_file
-            for safe_file in hf_safetensors_files:
-                # state = torch.load(bin_file, map_location="cpu")
-                state = load_file(safe_file)
-                for name, param in state.items():
-                    yield name, param
-                del state
-                torch.cuda.empty_cache()
-        else:
-            raise ValueError(f'no files available either bin or safe')
-
-
-def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
-    """convert PySafeSlice object from safetensors to torch.Tensor
-
-    PySafeSlice object supports indexing, which is done before loading the
-    actual tensor and can reduce the amount of memory being read into the
-    memory. However, it does not support more advanced functionalities
-    like `.view()` or `.t()`. Therefore, if we need to modify the loaded
-    tensor with these more complicated operators, we need to convert to
-    tensor first.
-    """
-    if not isinstance(x, torch.Tensor):
-        x = x[:]
-    return x
-
-
-def load_padded_tensor_parallel_vocab(
-    param: torch.Tensor,
-    loaded_weight: Any,  # `torch.Tensor` or `PySafeSlice`
-    tensor_model_parallel_rank: int,
-) -> None:
-    shard_size = param.shape[0]
-    start_idx = tensor_model_parallel_rank * shard_size
-    end_idx = (tensor_model_parallel_rank + 1) * shard_size
-    loaded_weight = loaded_weight[start_idx:end_idx]
-    loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-    param[:loaded_weight.shape[0]].copy_(loaded_weight)
-
-
-def llama_load_weights(
-    self,
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-    use_np_cache: bool = False,
-    load_format: str = "auto",
-    revision: Optional[str] = None
-):
-    # if use vllm==0.1.4
-    from vllm.model_executor.weight_utils import (
-        load_tensor_parallel_weights
-    )
-    from vllm.model_executor.parallel_utils.parallel_state import (
-        get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-    tp_size = get_tensor_model_parallel_world_size()
-    tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-
-    q_proj_shard_size = (self.config.hidden_size // tp_size)
-    kv_proj_shard_size = (self.config.hidden_size //
-                          self.config.num_attention_heads *
-                          getattr(self.config, "num_key_value_heads", self.config.num_attention_heads) // tp_size)
-    attention_weight_specs = [
-        # (weight_name, shard_size, offset)
-        ("q_proj", q_proj_shard_size, 0),
-        ("k_proj", kv_proj_shard_size, q_proj_shard_size),
-        ("v_proj", kv_proj_shard_size,
-         q_proj_shard_size + kv_proj_shard_size),
-    ]
-    state_dict = self.state_dict()
-    need_to_load = len(state_dict)
-    loaded = 0
-    iterator = custom_hf_model_weights_iterator(model_name_or_path, cache_dir, use_np_cache)
-
-    for name, loaded_weight in iterator:
-        if "rotary_emb.inv_freq" in name:
-            continue
-
-        if "embed_tokens" in name or "lm_head" in name:
-            param = state_dict[name]
-            # Consider padding in the vocab size.
-            padded_vocab_size = (param.shape[0] * tp_size)
-            # num_extra_rows = padded_vocab_size - self.config.vocab_size
-            num_extra_rows = padded_vocab_size - loaded_weight.size(0)
-            load_size = loaded_weight.size()
-            extra_rows = torch.empty(num_extra_rows,
-                                     loaded_weight.shape[1])
-            extra_rows = extra_rows.to(loaded_weight)
-            loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
-            if num_extra_rows > 0:
-                print(f'Add empty to {num_extra_rows} extra row for {name}')
-            print(f'Load: {name} | {padded_vocab_size=} | {self.config.vocab_size=} | {num_extra_rows=} | {param.size()=} | {loaded_weight.size()=} | {load_size=}')
-
-        is_attention_weight = False
-        for weight_name, shard_size, offset in attention_weight_specs:
-            if weight_name not in name or "qkv_proj" in name:
-                continue
-            param = state_dict[name.replace(weight_name, "qkv_proj")]
-
-            loaded_weight = loaded_weight[
-                shard_size * tensor_model_parallel_rank:shard_size *
-                (tensor_model_parallel_rank + 1)]
-            param_slice = param.data[offset:offset + shard_size]
-            assert param_slice.shape == loaded_weight.shape
-
-            param_slice.copy_(loaded_weight)
-            loaded += 1.0 / 3
-            is_attention_weight = True
-            break
-        if is_attention_weight:
-            continue
-
-        # ! qkv_proj is sharded differently if concatenated into qkv
-        # qkv: qqqq kkkk vvvv
-        # lweight: qq0qq1 kk0kk1 vv0vv1
-        # q_shard_size: hidden_size // tp_size = qq
-        # qkv_s0: qq0_kk0_vv0
-        # qkv_s1: qq1_kk1_vv1
-        if "qkv_proj" in name:
-            param = state_dict[name]
-            # loaded_weight
-            qsize = self.config.hidden_size
-            kvsize = self.config.hidden_size // self.config.num_attention_heads * getattr(self.config, "num_key_value_heads", self.config.num_attention_heads)
-            q_offsets = (
-                q_proj_shard_size * tensor_model_parallel_rank,
-                q_proj_shard_size * (tensor_model_parallel_rank + 1)
-            )
-            k_offsets = (
-                qsize + kv_proj_shard_size * tensor_model_parallel_rank,
-                qsize + kv_proj_shard_size * (tensor_model_parallel_rank + 1)
-            )
-            v_offsets = (
-                qsize + kvsize + kv_proj_shard_size * tensor_model_parallel_rank,
-                qsize + kvsize + kv_proj_shard_size * (tensor_model_parallel_rank + 1)
-            )
-            _loaded_weight = torch.cat(
-                [
-                    loaded_weight[q_offsets[0]:q_offsets[1]],
-                    loaded_weight[k_offsets[0]:k_offsets[1]],
-                    loaded_weight[v_offsets[0]:v_offsets[1]],
-                ], 0
-            )
-            assert param.shape == _loaded_weight.shape, f'{param.shape=} != {_loaded_weight.shape=}'
-            param.data.copy_(_loaded_weight)
-            loaded += 1.0
-            is_attention_weight = True
-        if is_attention_weight:
-            continue
-
-
-        is_gate_up_weight = False
-        for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
-            if weight_name not in name or "gate_up_proj" in name:
-                continue
-            param = state_dict[name.replace(weight_name, "gate_up_proj")]
-            shard_size = param.shape[0] // 2
-            loaded_weight = loaded_weight[
-                shard_size * tensor_model_parallel_rank:shard_size *
-                (tensor_model_parallel_rank + 1)]
-            param_slice = param.data[shard_size * stride_id:shard_size *
-                                     (stride_id + 1)]
-            assert param_slice.shape == loaded_weight.shape
-            param_slice.copy_(loaded_weight)
-            loaded += 1.0 / 2
-            is_gate_up_weight = True
-            break
-        if is_gate_up_weight:
-            continue
-
-        if "gate_up_proj" in name:
-            param = state_dict[name]
-            shard_size = param.shape[0] // 2
-            intermediate_size = self.config.intermediate_size
-            g_offsets = (
-                shard_size * tensor_model_parallel_rank,
-                shard_size * (tensor_model_parallel_rank + 1)
-            )
-            u_offsets = (
-                intermediate_size + shard_size * tensor_model_parallel_rank,
-                intermediate_size + shard_size * (tensor_model_parallel_rank + 1)
-            )
-            _loaded_weight = torch.cat(
-                [
-                    loaded_weight[g_offsets[0]:g_offsets[1]],
-                    loaded_weight[u_offsets[0]:u_offsets[1]],
-                ], 0
-            )
-            assert param.shape == _loaded_weight.shape
-            param.data.copy_(_loaded_weight)
-            loaded += 1.0
-            is_gate_up_weight = True
-        if is_gate_up_weight:
-            continue
-
-
-        param = state_dict[name]
-        load_tensor_parallel_weights(param, loaded_weight, name,
-                                     self._column_parallel_weights,
-                                     self._row_parallel_weights,
-                                     tensor_model_parallel_rank)
-        loaded += 1
-
-    if np.abs(loaded - need_to_load) < 0.01:
-        print(f'WARNING: only {loaded} params loaded out of {need_to_load}')
-    else:
-        print(f'Loaded all {loaded} params loaded out of {need_to_load}')
-
-
-def new_llama_load_weights(
-    self,
-    model_name_or_path: str,
-    cache_dir: Optional[str] = None,
-    load_format: str = "auto",
-    revision: Optional[str] = None
-):
-    # If use newest vllm, not been thoroughly tested yet.
-    from vllm.model_executor.weight_utils import (
-        load_tensor_parallel_weights, hf_model_weights_iterator
-    )
-    from vllm.model_executor.parallel_utils.parallel_state import (
-        get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-
-    if self.quant_config is None:
-        weight_suffixes = ["weight"]
-    else:
-        weight_suffixes = self.quant_config.get_tp_tensor_names()
-
-    column_parallel_weights: List[str] = []
-    for layer in self._column_parallel_layers:
-        for suffix in weight_suffixes:
-            column_parallel_weights.append(f"{layer}.{suffix}")
-    row_parallel_weights: List[str] = []
-    for layer in self._row_parallel_layers:
-        for suffix in weight_suffixes:
-            row_parallel_weights.append(f"{layer}.{suffix}")
-
-    tp_size = get_tensor_model_parallel_world_size()
-    tp_rank = get_tensor_model_parallel_rank()
-    assert tp_size == 1, f'tensorparallel >=2 not allowed. {tp_size}'
-    q_proj_shard_size = (self.config.hidden_size // tp_size)
-    num_kv_heads_replicas = max(1,
-                                tp_size // self.config.num_key_value_heads)
-    num_kv_heads_per_gpu = max(1,
-                               self.config.num_key_value_heads // tp_size)
-    kv_proj_shard_size = (self.config.hidden_size //
-                          self.config.num_attention_heads *
-                          num_kv_heads_per_gpu)
-    attention_weight_specs = [
-        # (weight_name, shard_size, offset)
-        ("q_proj", q_proj_shard_size, 0),
-        ("k_proj", kv_proj_shard_size, q_proj_shard_size),
-        ("v_proj", kv_proj_shard_size,
-         q_proj_shard_size + kv_proj_shard_size),
-    ]
-    state_dict = self.state_dict()
-    need_to_load = len(state_dict)
-    loaded = 0
-
-    for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision):
-        if "rotary_emb.inv_freq" in name:
-            continue
-
-        is_packed = False
-        is_transposed = False
-        if self.quant_config is not None:
-            is_packed = self.quant_config.is_packed(name)
-            is_transposed = self.quant_config.is_transposed(name)
-        if is_transposed:
-            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
-            loaded_weight = loaded_weight.T
-
-        is_attention_weight = False
-        for weight_name, shard_size, offset in attention_weight_specs:
-            if weight_name not in name or "qkv_proj" in name:
-                continue
-            param = state_dict[name.replace(weight_name, "qkv_proj")]
-            if is_transposed:
-                param = param.T
-
-            if is_packed:
-                shard_size //= self.quant_config.pack_factor
-                offset //= self.quant_config.pack_factor
-
-            if weight_name in ["k_proj", "v_proj"]:
-                shard_id = tp_rank // num_kv_heads_replicas
-            else:
-                shard_id = tp_rank
-            loaded_weight = loaded_weight[shard_size *
-                                          shard_id:shard_size *
-                                          (shard_id + 1)]
-            param_slice = param.data[offset:offset + shard_size]
-            assert param_slice.shape == loaded_weight.shape
-
-            param_slice.copy_(loaded_weight)
-            loaded += 1.0 / 3
-            is_attention_weight = True
-            break
-        if is_attention_weight:
-            continue
-
-        # TODO: need to figure out to do sharding with qkv_proj fused
-
-        is_gate_up_weight = False
-        for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
-            if weight_name not in name or "gate_up_proj" in name:
-                continue
-            param = state_dict[name.replace(weight_name, "gate_up_proj")]
-            if is_transposed:
-                param = param.T
-
-            shard_size = param.shape[0] // 2
-            loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
-                                          (tp_rank + 1)]
-            param_slice = param.data[shard_size * stride_id:shard_size *
-                                     (stride_id + 1)]
-            assert param_slice.shape == loaded_weight.shape
-            param_slice.copy_(loaded_weight)
-            loaded += 1.0 / 2
-            is_gate_up_weight = True
-            break
-        if is_gate_up_weight:
-            continue
-
-        # TODO: need to figure out to do sharding with gate_up_proj fused
-
-        param = state_dict[name]
-        if is_transposed:
-            param = param.T
-
-        if "embed_tokens" in name or "lm_head" in name:
-            load_padded_tensor_parallel_vocab(param, loaded_weight,
-                                              tp_rank)
-            loaded += 1
-            continue
-
-        load_tensor_parallel_weights(param, loaded_weight, name,
-                                     column_parallel_weights,
-                                     row_parallel_weights, tp_rank)
-        loaded += 1
-
-    if np.abs(loaded - need_to_load) < 0.01:
-        print(f'WARNING: only {loaded} params loaded out of {need_to_load}')
-    else:
-        print(f'Loaded all {loaded} params loaded out of {need_to_load}')
-
-
-# Reassign LlamaForCausalLM.load_weights with llama_load_weights
-if not DEBUG:
-
-    try:
-        import vllm
-        from vllm.model_executor.model_loader import _MODEL_REGISTRY
-        from vllm.model_executor.models import LlamaForCausalLM
-
-        _MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
-        if vllm.__version__ == "0.1.4":
-            LlamaForCausalLM.load_weights = llama_load_weights
-        else:
-            LlamaForCausalLM.load_weights = new_llama_load_weights
-
-        if DTYPE == "bfloat16":
-            try:
-                compute_capability = torch.cuda.get_device_capability()
-                if compute_capability[0] < 8:
-                    gpu_name = torch.cuda.get_device_name()
-                    print(
-                        "Bfloat16 is only supported on GPUs with compute capability "
-                        f"of at least 8.0. Your {gpu_name} GPU has compute capability "
-                        f"{compute_capability[0]}.{compute_capability[1]}. --> Move to FLOAT16")
-                    DTYPE = "float16"
-            except Exception as e:
-                print(f'Unable to obtain compute_capability: {e}')
-    except Exception as e:
-        print(f'Failing import and reconfigure VLLM: {str(e)}')
-
-
 # ! ==================================================================
 
 set_documentation_group("component")
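For context, the block removed above existed only to monkey-patch weight loading in vllm 0.1.4 (bin/safetensors shard discovery, tensor-parallel sharding of the fused qkv and gate_up projections, padded vocab rows). The core pattern it relied on, shown here as a minimal sketch with `my_load_weights` as a placeholder name and valid only for old vllm releases that exposed `_MODEL_REGISTRY`, was:

    # Sketch of the removed monkey-patch pattern (vllm 0.1.x internals; placeholder body).
    import vllm
    from vllm.model_executor.model_loader import _MODEL_REGISTRY
    from vllm.model_executor.models import LlamaForCausalLM

    def my_load_weights(self, model_name_or_path, cache_dir=None, **kwargs):
        ...  # custom iterator over .bin / .safetensors shards, copied into self.state_dict()

    _MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM   # alias the architecture name
    LlamaForCausalLM.load_weights = my_load_weights                # swap in the custom loader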
@@ -734,41 +290,6 @@ set_documentation_group("component")
 
 RES_PRINTED = False
 
-def llama_chat_sys_input_seq_constructor(text, sys_prompt=SYSTEM_PROMPT_1, bos_token=BOS_TOKEN, eos_token=EOS_TOKEN):
-    return f"{bos_token}{B_INST} {B_SYS} {sys_prompt} {E_SYS} {text} {E_INST}"
-
-
-def llama_chat_multiturn_sys_input_seq_constructor(
-    message: str,
-    history: List[Tuple[str, str]],
-    sys_prompt=SYSTEM_PROMPT_1,
-    bos_token=BOS_TOKEN,
-    eos_token=EOS_TOKEN,
-    include_end_instruct=True,
-):
-    """
-    ```
-    <bos>[INST] B_SYS SytemPrompt E_SYS Prompt [/INST] Answer <eos>
-    <bos>[INST] Prompt [/INST] Answer <eos>
-    <bos>[INST] Prompt [/INST]
-    ```
-    """
-    text = ''
-    end_instr = f" {E_INST}" if include_end_instruct else ""
-    for i, (prompt, res) in enumerate(history):
-        if i == 0:
-            text += f"{bos_token}{B_INST} {B_SYS} {sys_prompt} {E_SYS} {prompt}{end_instr}"
-        else:
-            text += f"{bos_token}{B_INST} {prompt}{end_instr}"
-
-        if res is not None:
-            text += f" {res} {eos_token} "
-    if len(history) == 0 or text.strip() == '':
-        text = f"{bos_token}{B_INST} {B_SYS} {sys_prompt} {E_SYS} {message}{end_instr}"
-    else:
-        text += f"{bos_token}{B_INST} {message}{end_instr}"
-    return text
-
 
 @document()
 class ChatBot(gr.Chatbot):
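The Llama-2 `[INST]`-style prompt constructors removed above are superseded by `chatml_format`, which is defined elsewhere in app.py and is not shown in this diff. Purely as an illustration of the ChatML-style layout the app now targets (the exact role and end tokens are an assumption, not taken from this commit):

    # Illustrative only; the real formatting lives in chatml_format / chatml_chat_convo_format.
    def chatml_turn(role: str, content: str, end_token: str = "</s>") -> str:
        return f"<|im_start|>{role}\n{content}{end_token}"

    prompt = (
        chatml_turn("system", "You are a helpful assistant.")
        + chatml_turn("user", "Hello!")
        + "<|im_start|>assistant\n"          # generation continues from here
    )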
@@ -966,29 +487,63 @@ def _setup_events(self) -> None:
         )
 
     # Reconfigure clear_btn to stop and clear text box
-    # if self.clear_btn:
-    #     self.clear_btn.click(
-    #         lambda: ([], [], None),
-    #         None,
-    #         [self.chatbot, self.chatbot_state, self.saved_input],
-    #         queue=False,
-    #         api_name=False,
-    #         cancels=submit_event,
-    #     )
 
 
 def _display_input(
-    self, message: str, history:
-) ->
+    self, message: str, history: List[List[Union[str, None]]]
+) -> Tuple[List[List[Union[str, None]]], List[List[list[Union[str, None]]]]]:
     if message is not None and message.strip() != "":
         history.append([message, None])
     return history, history
 
 
+async def _stream_fn(
+    self,
+    message: str,
+    history_with_input,
+    request: Request,
+    *args,
+) -> AsyncGenerator:
+    history = history_with_input[:-1]
+    inputs, _, _ = special_args(
+        self.fn, inputs=[message, history, *args], request=request
+    )
+
+    if self.is_async:
+        generator = self.fn(*inputs)
+    else:
+        generator = await anyio.to_thread.run_sync(
+            self.fn, *inputs, limiter=self.limiter
+        )
+        generator = SyncToAsyncIterator(generator, self.limiter)
+    try:
+        first_response = await async_iteration(generator)
+        update = history + [[message, first_response]]
+        yield update, update
+    except StopIteration:
+        update = history + [[message, None]]
+        yield update, update
+    try:
+        async for response in generator:
+            update = history + [[message, response]]
+            yield update, update
+    except Exception as e:
+        # if "invalid" in str(e):
+        #     yield history, history
+        #     raise e
+        # else:
+        #     raise e
+        yield history, history
+        raise e
+
+
+
+
 # replace
 gr.ChatInterface._setup_stop_events = _setup_stop_events
 gr.ChatInterface._setup_events = _setup_events
 gr.ChatInterface._display_input = _display_input
+gr.ChatInterface._stream_fn = _stream_fn
 
 
 @document()
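The `_stream_fn` override mirrors gradio's own implementation: the response function stays a plain synchronous generator, is moved to a worker thread via `anyio`, and each yielded string becomes a chatbot update; the added `except Exception` branch re-yields the untouched history before re-raising, so errors (for example the `gr.Error` raised by the safety check later in this diff) roll back the in-progress turn instead of leaving it half-rendered. A minimal sketch of a response function driven by this patch (illustrative names only):

    # Any sync generator works; the patched _stream_fn handles the async plumbing.
    def response_fn(message, history, *extra_inputs):
        partial = ""
        for token in ["Hel", "lo", "!"]:       # stand-in for real token streaming
            partial += token
            yield partial                      # each yield re-renders the last chat turn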
@@ -1036,25 +591,6 @@ class CustomTabbedInterface(gr.Blocks):
             interface.render()
 
 
-
-# def vllm_abort(self: Any):
-#     sh = self.llm_engine.scheduler
-#     for g in (sh.waiting + sh.running + sh.swapped):
-#         sh.abort_seq_group(g.request_id)
-
-#     from vllm.sequence import SequenceStatus
-#     scheduler = self.llm_engine.scheduler
-#     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
-#         for seq_group in state_queue:
-#             # if seq_group.request_id == request_id:
-#             # Remove the sequence group from the state queue.
-#             state_queue.remove(seq_group)
-#             for seq in seq_group.seqs:
-#                 if seq.is_finished():
-#                     continue
-#                 scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
-
-
 def vllm_abort(self):
     sh = self.llm_engine.scheduler
     for g in (sh.waiting + sh.running + sh.swapped):
@@ -1231,6 +767,14 @@ def chatml_format(message, history=None, system_prompt=None):
     return chatml_chat_convo_format(conversations, True, default_system=system_prompt)
 
 
+def debug_chat_response_stream_multiturn(*args, **kwargs):
+    message = "This is a debugging message"
+    for i in range(len(message)):
+        time.sleep(0.05)
+        yield message[:i]
+
+
+
 def chat_response_stream_multiturn(
     message: str,
     history: List[Tuple[str, str]],
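`debug_chat_response_stream_multiturn` lets the Space exercise the streaming UI without a GPU or a vLLM engine; it just yields growing prefixes of a fixed string. A quick way to sanity-check it outside the UI (illustrative):

    for chunk in debug_chat_response_stream_multiturn():
        print(chunk)   # prints successively longer prefixes of the debug string, one every 50 ms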
@@ -1242,6 +786,9 @@ def chat_response_stream_multiturn(
     system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ) -> str:
     global LOG_FILE, LOG_PATH
+    if DEBUG:
+        yield from debug_chat_response_stream_multiturn()
+        return
     from vllm import LLM, SamplingParams
     """Build multi turn
     <bos>[INST] B_SYS SytemPrompt E_SYS Prompt [/INST] Answer <eos>
@@ -1274,16 +821,12 @@ def chat_response_stream_multiturn(
 
     message_safety = safety_check(message, history=history)
     if message_safety is not None:
-        yield message_safety
-
+        # yield message_safety
+        raise gr.Error(message_safety)
 
     # history will be appended with message later on
 
-    # full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
-    #     message, history, sys_prompt=system_prompt
-    # )
     full_prompt = chatml_format(message.strip(), history=history, system_prompt=system_prompt)
-    # print(full_prompt)
 
     if len(tokenizer.encode(full_prompt, add_special_tokens=False)) >= 4050:
         raise gr.Error(f"Conversation or prompt is too long, please clear the chatbox or try shorter input.")
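The safety check now raises `gr.Error(message_safety)` instead of yielding the refusal text into the chat; gradio surfaces the exception as an error popup, and with the patched `_stream_fn` above the pending user turn is rolled back. The same guard pattern as a minimal sketch:

    import gradio as gr

    def guarded_handler(message, history):
        problem = safety_check(message, history=history)   # safety_check is defined elsewhere in app.py
        if problem is not None:
            raise gr.Error(problem)   # shown to the user as an error modal; no chat turn is appended
        ...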
@@ -1334,6 +877,89 @@ def chat_response_stream_multiturn(
     if message_safety is not None:
         yield message_safety
         return
+
+
+
+def debug_generate_free_form_stream(message):
+    output = " This is a debugging message...."
+    for i in range(len(output)):
+        time.sleep(0.05)
+        yield message + output[:i]
+
+
+def generate_free_form_stream(
+    message: str,
+    temperature: float,
+    max_tokens: int,
+    frequency_penalty: float,
+    presence_penalty: float,
+    current_time: Optional[float] = None,
+    stop_strings: str = '<s>,</s>,<|im_start|>,<|im_end|>',
+) -> str:
+    global LOG_FILE, LOG_PATH
+    if DEBUG:
+        yield from debug_generate_free_form_stream(message)
+        return
+    from vllm import LLM, SamplingParams
+    """Build multi turn
+    """
+    global llm, RES_PRINTED
+    assert llm is not None
+    tokenizer = llm.get_tokenizer()
+    # force removing all
+    vllm_abort(llm)
+
+    temperature = float(temperature)
+    frequency_penalty = float(frequency_penalty)
+    max_tokens = int(max_tokens)
+
+    stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
+    stop_strings = list(set(stop_strings + ['</s>', '<|im_start|>']))
+
+    sampling_params = SamplingParams(
+        temperature=temperature,
+        max_tokens=max_tokens,
+        frequency_penalty=frequency_penalty,
+        presence_penalty=presence_penalty,
+        stop=stop_strings,
+        # ignore_eos=True,
+    )
+
+    # full_prompt = message
+    if len(message) == 0:
+        raise gr.Error("The message cannot be empty!")
+
+    message_safety = safety_check(message)
+    if message_safety is not None:
+        raise gr.Error(message_safety)
+
+    if len(tokenizer.encode(message, add_special_tokens=False)) >= 4050:
+        raise gr.Error(f"Prompt is too long!")
+
+    cur_out = None
+    for j, gen in enumerate(vllm_generate_stream(llm, message, sampling_params)):
+        if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
+            # optionally check safety, and respond
+            if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
+                message_safety = safety_check(cur_out, history=None)
+                if message_safety is not None:
+                    raise gr.Error(message_safety)
+            yield message + cur_out
+        assert len(gen) == 1, f'{gen}'
+        item = next(iter(gen.values()))
+        cur_out = item.outputs[0].text
+        #cur_out = "Our system is under maintenance, will be back soon!"
+        if j >= max_tokens - 2:
+            gr.Warning(f'The response hits limit of {max_tokens} tokens. Consider increase the max tokens parameter in the Additional Inputs.')
+
+    if cur_out is not None:
+        yield message + cur_out
+
+    message_safety = safety_check(message + cur_out, history=None)
+    if message_safety is not None:
+        raise gr.Error(message_safety)
+
+
 
 
 def maybe_log_conv_file(current_time, history, message, response, **kwargs):
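`generate_free_form_stream` drives the new Free-form tab: it builds a `SamplingParams` from the UI values, always adds `</s>` and `<|im_start|>` to the user-supplied stop strings, and streams partial completions through `vllm_generate_stream` (defined elsewhere in app.py). An illustrative direct call, assuming the global `llm` engine has already been initialized by `launch_demo` (or `DEBUG` is set):

    for text in generate_free_form_stream(
        "Translate to French: Hello world.",
        temperature=0.2,
        max_tokens=64,
        frequency_penalty=0.5,
        presence_penalty=0.0,
        stop_strings="</s>,<|im_start|>",
    ):
        print(text)   # the prompt followed by a growing completion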
@@ -1715,6 +1341,48 @@ CHAT_EXAMPLES = [
 
 # performance items
 
+def create_free_form_generation_demo():
+    global short_model_path
+    max_tokens = MAX_TOKENS
+    temperature = TEMPERATURE
+    frequence_penalty = FREQUENCE_PENALTY
+    presence_penalty = PRESENCE_PENALTY
+
+    introduction = """
+## Free-form:
+Put any context string (like few-shot prompts) and get the model to generate.
+"""
+
+    with gr.Blocks() as demo_free_form:
+        gr.Markdown(introduction)
+
+        with gr.Row():
+            txt = gr.Textbox(
+                scale=4,
+                lines=16,
+                show_label=False,
+                placeholder="Enter any free form text and submit",
+                container=False,
+            )
+        with gr.Row():
+            free_submit_button = gr.Button('Submit')
+        with gr.Row():
+            temp = gr.Number(value=temperature, label='Temperature', info="Higher -> more random")
+            length = gr.Number(value=max_tokens, label='Max tokens', info='Increase if want more generation')
+            freq_pen = gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourage new tokens over repeated tokens')
+            pres_pen = gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourage new tokens, < 0 encourage existing tokens')
+            stop_strings = gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated string to stop generation only in FEW-SHOT mode', lines=1)
+
+        free_submit_button.click(
+            generate_free_form_stream,
+            [txt, temp, length, freq_pen, pres_pen, stop_strings],
+            txt
+        )
+    return demo_free_form
+
+
+
+
 
 def launch_demo():
     global demo, llm, DEBUG, LOG_FILE
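`create_free_form_generation_demo` wires the textbox, sampling controls, and Submit button into a `gr.Blocks` whose click handler streams `generate_free_form_stream` back into the same textbox. Stand-alone usage would look roughly like this (illustrative; in this commit the Blocks is instead mounted as a third tab of the `CustomTabbedInterface`):

    demo_free_form = create_free_form_generation_demo()
    demo_free_form.queue()   # queuing is required for the generator-based click handler to stream
    demo_free_form.launch()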
@@ -1817,7 +1485,7 @@ def launch_demo():
             gr.Number(value=max_tokens, label='Max tokens', info='Increase if want more generation'),
             gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourage new tokens over repeated tokens'),
             gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourage new tokens, < 0 encourage existing tokens'),
-            gr.Textbox(value="
+            gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated string to stop generation only in FEW-SHOT mode', lines=1),
             gr.Number(value=0, label='current_time', visible=False),
         ],
         outputs=[
@@ -1829,11 +1497,13 @@ def launch_demo():
         description=FILE_UPLOAD_DESCRIPTION,
         allow_flagging=False,
         examples=[
-            ["upload_chat.json", "chat", 0.2, 1024, 0.5, 0, "
-            ["upload_few_shot.json", "few-shot", 0.2, 128, 0.5, 0, "
+            ["upload_chat.json", "chat", 0.2, 1024, 0.5, 0, "<s>,</s>,<|im_start|>"],
+            ["upload_few_shot.json", "few-shot", 0.2, 128, 0.5, 0, "<s>,</s>,<|im_start|>,\\n"]
         ],
         cache_examples=False,
     )
+
+    demo_free_form = create_free_form_generation_demo()
 
     demo_chat = gr.ChatInterface(
         response_fn,
@@ -1869,8 +1539,8 @@ def launch_demo():
     descriptions += f"<br> {path_markdown.format(model_path=model_path)}"
 
     demo = CustomTabbedInterface(
-        interface_list=[demo_chat, demo_file_upload],
-        tab_names=["Chat Interface", "Batch Inference"],
+        interface_list=[demo_chat, demo_file_upload, demo_free_form],
+        tab_names=["Chat Interface", "Batch Inference", "Free-form"],
         title=f"{model_title}",
         description=descriptions,
     )
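With the Free-form tab added, the tabbed demo is launched by the remainder of `launch_demo()` exactly as before. A typical tail for such a Space, shown only as an illustration (the real queue/launch arguments live elsewhere in app.py and are untouched by this commit):

    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860)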