import gradio as gr
import time
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
from io import BytesIO
from urllib.request import urlopen
import librosa
import os, json
from sys import argv
from vllm import LLM, SamplingParams


def load_model_processor(model_path):
    processor = AutoProcessor.from_pretrained(model_path)
    llm = LLM(
        model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
        enforce_eager=True, device="cuda",
        limit_mm_per_prompt={"audio": 5},
    )
    return llm, processor


model_path1 = "Qwen/Qwen2-Audio-7B-Instruct"  # argv[1]
model1, processor1 = load_model_processor(model_path1)
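
# Note: the trailing "# argv[1]" above suggests the checkpoint was meant to be
# selectable from the command line. A hedged sketch of that variant (an
# assumption, not part of the original script):
#   model_path1 = argv[1] if len(argv) > 1 else "Qwen/Qwen2-Audio-7B-Instruct"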


def response_to_audio_conv(conversation, model=None, processor=None, temperature=0.1,
                           repetition_penalty=1.1, top_p=0.9, max_new_tokens=2048):
    # Render the conversation with the model's chat template and collect the
    # waveform of every audio message, resampled to the processor's rate.
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios = []
    for message in conversation:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    if ele['audio_url'] is not None:
                        audios.append(librosa.load(
                            ele['audio_url'],
                            sr=processor.feature_extractor.sampling_rate)[0]
                        )

    sampling_params = SamplingParams(
        temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty,
        top_p=top_p, top_k=20, stop_token_ids=[],
    )

    inputs = {
        'prompt': text,
        'multi_modal_data': {
            'audio': [(audio, 16000) for audio in audios]
        },
    }

    output = model.generate([inputs], sampling_params=sampling_params)[0]
    response = output.outputs[0].text
    return response
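
# For reference, conversations passed to response_to_audio_conv use the
# Qwen2-Audio style message format built by format_user_message /
# history_to_conversation below; an illustrative single-turn example
# (hypothetical file path):
#   [{"role": "user", "content": [
#       {"type": "audio", "audio_url": "/tmp/recording.wav"},
#       {"type": "text", "text": "Please transcribe this clip."}]}]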


def print_like_dislike(x: gr.LikeData):
    print(x.index, x.value, x.liked)


def add_message(history, message):
    # Collect the audio files already present in the history so the same
    # recording is not appended twice across rounds.
    paths = []
    for turn in history:
        if turn['role'] == "user" and not isinstance(turn['content'], str):
            paths.append(turn['content'][0])
    for x in message["files"]:
        if x not in paths:
            history.append({"role": "user", "content": {"path": x}})
    if message["text"] is not None:
        history.append({"role": "user", "content": message["text"]})
    return history, gr.MultimodalTextbox(value=None, interactive=False)


def format_user_message(message):
    # Wrap a raw Gradio history entry into the structured chat format:
    # plain strings become text content, file entries become audio content.
    if isinstance(message['content'], str):
        return {"role": "user", "content": [{"type": "text", "text": message['content']}]}
    else:
        return {"role": "user", "content": [{"type": "audio", "audio_url": message['content'][0]}]}


def history_to_conversation(history):
    # Convert the Gradio chat history into a Qwen2-Audio style conversation:
    # skip duplicate audio files and merge consecutive user entries
    # (audio + text submitted in the same round) into a single user turn.
    conversation = []
    audio_paths = []
    for turn in history:
        if turn['role'] == "user":
            if not turn['content']:
                continue
            turn = format_user_message(turn)
            if turn['content'][0]['type'] == 'audio':
                if turn['content'][0]['audio_url'] in audio_paths:
                    continue
                else:
                    audio_paths.append(turn['content'][0]['audio_url'])
            if len(conversation) > 0 and conversation[-1]["role"] == "user":
                conversation[-1]['content'].append(turn['content'][0])
            else:
                conversation.append(turn)
        else:
            conversation.append(turn)
    print(json.dumps(conversation, indent=4, ensure_ascii=False))
    return conversation
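
# Illustrative example: a round where the user records audio and also types text
# arrives as two consecutive user entries in `history` (one file entry, one
# string) and is merged above into a single user turn such as
#   {"role": "user", "content": [{"type": "audio", "audio_url": ...},
#                                {"type": "text", "text": ...}]}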


def bot(history: list, temperature=0.1, repetition_penalty=1.1, top_p=0.9,
        max_new_tokens=2048):
    conversation = history_to_conversation(history)
    response = response_to_audio_conv(conversation, model=model1, processor=processor1,
                                      temperature=temperature, repetition_penalty=repetition_penalty,
                                      top_p=top_p, max_new_tokens=max_new_tokens)
    # response = "Nice to meet you!"
    print("Bot:", response)

    # Stream the reply character by character so the chatbot appears to type.
    history.append({"role": "assistant", "content": ""})
    for character in response:
        history[-1]["content"] += character
        time.sleep(0.01)
        yield history
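
# Because bot() is a generator, Gradio re-renders the Chatbot on every yield,
# which produces the typing effect; yielding once with the full response would
# show the answer in a single step instead.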


instructions = """**Instruction**: there are three input formats:
1. text: input a text message only
2. audio: upload an audio file or record a voice message
3. audio + text: record a voice message and also input a text message"""


with gr.Blocks() as demo:
    # gr.Markdown("""<p align="center"><img src="images/seal_logo.png" style="height: 80px"/><p>""")
    # gr.Image("images/seal_logo.png", elem_id="seal_logo", show_label=False, height=80, show_fullscreen_button=False)
    gr.Markdown(
        """<div style="text-align: center; font-size: 32px; font-weight: bold;">SeaLLMs-Audio ChatBot</div>""",
    )
    # Description text
    gr.Markdown(
        """<div style="text-align: center; font-size: 16px;">
This WebUI is based on SeaLLMs-Audio-7B-Chat, developed by Alibaba DAMO Academy.<br>
You can interact with the chatbot in <b>English, Chinese, Indonesian, Thai, or Vietnamese</b>.<br>
For each round, you can input <b>audio and/or text</b>.
</div>""",
    )
    # Links with proper formatting
    gr.Markdown(
        """<div style="text-align: center; font-size: 16px;">
<a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Website]</a>
<a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Model🤗]</a>
<a href="https://github.com/liuchaoqun/SeaLLMs-Audio">[Github]</a>
</div>""",
    )

    # gr.Markdown(instructions)
    # with gr.Row():
    #     with gr.Column():
    #         temperature = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Temperature")
    #     with gr.Column():
    #         top_p = gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.1, label="Top P")
    #     with gr.Column():
    #         repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.1, step=0.1, label="Repetition Penalty")

    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages")
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="single",
        file_types=['.wav'],
        placeholder="Enter message (optional) ...",
        show_label=False,
        sources=["microphone", "upload"],
    )

    chat_msg = chat_input.submit(
        add_message, [chatbot, chat_input], [chatbot, chat_input]
    )
    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
    # bot_msg = chat_msg.then(bot, [chatbot, temperature, repetition_penalty, top_p], chatbot, api_name="bot_response")
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
    # chatbot.like(print_like_dislike, None, None, like_user_message=True)

    clear_button = gr.ClearButton([chatbot, chat_input])
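
    # Event chain: submitting the MultimodalTextbox appends the new message(s)
    # to the history and disables the textbox, bot() then streams the assistant
    # reply, and the final .then() re-enables the textbox for the next round.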

# PORT = 7950
# demo.launch(server_port=PORT, show_api=True, allowed_paths=[],
#             root_path=f"https://dsw-gateway.alibaba-inc.com/dsw81322/proxy/{PORT}/")
demo.launch(
    share=False,
    inbrowser=True,
    server_port=7950,
    server_name="0.0.0.0",
    max_threads=40,
)