Spaces:

SeaLLMs
/

SeaLLMs-Audio-Demo

Paused

App Files Files Community

lukecq committed on Mar 13

Commit

2360578

verified ·

1 Parent(s): 015f3f9

Upload app.py

Browse files

Files changed (1) hide show

app.py +180 -63

app.py CHANGED Viewed

@@ -1,64 +1,181 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import time
+from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
+from io import BytesIO
+from urllib.request import urlopen
+import librosa
+import os, json
+from sys import argv
+from vllm import LLM, SamplingParams
+def load_model_processor(model_path):
+    """Create the Hugging Face processor and the vLLM engine for one model.
+
+    Args:
+        model_path: Model identifier or path, passed unchanged to both
+            ``AutoProcessor.from_pretrained`` and ``LLM``.
+
+    Returns:
+        A ``(llm, processor)`` tuple.
+    """
+    # Engine settings are collected first; building the dict has no side effects.
+    engine_options = dict(
+        model=model_path,
+        trust_remote_code=True,
+        gpu_memory_utilization=0.4,
+        enforce_eager=True,
+        limit_mm_per_prompt={"audio": 5},
+    )
+    processor = AutoProcessor.from_pretrained(model_path)
+    llm = LLM(**engine_options)
+    return llm, processor
+# Checkpoint served by this demo; the command-line override (argv[1]) is disabled.
+# NOTE(review): the UI text further down says the app is based on
+# SeaLLMs-Audio-7B-Chat, but this loads Qwen2-Audio-7B-Instruct — confirm which is intended.
+model_path1 = "Qwen/Qwen2-Audio-7B-Instruct" #argv[1]
+# Loaded once at import time; `bot` passes these module-level objects to the generator.
+model1, processor1 = load_model_processor(model_path1)
+def response_to_audio_conv(conversation, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
+                    max_new_tokens = 2048):
+    """Generate one assistant reply for a text/audio conversation.
+
+    Args:
+        conversation: List of chat messages. A message whose ``content`` is a
+            list may hold ``{"type": "audio", "audio_url": ...}`` elements;
+            each such file is loaded and attached to the prompt.
+        model: Object exposing ``generate(prompts, sampling_params=...)``.
+        processor: Object exposing ``apply_chat_template`` and
+            ``feature_extractor.sampling_rate``.
+        temperature: Forwarded to ``SamplingParams``.
+        repetition_penalty: Forwarded to ``SamplingParams``.
+        top_p: Forwarded to ``SamplingParams``.
+        max_new_tokens: Forwarded to ``SamplingParams`` as ``max_tokens``.
+
+    Returns:
+        The text of the first completion of the single submitted prompt.
+    """
+    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+    # Use one sampling rate for both loading and labelling the audio, so the
+    # rate reported to the engine always matches the resampled waveform.
+    sampling_rate = processor.feature_extractor.sampling_rate
+    audios = []
+    for message in conversation:
+        content = message["content"]
+        if not isinstance(content, list):
+            continue
+        for ele in content:
+            # Audio elements without a usable URL are skipped instead of raising.
+            if ele["type"] == "audio" and ele.get("audio_url") is not None:
+                audios.append(librosa.load(ele["audio_url"], sr=sampling_rate)[0])
+    sampling_params = SamplingParams(
+        temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty, top_p=top_p, top_k=20,
+        stop_token_ids=[],
+    )
+    # Named `request` rather than `input` to avoid shadowing the builtin.
+    request = {
+        'prompt': text,
+        'multi_modal_data': {
+            'audio': [(audio, sampling_rate) for audio in audios]
+        },
+    }
+    output = model.generate([request], sampling_params=sampling_params)[0]
+    response = output.outputs[0].text
+    return response
+def print_like_dislike(x: gr.LikeData):
+    """Print the index, value and liked flag of a like/dislike event."""
+    index, value, liked = x.index, x.value, x.liked
+    print(index, value, liked)
+def add_message(history, message):
+    """Append the submitted audio file and/or text to the chat history.
+
+    Args:
+        history: Chat history as a list of ``{"role", "content"}`` dicts;
+            it is extended in place.
+        message: Submitted value with a ``"files"`` list and a ``"text"`` entry.
+
+    Returns:
+        ``(history, textbox)`` where ``textbox`` is a cleared, non-interactive
+        ``gr.MultimodalTextbox``.
+    """
+    # Paths of files already attached by earlier user turns.
+    # NOTE(review): assumes non-string user content is indexable with the file
+    # path first — confirm against Gradio's Chatbot "messages" preprocessing.
+    paths = [
+        turn['content'][0]
+        for turn in history
+        if turn['role'] == "user" and not isinstance(turn['content'], str)
+    ]
+    for x in message["files"]:
+        if x not in paths:
+            history.append({"role": "user", "content": {"path": x}})
+    # Skip empty text (audio-only submission) so no blank user turn is added;
+    # history_to_conversation ignores empty user content anyway.
+    if message["text"]:
+        history.append({"role": "user", "content": message["text"]})
+    return history, gr.MultimodalTextbox(value=None, interactive=False)
+def format_user_messgae(message):
+    """Convert one user history turn into a single-element content list.
+
+    Args:
+        message: Dict with a ``"content"`` entry. A string is treated as text;
+            anything else is indexed with ``[0]`` to obtain the audio path
+            (presumably a tuple/list of paths — verify against the caller).
+
+    Returns:
+        ``{"role": "user", "content": [element]}`` where ``element`` is either
+        ``{"type": "text", "text": ...}`` or ``{"type": "audio", "audio_url": ...}``.
+    """
+    content = message['content']
+    # isinstance (not an exact type comparison) so str subclasses count as text.
+    if isinstance(content, str):
+        return {"role": "user", "content": [{"type": "text", "text": content}]}
+    return {"role": "user", "content": [{"type": "audio", "audio_url": content[0]}]}
+def history_to_conversation(history):
+    """Turn the chat history into the message list handed to the model.
+
+    Non-user turns are copied through unchanged. User turns with empty content
+    are dropped, an audio path that was already seen is dropped, and
+    consecutive user turns are merged into one message whose ``content`` list
+    holds all their elements. The result is also printed as JSON.
+
+    Args:
+        history: List of ``{"role", "content"}`` dicts.
+
+    Returns:
+        The assembled conversation list.
+    """
+    conversation = []
+    seen_audio = []
+    for entry in history:
+        if entry['role'] != "user":
+            conversation.append(entry)
+            continue
+        if not entry['content']:
+            continue
+        formatted = format_user_messgae(entry)
+        part = formatted['content'][0]
+        if part['type'] == 'audio':
+            # Each audio file is included at most once per conversation.
+            if part['audio_url'] in seen_audio:
+                continue
+            seen_audio.append(part['audio_url'])
+        if conversation and conversation[-1]["role"] == "user":
+            # Fold back-to-back user inputs (e.g. audio then text) into one message.
+            conversation[-1]['content'].append(part)
+        else:
+            conversation.append(formatted)
+    print(json.dumps(conversation, indent=4, ensure_ascii=False))
+    return conversation
+def bot(history: list, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
+                    max_new_tokens = 2048):
+    """Generate the assistant reply and stream it into the history.
+
+    The full response is produced first by ``response_to_audio_conv`` using
+    the module-level ``model1``/``processor1``; it is then revealed one
+    character at a time, yielding the updated history after each character.
+
+    Args:
+        history: Chat history list; an assistant turn is appended in place.
+        temperature: Forwarded to ``response_to_audio_conv``.
+        repetition_penalty: Forwarded to ``response_to_audio_conv``.
+        top_p: Forwarded to ``response_to_audio_conv``.
+        max_new_tokens: Forwarded to ``response_to_audio_conv``.
+
+    Yields:
+        The same ``history`` list after each appended character.
+    """
+    conversation = history_to_conversation(history)
+    response = response_to_audio_conv(
+        conversation,
+        model=model1,
+        processor=processor1,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        top_p=top_p,
+        max_new_tokens=max_new_tokens,
+    )
+    print("Bot:",response)
+    reply = {"role": "assistant", "content": ""}
+    history.append(reply)
+    for character in response:
+        reply["content"] += character
+        # Short pause between characters for the typing effect.
+        time.sleep(0.01)
+        yield history
+# Help text listing the accepted input modes. It is only referenced by a
+# commented-out gr.Markdown call in the UI block, so it is not rendered.
+insturctions = """**Instruction**: there are three input format:
+    1. text: input text message only
+    2. audio: upload audio file or record a voice message
+    3. audio + text: record a voice message and input text message"""
+# Assemble the chat UI: header markdown, chatbot pane, multimodal input box,
+# and the submit -> reply -> re-enable-input event chain.
+with gr.Blocks() as demo:
+    # gr.Markdown("""<p align="center"><img src="images/seal_logo.png" style="height: 80px"/><p>""")
+    # gr.Image("images/seal_logo.png", elem_id="seal_logo", show_label=False,height=80,show_fullscreen_button=False)
+    gr.Markdown(
+        """<div style="text-align: center; font-size: 32px; font-weight: bold;">SeaLLMs-Audio ChatBot</div>""",
+    )
+    # Description text
+    gr.Markdown(
+        """<div style="text-align: center; font-size: 16px;">
+    This WebUI is based on SeaLLMs-Audio-7B-Chat, developed by Alibaba DAMO Academy.<br>
+    You can interact with the chatbot in <b>English, Chinese, Indonesian, Thai, or Vietnamese</b>.<br>
+    For each round, you can input <b>audio and/or text</b>.
+    </div>""",
+    )
+    # Links with proper formatting
+    # NOTE(review): [Website] and [Model] share the same URL, which names
+    # SeaLLMs-v3-7B-Chat rather than an audio model — confirm the intended targets.
+    gr.Markdown(
+        """<div style="text-align: center; font-size: 16px;">
+        <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Website]</a> &nbsp;
+        <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Model🤗]</a> &nbsp;
+        <a href="https://github.com/liuchaoqun/SeaLLMs-Audio">[Github]</a>
+        </div>""",
+    )
+    # gr.Markdown(insturctions)
+    # with gr.Row():
+    #     with gr.Column():
+    #         temperature = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Temperature")
+    #     with gr.Column():
+    #         top_p = gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.1, label="Top P")
+    #     with gr.Column():
+    #         repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.1, step=0.1, label="Repetition Penalty")
+    # History uses the "messages" format: the handlers index turns by 'role' and 'content'.
+    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages")
+    # Input box: optional text plus at most one .wav file, recorded or uploaded.
+    chat_input = gr.MultimodalTextbox(
+        interactive=True,
+        file_count="single",
+        file_types=['.wav'],
+        placeholder="Enter message (optional) ...",
+        show_label=False,
+        sources=["microphone", "upload"],
+    )
+    # Step 1: on submit, add_message appends the user turn(s) and returns a
+    # cleared, non-interactive textbox.
+    chat_msg = chat_input.submit(
+        add_message, [chatbot, chat_input], [chatbot, chat_input]
+    )
+    # Step 2: stream the reply. Only the history is passed, so bot runs with
+    # its default sampling parameters.
+    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
+    # bot_msg = chat_msg.then(bot, [chatbot, temperature, repetition_penalty, top_p], chatbot, api_name="bot_response")
+    # Step 3: make the input box interactive again once the reply has finished.
+    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
+    # chatbot.like(print_like_dislike, None, None, like_user_message=True)
+    # Button that clears both the chat history and the input box.
+    clear_button = gr.ClearButton([chatbot, chat_input])
+# Port the Gradio server binds to; also interpolated into root_path below.
+PORT = 7950
+# NOTE(review): root_path is hard-coded to one specific proxy gateway URL, and
+# the launch runs unconditionally at import time (no __main__ guard) — confirm
+# both match the intended deployment target.
+demo.launch(server_port=PORT, show_api = True, allowed_paths = [],
+    root_path = f"https://dsw-gateway.alibaba-inc.com/dsw81322/proxy/{PORT}/")
+# Alternative launch configuration, kept disabled.
+# demo.launch(
+#     share=False,
+#     inbrowser=True,
+#     server_port=7950,
+#     server_name="0.0.0.0",
+#     max_threads=40
+# )