Spaces:

SeaLLMs
/

SeaLLMs-Audio-Demo

Paused

App Files Community

lukecq committed on Mar 15

Commit

aec1ff0

verified ·

1 Parent(s): 5cda5a4

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -88

app.py CHANGED Viewed

@@ -8,22 +8,40 @@ import os, json
 from sys import argv
 from vllm import LLM, SamplingParams
-print(gr.__version__)
 def load_model_processor(model_path):
     processor = AutoProcessor.from_pretrained(model_path)
     llm = LLM(
-        model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
         enforce_eager=True,  device = "cuda",
         limit_mm_per_prompt={"audio": 5},
     )
     return llm, processor
-model_path1 = "Qwen/Qwen2-Audio-7B-Instruct" #argv[1]
 model1, processor1 = load_model_processor(model_path1)
-def response_to_audio_conv(conversation, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
-                    max_new_tokens = 2048):
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
     audios = []
     for message in conversation:
@@ -52,70 +70,15 @@ def response_to_audio_conv(conversation, model=None, processor=None, temperature
     response = output.outputs[0].text
     return response
-def print_like_dislike(x: gr.LikeData):
-    print(x.index, x.value, x.liked)
-def add_message(history, message):
-    paths = []
-    for turn in history:
-        if turn['role'] == "user" and type(turn['content']) != str:
-            paths.append(turn['content'][0])
-    for x in message["files"]:
-        if x not in paths:
-            history.append({"role": "user", "content": {"path": x}})
-    if message["text"] is not None:
-        history.append({"role": "user", "content": message["text"]})
-    return history, gr.MultimodalTextbox(value=None, interactive=False)
-def format_user_messgae(message):
-    if type(message['content']) == str:
-        return {"role": "user", "content": [{"type": "text", "text": message['content']}]}
-    else:
-        return {"role": "user", "content": [{"type": "audio", "audio_url": message['content'][0]}]}
-def history_to_conversation(history):
-    conversation = []
-    audio_paths = []
-    for turn in history:
-        if turn['role'] == "user":
-            if not turn['content']:
-                continue
-            turn = format_user_messgae(turn)
-            if turn['content'][0]['type'] == 'audio':
-                if turn['content'][0]['audio_url'] in audio_paths:
-                    continue
-                else:
-                    audio_paths.append(turn['content'][0]['audio_url'])
-            if len(conversation) > 0 and conversation[-1]["role"] == "user":
-                conversation[-1]['content'].append(turn['content'][0])
-            else:
-                conversation.append(turn)
-        else:
-            conversation.append(turn)
-    print(json.dumps(conversation, indent=4, ensure_ascii=False))
-    return conversation
-def bot(history: list, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
-                    max_new_tokens = 2048):
-    conversation = history_to_conversation(history)
-    response = response_to_audio_conv(conversation, model=model1, processor=processor1, temperature = temperature,repetition_penalty=repetition_penalty, top_p = top_p, max_new_tokens = max_new_tokens)
-    # response = "Nice to meet you!"
-    print("Bot:",response)
-    history.append({"role": "assistant", "content": ""})
-    for character in response:
-        history[-1]["content"] += character
-        time.sleep(0.01)
-        yield history
-insturctions = """**Instruction**: there are three input format:
-    1. text: input text message only
-    2. audio: upload audio file or record a voice message
-    3. audio + text: record a voice message and input text message"""
 with gr.Blocks() as demo:
     # gr.Markdown("""<p align="center"><img src="images/seal_logo.png" style="height: 80px"/><p>""")
     # gr.Image("images/seal_logo.png", elem_id="seal_logo", show_label=False,height=80,show_fullscreen_button=False)
     gr.Markdown(
@@ -148,31 +111,35 @@ with gr.Blocks() as demo:
     #         top_p = gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.1, label="Top P")
     #     with gr.Column():
     #         repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.1, step=0.1, label="Repetition Penalty")
-    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages")
-    chat_input = gr.MultimodalTextbox(
-        interactive=True,
-        file_count="single",
-        file_types=['.wav'],
-        placeholder="Enter message (optional) ...",
-        show_label=False,
-        sources=["microphone", "upload"],
     )
-    chat_msg = chat_input.submit(
-        add_message, [chatbot, chat_input], [chatbot, chat_input]
     )
-    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
-    # bot_msg = chat_msg.then(bot, [chatbot, temperature, repetition_penalty, top_p], chatbot, api_name="bot_response")
-    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
-    # chatbot.like(print_like_dislike, None, None, like_user_message=True)
-    clear_button = gr.ClearButton([chatbot, chat_input])
-# PORT = 7950
-# demo.launch(server_port=PORT, show_api = True, allowed_paths = [],
-#     root_path = f"https://dsw-gateway.alibaba-inc.com/dsw81322/proxy/{PORT}/")
 demo.launch(
     share=False,

 from sys import argv
 from vllm import LLM, SamplingParams
+from huggingface_hub import login
+# Hugging Face access token, read from the TOKEN environment variable.
+TOKEN = os.environ.get("TOKEN", None)
+# Only authenticate when a token is actually configured: login(token=None)
+# falls back to an interactive prompt, which cannot be answered when the app
+# runs headless.
+if TOKEN:
+    login(token=TOKEN)
 def load_model_processor(model_path):
+    """Load the vLLM engine and the processor for ``model_path``.
+
+    Args:
+        model_path: Model identifier or path, passed both to
+            ``AutoProcessor.from_pretrained`` and to ``vllm.LLM``.
+
+    Returns:
+        A ``(llm, processor)`` tuple.
+    """
     processor = AutoProcessor.from_pretrained(model_path)
     llm = LLM(
+        model=model_path, trust_remote_code=True, gpu_memory_utilization=0.4,
         enforce_eager=True,  device = "cuda",
         limit_mm_per_prompt={"audio": 5},
     )
     return llm, processor
+model_path1 = "SeaLLMs/SeaLLMs-Audio-7B"
 model1, processor1 = load_model_processor(model_path1)
+def response_to_audio(audio_url, text, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,max_new_tokens = 2048):
+    if text == None:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": audio_url},
+            ]},]
+    elif audio_url == None:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "text", "text": text},
+           ]},]
+    else:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": audio_url},
+                {"type": "text", "text": text},
+           ]},]
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
     audios = []
     for message in conversation:
     response = output.outputs[0].text
     return response
+def clear_inputs():
+    """Return the reset values ``(None, "", "")`` used by the Clear button.
+
+    The three values are applied, in order, to the audio input, the text
+    input, and the output textbox.
+    """
+    return None, "", ""
+def compare_responses(audio_url, text):
+    """Answer one audio/text pair with the module-level model.
+
+    Calls ``response_to_audio`` with ``model1`` and ``processor1`` bound and
+    returns the response it produces.
+    """
+    response1 = response_to_audio(audio_url, text, model1, processor1)
+    return response1
 with gr.Blocks() as demo:
+    # gr.Markdown(f"Evaluate {model_path1}")
     # gr.Markdown("""<p align="center"><img src="images/seal_logo.png" style="height: 80px"/><p>""")
     # gr.Image("images/seal_logo.png", elem_id="seal_logo", show_label=False,height=80,show_fullscreen_button=False)
     gr.Markdown(
     #         top_p = gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.1, label="Top P")
     #     with gr.Column():
     #         repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.1, step=0.1, label="Repetition Penalty")
+    with gr.Row():
+        with gr.Column():
+            # mic_input = gr.Microphone(label="Record Audio", type="filepath", elem_id="mic_input")
+            mic_input = gr.Audio(sources = ['upload', 'microphone'], label="Record Audio", type="filepath", elem_id="mic_input")
+        with gr.Column():
+            additional_input = gr.Textbox(label="Text Input")
+    # Button to trigger the function
+    with gr.Row():
+        btn_submit = gr.Button("Submit")
+        btn_clear = gr.Button("Clear")
+    with gr.Row():
+        output_text1 = gr.Textbox(label=model_path1.split('/')[-1], interactive=False, elem_id="output_text1")
+    # Route the click through compare_responses, which binds model1 and
+    # processor1. Gradio passes only the two input values, and
+    # response_to_audio defaults model/processor to None, so calling it
+    # directly fails at processor.apply_chat_template.
+    btn_submit.click(
+        fn=compare_responses,
+        inputs=[mic_input, additional_input],
+        outputs=[output_text1],
     )
+    btn_clear.click(
+        fn=clear_inputs,
+        inputs=None,
+        outputs=[mic_input, additional_input, output_text1],
+        queue=False,
     )
 demo.launch(
     share=False,