# Hugging Face Space: running on Zero (ZeroGPU) hardware.
| import gradio as gr | |
| import torch | |
| from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor | |
| from qwen_omni_utils import process_mm_info | |
| import spaces | |
# Hugging Face Hub checkpoint used for both the model weights and the processor.
MODEL_ID = "Qwen/Qwen2.5-Omni-7B"

# Load the weights once, at import time, so every request reuses the same
# model object. "auto" is passed for both the dtype and the device map, which
# leaves those choices to the library.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
# run_omni only decodes generated token ids into text, so the talker
# component is switched off right after loading.
model.disable_talker()

# The processor builds the chat prompt and converts text/audio into tensors.
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# ZeroGPU Spaces only attach a GPU while a function decorated with
# `spaces.GPU` is running, so the inference entry point must carry it.
@spaces.GPU
def run_omni(audio_path: str, instruction: str) -> str:
    """Run the model on one audio file plus a text instruction.

    Args:
        audio_path: Path of the uploaded audio file (the UI's audio component
            is configured with ``type="filepath"``). Falsy when nothing was
            uploaded.
        instruction: Text instruction placed after the audio in the user turn.

    Returns:
        The decoded text of the newly generated tokens, or a short prompt
        asking for an upload when ``audio_path`` is empty.
    """
    if not audio_path:
        return "Please upload an audio file."

    system_text = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
        "capable of perceiving auditory and visual inputs, as well as generating text and speech."
    )
    conversation = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_text}],
        },
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio_url": audio_path},
                {"type": "text", "text": instruction},
            ],
        },
    ]

    # Render the prompt as a string; tokenization happens in the processor call
    # below, together with the audio features.
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
    )
    # Move to the model's device and also match its dtype. The dtype cast only
    # touches floating-point tensors (e.g. audio features); token ids stay
    # integer.
    inputs = inputs.to(model.device).to(model.dtype)

    output_ids = model.generate(**inputs, max_new_tokens=4096)
    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    output_ids = output_ids[:, prompt_length:]

    response = processor.batch_decode(
        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
# Two-column page: audio upload, instruction and the Run button on the left,
# the model's reply on the right.
with gr.Blocks(title="Qwen2.5 Omni (Audio) Demo") as demo:
    gr.Markdown("# Qwen2.5-Omni (Audio) Demo")
    gr.Markdown("Upload an audio file and provide an instruction for the model.")

    with gr.Row():
        with gr.Column():
            # type="filepath" makes the handler receive a path string.
            audio_input = gr.Audio(label="Upload Audio", type="filepath")
            instruction = gr.Textbox(
                value="Transcribe the audio, then summarize it in one sentence.",
                label="Instruction",
            )
            submit_btn = gr.Button(value="Run", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(lines=14, label="Response")

    # Clicking Run sends (audio path, instruction) to run_omni and shows the
    # returned string in the response box.
    submit_btn.click(
        fn=run_omni,
        inputs=[audio_input, instruction],
        outputs=output_text,
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # Enable the request queue first, then launch without a public share
    # link and with server-side rendering turned off.
    demo.queue()
    demo.launch(share=False, ssr_mode=False)