import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import soundfile as sf
import numpy as np
import os
import base64
from io import BytesIO
# Model and Tokenizer Loading
MODEL_ID = "Qwen/Qwen-Audio-Chat"

def load_model():
    print("Loading model and tokenizer...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    print("Model and tokenizer loaded successfully")
    return model, tokenizer
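
# Note: analyze_audio() below calls load_model() on every request, so the
# weights are re-loaded each time the GPU worker handles a call. A minimal
# memoization sketch (an optional pattern, not part of the original Space)
# could cache the pair at module level:
#
#     _model_cache = {}
#
#     def get_model():
#         if "pair" not in _model_cache:
#             _model_cache["pair"] = load_model()
#         return _model_cache["pair"]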

def process_audio(audio_path):
    """Process audio file for the model."""
    try:
        print(f"Processing audio file: {audio_path}")
        # Read audio file
        audio_data, sample_rate = sf.read(audio_path)
        # Convert to mono if stereo
        if len(audio_data.shape) > 1:
            audio_data = audio_data.mean(axis=1)
        # Ensure float32 format
        audio_data = audio_data.astype(np.float32)
        # Create in-memory buffer
        audio_buffer = BytesIO()
        # Write audio to buffer in WAV format
        sf.write(audio_buffer, audio_data, sample_rate, format='WAV')
        # Get the buffer content and encode to base64
        audio_buffer.seek(0)
        audio_base64 = base64.b64encode(audio_buffer.read()).decode('utf-8')
        print(f"Audio processed successfully. Sample rate: {sample_rate}, Shape: {audio_data.shape}")
        return {
            "audio": audio_base64,
            "sampling_rate": sample_rate
        }
    except Exception as e:
        print(f"Error processing audio: {e}")
        import traceback
        traceback.print_exc()
        return None
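
# Example (hypothetical local file shown for illustration):
#     process_audio("clip.wav")
# returns a dict like {"audio": "<base64-encoded WAV bytes>", "sampling_rate": 44100},
# or None if soundfile cannot read the input.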

@spaces.GPU
def analyze_audio(audio_path: str, question: str | None = None) -> str:
    """
    Main function for audio analysis that will be exposed as a tool.

    Args:
        audio_path: Path to the audio file
        question: Optional question about the audio

    Returns:
        str: Model's response about the audio
    """
    print(f"\nStarting analysis with audio_path: {audio_path}, question: {question}")
    # Input validation
    if audio_path is None or not isinstance(audio_path, str):
        return "Please provide a valid audio file."
    if not os.path.exists(audio_path):
        return f"Audio file not found: {audio_path}"
    # Process audio
    audio_data = process_audio(audio_path)
    if audio_data is None:
        return "Failed to process the audio file. Please ensure it's a valid audio format."
    try:
        model, tokenizer = load_model()
        query = question if question else "Please describe what you hear in this audio clip."
        print("Preparing messages...")
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "data": audio_data["audio"],
                        "sampling_rate": audio_data["sampling_rate"]
                    },
                    {
                        "type": "text",
                        "text": query
                    }
                ]
            }
        ]
        print("Applying chat template...")
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        print(f"Generated prompt text: {text[:200]}...")
        print("Tokenizing input...")
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        print("Generating response...")
        with torch.no_grad():
            outputs = model.generate(
                **model_inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        if outputs is None:
            print("Model generated None output")
            return "The model failed to generate a response. Please try again."
        print(f"Output shape: {outputs.shape}")
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated response: {response[:200]}...")
        return response
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        import traceback
        traceback.print_exc()
        return f"An error occurred while processing: {str(e)}"
# Create Gradio interface with clear input/output specifications

demo = gr.Interface(
    fn=analyze_audio,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Audio Input",
            sources=["upload", "microphone"],
            format="mp3"  # Specify format to ensure consistent audio format
        ),
        gr.Textbox(
            label="Question",
            placeholder="Optional: Ask a specific question about the audio",
            value=""
        )
    ],
    outputs=gr.Textbox(label="Analysis"),
    title="Qwen Audio Analysis Tool",
    description="Upload an audio file or record from microphone to get AI-powered analysis using the Qwen-Audio-Chat model",
    examples=[
        ["path/to/example1.wav", "What instruments do you hear?"],
        ["path/to/example2.wav", "Describe the mood of this audio."]
    ],
    cache_examples=False
)
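
# Since the docstring above describes analyze_audio as a tool, one option
# (an assumption about the deployment, not part of the original file) is that
# recent Gradio versions can also expose the function over MCP:
#     demo.launch(mcp_server=True)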
if __name__ == "__main__":
demo.launch()