import sys
import random

import torch
import torchaudio

from config import TANGO_FLUX_DIR

sys.path.append(TANGO_FLUX_DIR)

from tangoflux import TangoFluxInference
from transformers import T5EncoderModel
class GenerateAudio:
    def __init__(self):
        # Prefer GPU but fall back to CPU so the class still initializes without CUDA
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Models are loaded lazily on first use (see _load_model)
        self.model = None
        self.text_encoder = None

        # Basic categories for object classification
        self.categories = {
            'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            'nature': ['tree', 'bird', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
            'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
            'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
            'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
            'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio']
        }

        # Suffix patterns used as a fallback when no keyword matches,
        # e.g. "fire truck" -> vehicle
        self.suffixes = {
            'tree': 'nature',
            'bird': 'animal',
            'car': 'vehicle',
            'truck': 'vehicle',
            'signal': 'urban'
        }
    def _load_model(self):
        """Lazily load TangoFlux and the T5 text encoder on first use."""
        if self.model is None:
            self.model = TangoFluxInference(name='declare-lab/TangoFlux')
        if self.text_encoder is None:
            self.text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large").to(self.device).eval()
        else:
            self.text_encoder = self.text_encoder.to(self.device)
    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5, disable_progress=True):
        """Generate audio for `prompt`; returns a (channels, samples) tensor."""
        self._load_model()
        with torch.no_grad():
            latents = self.model.model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                disable_progress=disable_progress
            )
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]

        # Trim the decoded audio to the requested duration
        waveform_end = int(duration * self.model.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
        return wave
    def _categorize_object(self, object_name):
        """Categorize an object based on keywords or suffix patterns."""
        object_lower = object_name.lower()

        # Check if the object contains any category keywords
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in object_lower:
                    return category

        # Fall back to suffix patterns, checking word by word
        for word in object_lower.split():
            for suffix, category in self.suffixes.items():
                if word.endswith(suffix):
                    return category

        return "unknown"
    def _describe_object_sound(self, object_name, zone):
        """Generate an appropriate sound description based on object type and distance."""
        category = self._categorize_object(object_name)

        # Volume descriptor based on zone
        volume_descriptors = {
            "near": ["prominent", "clear", "loud", "distinct"],
            "medium": ["moderate", "audible", "present"],
            "far": ["subtle", "distant", "faint", "soft"]
        }
        volume = random.choice(volume_descriptors[zone])

        # Sound descriptors based on category
        sound_templates = {
            "vehicle": [
                "{volume} engine sounds from the {object}",
                "{volume} mechanical noise of the {object}",
                "the {object} creating {volume} road noise",
                "{volume} sounds of the {object} in motion"
            ],
            "nature": [
                "{volume} rustling of the {object}",
                "the {object} making {volume} natural sounds",
                "{volume} environmental sounds from the {object}",
                "the {object} with {volume} movement in the wind"
            ],
            "urban": [
                "{volume} urban sounds around the {object}",
                "the {object} with {volume} city ambience",
                "{volume} noise from the {object}",
                "the {object} contributing to {volume} street sounds"
            ],
            "animal": [
                "{volume} calls from the {object}",
                "the {object} making {volume} animal sounds",
                "{volume} sounds of the {object}",
                "the {object} with its {volume} presence"
            ],
            "human": [
                "{volume} voices from the {object}",
                "the {object} creating {volume} human sounds",
                "{volume} movement sounds from the {object}",
                "the {object} with {volume} activity"
            ],
            "indoor": [
                "{volume} ambient sounds around the {object}",
                "the {object} making {volume} indoor noises",
                "{volume} mechanical sounds from the {object}",
                "the {object} with its {volume} presence"
            ],
            "unknown": [
                "{volume} sounds from the {object}",
                "the {object} creating {volume} audio",
                "{volume} noises associated with the {object}",
                "the {object} with its {volume} acoustic presence"
            ]
        }

        # Select a template for this category
        templates = sound_templates.get(category, sound_templates["unknown"])
        template = random.choice(templates)

        # Fill in the template
        return template.format(volume=volume, object=object_name)
    def create_audio_prompt(self, object_depths):
        """Build a text prompt, preferring the first usable sound description
        and falling back to the first object's label."""
        if not object_depths:
            return "Environmental ambient sounds."
        for obj in object_depths:
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        return f"Sounds of {object_depths[0]['original_label']}."
    def process_and_generate_audio(self, object_depths, output_path=None, duration=10, steps=25, guidance_scale=4.5):
        self._load_model()
        if not object_depths:
            prompt = "Environmental ambient sounds."
        else:
            # Sort objects by depth so closer objects drive the prompt
            sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
            prompt = self.create_audio_prompt(sorted_objects)
        print(f"Generated audio prompt: {prompt}")

        wave = self.generate_sound(
            prompt,
            steps=steps,
            duration=duration,
            guidance_scale=guidance_scale
        )
        sample_rate = self.model.vae.config.sampling_rate

        if output_path:
            # `wave` is already (channels, samples), which is the 2D layout
            # torchaudio.save expects, so no batch dimension is added
            torchaudio.save(output_path, wave, sample_rate)
            print(f"Audio saved to: {output_path}")
        return wave, sample_rate
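

# A minimal usage sketch (illustrative, not from the original file): the
# detection dicts below use only the keys this class actually reads --
# "original_label", "mean_depth" (smaller = closer), and an optional
# "sound_description". The output filename is hypothetical.
if __name__ == "__main__":
    generator = GenerateAudio()
    detections = [
        {"original_label": "bus", "mean_depth": 2.1,
         "sound_description": "loud engine sounds from the bus"},
        {"original_label": "tree", "mean_depth": 9.7,
         "sound_description": ""},
    ]
    wave, sample_rate = generator.process_and_generate_audio(
        detections,
        output_path="scene_audio.wav",
        duration=10,
    )
    print(f"Generated {wave.shape[-1] / sample_rate:.1f}s of audio at {sample_rate} Hz")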