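"""Entry point for the image-to-soundscape pipeline.

Generates captions for one or more street-level views, maps detected
objects to depth zones, synthesizes audio per view, and (in panoramic
mode) mixes the per-view tracks into a single composition.
"""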
import argparse
import gc
import os

import numpy as np
import torch
import torchaudio
from PIL import Image

from config import LOGS_DIR, OUTPUT_DIR
from DepthEstimator import DepthEstimator
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio


def main():
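    """Parse CLI arguments and run the single-view or panoramic pipeline."""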
    parser = argparse.ArgumentParser(description="Generate sound from panoramic images")
    parser.add_argument("--image_dir", type=str, default=LOGS_DIR, help="Directory containing input images")
    parser.add_argument("--output_dir", type=str, default=OUTPUT_DIR, help="Directory for output files")
    parser.add_argument("--audio_duration", type=int, default=10, help="Duration of generated audio in seconds")
    parser.add_argument("--location", type=str, default="52.3436723,4.8529625",
                        help='Location in format "latitude,longitude" (e.g., "40.7128,-74.0060")')
    parser.add_argument("--view", type=str, default="front", choices=["front", "back", "left", "right"],
                        help="Perspective view to analyze")
    parser.add_argument("--model", type=str, default="intern_2_5-4B", help="Vision-language model to use for analysis")
    parser.add_argument("--cpu_only", action="store_true", help="Force CPU usage even if CUDA is available")
    parser.add_argument("--panoramic", action="store_true",
                        help="Process panoramic images instead of a single image")
    args = parser.parse_args()

    lat, lon = args.location.split(",")
    os.makedirs(args.output_dir, exist_ok=True)

    if args.panoramic:
        print("-----------Processing panoramic images-----------")
        # Generate captions for all views at once with panoramic=True
        view_results = generate_caption(lat, lon, view=args.view, model=args.model,
                                        cpu_only=args.cpu_only, panoramic=True)
        if not view_results:
            print("Failed to generate captions for panoramic views")
            return

        sound_mapper = SoundMapper()
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = [os.path.join(args.image_dir, f) for f in os.listdir(args.image_dir) if f.endswith(".jpg")]
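        # NOTE: this assumes process_depth_maps() returns maps in the same
        # order as the os.listdir() scan above; the index lookups below rely
        # on that pairing.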

        # Create audio generator
        audio_generator = GenerateAudio()
        sound_tracks_dict = {}  # keep track of sound tracks and their weights

        # Process each view
        for i, view_result in enumerate(view_results):
            current_view = view_result["view"]
            print(f"Processing {current_view} view ({i+1}/{len(view_results)})")

            # Find corresponding image path for this view
            image_path = os.path.join(args.image_dir, f"{current_view}.jpg")
            if not os.path.exists(image_path):
                print(f"Warning: Image file {image_path} not found")
                continue

            image_index = [idx for idx, path in enumerate(image_paths)
                           if os.path.basename(path) == f"{current_view}.jpg"]
            if not image_index:
                print(f"Could not find processed map for {current_view} view")
                continue

            depth_map = processed_maps[image_index[0]]["normalization"]
            object_depths = sound_mapper.analyze_object_depths(
                image_path, depth_map, lat, lon,
                caption_data=view_result,
                all_objects=False
            )
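            # Each entry in object_depths is expected to carry at least
            # 'original_label', 'zone_description', 'mean_depth', and 'weight'
            # (the keys consumed below and in the single-image branch).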
            if not object_depths:
                print(f"No objects detected in the {current_view} view")
                continue

            # Generate audio for this view
            output_path = os.path.join(args.output_dir, f"sound_{current_view}.wav")
            print(f"Generating audio for {current_view} view...")
            audio, sample_rate = audio_generator.process_and_generate_audio(
                object_depths,
                duration=args.audio_duration
            )
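            # torchaudio.save expects a 2D (channels, frames) tensor, so squeeze
            # a leading batch dimension or add a channel dimension as needed.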
            if audio.dim() == 3:
                audio = audio.squeeze(0)
            elif audio.dim() == 1:
                audio = audio.unsqueeze(0)
            if audio.dim() != 2:
                raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D")

            torchaudio.save(output_path, audio, sample_rate)
            # object_depths is guaranteed non-empty here (checked above), so
            # the first object's weight can key this view's track directly.
            sound_tracks_dict[output_path] = object_depths[0]["weight"]
            print(f"Generated audio saved to: {output_path}")
            print("-" * 50)
        if sound_tracks_dict:
            print("Composing final audio from all views...")
            compose_audio(
                list(sound_tracks_dict.keys()),
                list(sound_tracks_dict.values()),
                os.path.join(args.output_dir, "panoramic_composition.wav")
            )
            print(f"Final audio composition saved to: {os.path.join(args.output_dir, 'panoramic_composition.wav')}")
        torch.cuda.empty_cache()
        gc.collect()
        del sound_mapper, audio_generator
        gc.collect()
        torch.cuda.empty_cache()
    else:
        print("Processing single image...")
        view_result = generate_caption(lat, lon, view=args.view, model=args.model,
                                       cpu_only=args.cpu_only, panoramic=False)
        if not view_result:
            print("Failed to generate caption for the view")
            return

        image_path = os.path.join(args.image_dir, f"{args.view}.jpg")
        if not os.path.exists(image_path):
            print(f"Error: Image file {image_path} not found")
            return
| print(f"Processing image: {image_path}") | |
| sound_mapper = SoundMapper() | |
| processed_maps = sound_mapper.process_depth_maps() | |
| image_paths = [os.path.join(args.image_dir, f) for f in os.listdir(args.image_dir) if f.endswith(".jpg")] | |
| image_basename = os.path.basename(image_path) | |
| image_index = [i for i, path in enumerate(image_paths) if os.path.basename(path) == image_basename] | |
| if not image_index: | |
| print(f"Could not find processed map for {image_basename}") | |
| return | |
| depth_map = processed_maps[image_index[0]]["normalization"] | |
| print("Detecting objects and their depths...") | |
| object_depths = sound_mapper.analyze_object_depths( | |
| image_path, depth_map, lat, lon, | |
| caption_data=view_result, | |
| all_objects=True | |
| ) | |
| if not object_depths: | |
| print("No objects detected in the image.") | |
| return | |
| print(f"Detected {len(object_depths)} objects:") | |
| for obj in object_depths: | |
| print(f" - {obj['original_label']} (Zone: {obj['zone_description']}, Depth: {obj['mean_depth']:.4f})") | |
| print("Generating audio...") | |
| audio_generator = GenerateAudio() | |
| audio, sample_rate = audio_generator.process_and_generate_audio( | |
| object_depths, | |
| duration=args.audio_duration | |
| ) | |
| if audio.dim() == 3: | |
| audio = audio.squeeze(0) | |
| elif audio.dim() == 1: | |
| audio = audio.unsqueeze(0) | |
| if audio.dim() != 2: | |
| raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D") | |

        output_path = os.path.join(args.output_dir, f"sound_{args.view}.wav")
        torchaudio.save(output_path, audio, sample_rate)
        print(f"Generated audio saved to: {output_path}")


if __name__ == "__main__":
    main()

# Usage:
#   Single image:      python main.py --view front
#   Panoramic images:  python main.py --panoramic
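#
# All parser flags can be combined, e.g. (using the example coordinates from
# the --location help text):
#   python main.py --panoramic --location "40.7128,-74.0060" --audio_duration 15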