#!/usr/bin/env python3
import os
import uuid
import json
import time

import gradio as gr
from PIL import Image

# Import pipeline modules
from utils.transcribe import transcribe_audio, list_available_whisper_models
from utils.segment import segment_lyrics
from utils.prompt_gen import generate_scene_prompts, list_available_llm_models
from utils.video_gen import (
    create_video_segments,
    list_available_image_models,
    list_available_video_models,
    preview_image_generation
)
from utils.glue import stitch_and_caption
# Create output directories if they do not exist
os.makedirs("templates", exist_ok=True)
os.makedirs("templates/minimalist", exist_ok=True)
os.makedirs("tmp", exist_ok=True)

# Load available model options
WHISPER_MODELS = list_available_whisper_models()
DEFAULT_WHISPER_MODEL = "medium.en"
LLM_MODELS = list_available_llm_models()
DEFAULT_LLM_MODEL = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
IMAGE_MODELS = list_available_image_models()
DEFAULT_IMAGE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
VIDEO_MODELS = list_available_video_models()
DEFAULT_VIDEO_MODEL = "stabilityai/stable-video-diffusion-img2vid-xt"
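# Note: the defaults above are Hugging Face Hub model ids; they assume the
# weights are already cached locally or can be downloaded on first use.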

# Default prompt template
DEFAULT_PROMPT_TEMPLATE = """You are a cinematographer generating a scene for a music video.
Describe one vivid visual scene ({max_words} words max) that matches the mood and imagery of these lyrics.
Focus on setting, atmosphere, lighting, and framing. Do not mention the artist or singing.
Use only {max_sentences} sentence(s).
Lyrics: "{lyrics}"
Scene description:"""
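# The template is formatted in two stages: {max_words} and {max_sentences} are
# filled in once up front, while {lyrics} is kept as a placeholder that
# generate_scene_prompts fills per segment (see process_audio below).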

# Prepare style template options by scanning the templates/ directory
TEMPLATE_DIR = "templates"
template_choices = []
for name in os.listdir(TEMPLATE_DIR):
    if os.path.isdir(os.path.join(TEMPLATE_DIR, name)):
        template_choices.append(name)
template_choices = sorted(template_choices)
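# Prefer the bundled "minimalist" style; otherwise fall back to the first
# template found, or None when the directory is empty.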
DEFAULT_TEMPLATE = "minimalist" if "minimalist" in template_choices else (template_choices[0] if template_choices else None)

# Advanced settings defaults
DEFAULT_RESOLUTION = "1024x576"  # default output resolution (width x height)
DEFAULT_FPS_MODE = "Auto"        # auto-match lyric timing
DEFAULT_SEED = 0                 # 0 means random seed
DEFAULT_MAX_WORDS = 30           # default word limit for scene descriptions
DEFAULT_MAX_SENTENCES = 1        # default sentence limit
DEFAULT_CROSSFADE = 0.25         # default crossfade duration in seconds
DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"

# Modes for image generation
IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
DEFAULT_IMAGE_MODE = "Independent"
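# "Independent" generates each scene image from scratch (text-to-image), while
# "Consistent (Img2Img)" is expected to reuse the previous scene as the init
# image so consecutive scenes share a look; the exact behavior lives in
# utils.video_gen, tuned by the strength slider defined in the UI below.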

def process_audio(
    audio_path,
    whisper_model,
    llm_model,
    image_model,
    video_model,
    template_name,
    resolution,
    fps_mode,
    seed,
    prompt_template,
    max_words,
    max_sentences,
    style_suffix,
    image_mode,
    strength,
    crossfade_duration
):
    """
    End-to-end pipeline that generates the music video with kinetic subtitles.

    Returns a tuple (final_video_path, work_dir): the finished video file for
    preview/download, and the per-run working directory with intermediate files.
    """
    # Default progress function just prints to the console
    def progress(percent, desc=""):
        print(f"Progress: {percent}% - {desc}")

    # Input validation
    if not audio_path or not os.path.exists(audio_path):
        raise ValueError("Please provide a valid audio file")
    if not template_name or template_name not in template_choices:
        template_name = DEFAULT_TEMPLATE or "minimalist"

    # Prepare a unique temp directory for this run (avoids conflicts between parallel jobs)
    session_id = str(uuid.uuid4())[:8]
    work_dir = os.path.join("tmp", f"run_{session_id}")
    os.makedirs(work_dir, exist_ok=True)

    # Save parameter settings for debugging
    params = {
        "whisper_model": whisper_model,
        "llm_model": llm_model,
        "image_model": image_model,
        "video_model": video_model,
        "template": template_name,
        "resolution": resolution,
        "fps_mode": fps_mode,
        "seed": seed,
        "max_words": max_words,
        "max_sentences": max_sentences,
        "style_suffix": style_suffix,
        "image_mode": image_mode,
        "strength": strength,
        "crossfade_duration": crossfade_duration
    }
    with open(os.path.join(work_dir, "params.json"), "w") as f:
        json.dump(params, f, indent=2)
    try:
        # 1. Transcription
        progress(0, desc="Transcribing audio with Whisper...")
        try:
            result = transcribe_audio(audio_path, whisper_model)
            if not result or "segments" not in result:
                raise ValueError("Transcription failed - no speech detected")
        except Exception as e:
            raise RuntimeError(f"Audio transcription failed: {str(e)}")
        progress(15, desc="Transcription completed. Segmenting lyrics...")

        # 2. Segmentation
        try:
            segments = segment_lyrics(result)
            if not segments:
                raise ValueError("No valid segments found in transcription")
        except Exception as e:
            raise RuntimeError(f"Audio segmentation failed: {str(e)}")
        progress(25, desc=f"Detected {len(segments)} lyric segments. Generating scene prompts...")

        # 3. Scene-prompt generation
        try:
            # Fill in the word/sentence limits; keep {lyrics} as a per-segment placeholder
            formatted_prompt_template = prompt_template.format(
                max_words=max_words,
                max_sentences=max_sentences,
                lyrics="{lyrics}"
            )
            prompts = generate_scene_prompts(
                segments,
                llm_model=llm_model,
                prompt_template=formatted_prompt_template,
                style_suffix=style_suffix
            )
            if len(prompts) != len(segments):
                raise ValueError(f"Prompt generation mismatch: {len(prompts)} prompts for {len(segments)} segments")
        except Exception as e:
            raise RuntimeError(f"Scene prompt generation failed: {str(e)}")

        # Save generated prompts for display or debugging
        with open(os.path.join(work_dir, "prompts.txt"), "w", encoding="utf-8") as f:
            for i, p in enumerate(prompts):
                f.write(f"Segment {i+1}: {p}\n")
        progress(35, desc="Scene prompts ready. Generating video segments...")

        # Parse resolution with validation
        try:
            if resolution and "x" in resolution.lower():
                width, height = map(int, resolution.lower().split("x"))
                if width <= 0 or height <= 0:
                    raise ValueError("Invalid resolution values")
            else:
                width, height = 1024, 576  # default high resolution
        except (ValueError, TypeError):
            print(f"Warning: Invalid resolution '{resolution}', using default 1024x576")
            width, height = 1024, 576

        # Determine FPS handling
        fps_value = None
        dynamic_fps = True
        if fps_mode and fps_mode.lower() != "auto":
            try:
                fps_value = float(fps_mode)
                if fps_value <= 0:
                    raise ValueError("FPS must be positive")
                dynamic_fps = False
            except (ValueError, TypeError):
                print(f"Warning: Invalid FPS '{fps_mode}', using auto mode")
                fps_value = None
                dynamic_fps = True
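        # Note: with dynamic_fps=True, create_video_segments is expected to pick
        # a per-segment frame rate so each clip's length matches its lyric
        # timing; with a fixed fps_value it renders every clip at that rate.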
        # 4. Image-to-video generation for each segment
        try:
            segment_videos = create_video_segments(
                segments,
                prompts,
                image_model=image_model,
                video_model=video_model,
                width=width,
                height=height,
                dynamic_fps=dynamic_fps,
                base_fps=fps_value,
                seed=seed,
                work_dir=work_dir,
                image_mode=image_mode,
                strength=strength,
                progress_callback=None
            )
            if not segment_videos:
                raise ValueError("No video segments were generated")
        except Exception as e:
            raise RuntimeError(f"Video generation failed: {str(e)}")
        progress(80, desc="Video segments generated. Stitching and adding subtitles...")

        # 5. Concatenation and audio syncing, plus kinetic subtitle overlay
        try:
            final_video_path = stitch_and_caption(
                segment_videos,
                audio_path,
                segments,
                template_name,
                work_dir=work_dir,
                crossfade_duration=crossfade_duration
            )
            if not final_video_path or not os.path.exists(final_video_path):
                raise ValueError("Final video file was not created")
        except Exception as e:
            raise RuntimeError(f"Video stitching and captioning failed: {str(e)}")

        progress(100, desc="✅ Generation complete!")
        return final_video_path, work_dir

    except Exception as e:
        # Enhanced error reporting
        error_msg = str(e)
        if "CUDA" in error_msg or "GPU" in error_msg:
            error_msg += "\n\n💡 Tip: This application requires a CUDA-compatible GPU with sufficient VRAM."
        elif "model" in error_msg.lower():
            error_msg += "\n\n💡 Tip: Model loading failed. Check your internet connection and try again."
        elif "audio" in error_msg.lower():
            error_msg += "\n\n💡 Tip: Please ensure your audio file is in a supported format (MP3, WAV, M4A)."
        print(f"Error during processing: {error_msg}")
        raise RuntimeError(error_msg)
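
# A minimal headless invocation, useful for smoke-testing the pipeline without
# the UI. The audio path is illustrative; all other values are the defaults
# defined above:
#
#   video_path, run_dir = process_audio(
#       "sample.mp3", DEFAULT_WHISPER_MODEL, DEFAULT_LLM_MODEL,
#       DEFAULT_IMAGE_MODEL, DEFAULT_VIDEO_MODEL, DEFAULT_TEMPLATE,
#       DEFAULT_RESOLUTION, DEFAULT_FPS_MODE, DEFAULT_SEED,
#       DEFAULT_PROMPT_TEMPLATE, DEFAULT_MAX_WORDS, DEFAULT_MAX_SENTENCES,
#       DEFAULT_STYLE_SUFFIX, DEFAULT_IMAGE_MODE, 0.5, DEFAULT_CROSSFADE
#   )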

# Define Gradio UI components
with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 Audio → Kinetic-Subtitle Music Video

    Transform your audio tracks into dynamic music videos with AI-generated scenes and animated subtitles.

    **✨ Features:**
    - 🎤 **Whisper Transcription** - Accurate speech-to-text with word-level timing
    - 🧠 **AI Scene Generation** - LLM-powered visual descriptions from lyrics
    - 🎨 **Image & Video AI** - Stable Diffusion + video diffusion models
    - 🎬 **Kinetic Subtitles** - Animated text synchronized with audio
    - ⚡ **Fully Local** - No API keys required, runs on your GPU

    **🚀 Quick Start:**
    1. Upload an audio file (MP3, WAV, M4A)
    2. Choose your AI models (or keep defaults)
    3. Customize style and settings
    4. Click "Generate Music Video"
    """)

    # System requirements info
    with gr.Accordion("💻 System Requirements & Tips", open=False):
        gr.Markdown("""
        **Hardware Requirements:**
        - NVIDIA GPU with 8 GB+ VRAM (recommended: RTX 3080/4070 or better)
        - 16 GB+ system RAM
        - Fast storage (SSD recommended)

        **Supported Audio Formats:**
        - MP3, WAV, M4A, FLAC, OGG
        - Recommended: clear vocals, 30 seconds to 3 minutes

        **Performance Tips:**
        - Use a lower resolution (512x288) for faster generation
        - Choose smaller models for quicker processing
        - Ensure a stable power supply for GPU-intensive tasks
        """)
    # Main configuration
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="🎵 Upload Audio Track",
                type="filepath",
                info="Upload your music file. For best results, use clear audio with distinct vocals."
            )
        with gr.Column():
            # Quick settings panel
            gr.Markdown("### ⚡ Quick Settings")
            quick_quality = gr.Radio(
                choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
                value="Balanced (1024x576)",
                label="Quality Preset",
                info="Higher quality = better results but slower generation"
            )

    # Model selection tabs
    with gr.Tabs():
        with gr.TabItem("🤖 AI Models"):
            gr.Markdown("**Choose the AI models for each processing step:**")
            with gr.Row():
                with gr.Column():
                    whisper_dropdown = gr.Dropdown(
                        label="🎤 Transcription Model (Whisper)",
                        choices=WHISPER_MODELS,
                        value=DEFAULT_WHISPER_MODEL,
                        info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
                    )
                    llm_dropdown = gr.Dropdown(
                        label="🧠 Scene Description Model (LLM)",
                        choices=LLM_MODELS,
                        value=DEFAULT_LLM_MODEL,
                        info="Language model to generate visual scene descriptions from lyrics."
                    )
                with gr.Column():
                    image_dropdown = gr.Dropdown(
                        label="🎨 Image Generation Model",
                        choices=IMAGE_MODELS,
                        value=DEFAULT_IMAGE_MODEL,
                        info="Stable Diffusion model for generating scene images."
                    )
                    video_dropdown = gr.Dropdown(
                        label="🎬 Video Animation Model",
                        choices=VIDEO_MODELS,
                        value=DEFAULT_VIDEO_MODEL,
                        info="Model to animate still images into video clips."
                    )

        with gr.TabItem("✍️ Scene Prompting"):
            gr.Markdown("**Customize how the AI generates scene descriptions:**")
            with gr.Column():
                prompt_template_input = gr.Textbox(
                    label="LLM Prompt Template",
                    value=DEFAULT_PROMPT_TEMPLATE,
                    lines=6,
                    info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
                )
                with gr.Row():
                    max_words_input = gr.Slider(
                        label="Max Words per Scene",
                        minimum=10,
                        maximum=100,
                        step=5,
                        value=DEFAULT_MAX_WORDS,
                        info="Limit words in each scene description (more words = more detailed scenes)."
                    )
                    max_sentences_input = gr.Slider(
                        label="Max Sentences per Scene",
                        minimum=1,
                        maximum=5,
                        step=1,
                        value=DEFAULT_MAX_SENTENCES,
                        info="Limit sentences per scene (1-2 recommended for music videos)."
                    )
                style_suffix_input = gr.Textbox(
                    label="Visual Style Keywords",
                    value=DEFAULT_STYLE_SUFFIX,
                    info="Style keywords appended to all scenes for a consistent visual style (e.g., 'cinematic, vibrant colors')."
                )

        with gr.TabItem("🎬 Video Settings"):
            gr.Markdown("**Configure video output and subtitle styling:**")
            with gr.Column():
                with gr.Row():
                    template_dropdown = gr.Dropdown(
                        label="🎪 Subtitle Animation Style",
                        choices=template_choices,
                        value=DEFAULT_TEMPLATE,
                        info="Choose the kinetic subtitle animation style."
                    )
                    res_dropdown = gr.Dropdown(
                        label="📺 Video Resolution",
                        choices=["512x288", "1024x576", "1280x720"],
                        value=DEFAULT_RESOLUTION,
                        info="Higher resolution = better quality but much slower generation."
                    )
                with gr.Row():
                    fps_input = gr.Textbox(
                        label="🎞️ Video FPS",
                        value=DEFAULT_FPS_MODE,
                        info="Frames per second. Use 'Auto' to match lyric timing, or set a fixed value (e.g., '24', '30')."
                    )
                    seed_input = gr.Number(
                        label="🌱 Random Seed",
                        value=DEFAULT_SEED,
                        precision=0,
                        info="Seed for reproducible results (0 = random). Reuse the same seed to recreate results."
                    )
                with gr.Row():
                    image_mode_input = gr.Radio(
                        label="🖼️ Scene Generation Mode",
                        choices=IMAGE_MODES,
                        value=DEFAULT_IMAGE_MODE,
                        info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
                    )
                    strength_slider = gr.Slider(
                        label="🎯 Style Consistency Strength",
                        minimum=0.1,
                        maximum=0.9,
                        step=0.05,
                        value=0.5,
                        visible=False,
                        info="How much each scene influences the next (lower = more influence, higher = more variety)."
                    )
                    crossfade_slider = gr.Slider(
                        label="🔄 Scene Transition Duration",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.05,
                        value=DEFAULT_CROSSFADE,
                        info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
                    )
    # Quick preset handling
    def apply_quality_preset(preset):
        if preset == "Fast (512x288)":
            return gr.update(value="512x288"), gr.update(value="tiny"), gr.update(value="stabilityai/sdxl-turbo")
        elif preset == "High Quality (1280x720)":
            return gr.update(value="1280x720"), gr.update(value="large"), gr.update(value="stabilityai/stable-diffusion-xl-base-1.0")
        else:  # Balanced
            return gr.update(value="1024x576"), gr.update(value="medium.en"), gr.update(value="stabilityai/stable-diffusion-xl-base-1.0")
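    # The three updates are positional: they must stay in the same order as the
    # outputs list of the .change() binding below (resolution, Whisper model,
    # image model).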
    quick_quality.change(
        apply_quality_preset,
        inputs=[quick_quality],
        outputs=[res_dropdown, whisper_dropdown, image_dropdown]
    )

    # Make the strength slider visible only when Consistent mode is selected
    def update_strength_visibility(mode):
        return gr.update(visible=(mode == "Consistent (Img2Img)"))

    image_mode_input.change(update_strength_visibility, inputs=image_mode_input, outputs=strength_slider)
    # Enhanced preview section
    with gr.Row():
        with gr.Column(scale=1):
            preview_btn = gr.Button("🔍 Preview First Scene", variant="secondary", size="lg")
            gr.Markdown("*Generate a quick preview of the first scene to test your settings*")
        with gr.Column(scale=2):
            generate_btn = gr.Button("🎬 Generate Complete Music Video", variant="primary", size="lg")
            gr.Markdown("*Start the full video generation process (this may take several minutes)*")

    # Preview results
    with gr.Row(visible=False) as preview_row:
        with gr.Column():
            preview_img = gr.Image(label="Preview Scene", type="pil", height=300)
        with gr.Column():
            preview_prompt = gr.Textbox(label="Generated Scene Description", lines=3)
            preview_info = gr.Markdown("")
    # Status reporting (updated from the generation handler)
    status_text = gr.Textbox(
        label="📊 Generation Status",
        value="Ready to start...",
        interactive=False,
        lines=2
    )
    # Results section
    with gr.Tabs():
        with gr.TabItem("🎥 Final Video"):
            output_video = gr.Video(label="Generated Music Video", format="mp4", height=400)
            with gr.Row():
                download_file = gr.File(label="📥 Download Video File", file_count="single")
                video_info = gr.Textbox(label="Video Information", lines=2, interactive=False)
        with gr.TabItem("🖼️ Generated Images"):
            image_gallery = gr.Gallery(
                label="Scene Images from Video Generation",
                columns=3,
                rows=2,
                height="auto",
                object_fit="contain",
                show_label=True
            )
            gallery_info = gr.Markdown("*Scene images will appear here after generation*")
        with gr.TabItem("📝 Scene Descriptions"):
            with gr.Accordion("Generated Scene Prompts", open=True):
                prompt_text = gr.Markdown("", elem_id="prompt_markdown")
            segment_info = gr.Textbox(
                label="Segmentation Summary",
                lines=3,
                interactive=False,
                placeholder="Segment analysis will appear here..."
            )
    # Preview function
    def on_preview(
        audio, whisper_model, llm_model, image_model,
        prompt_template, max_words, max_sentences, style_suffix, resolution
    ):
        if not audio:
            return (gr.update(visible=False), None, "Please upload audio first",
                    "⚠️ **No audio file provided**\n\nPlease upload an audio file to generate a preview.")

        # Quick transcription and segmentation of the first few seconds
        import subprocess
        import tempfile
        try:
            # Extract the first 10 seconds of audio for a quick preview
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
                temp_audio_path = temp_audio.name
            try:
                # Use ffmpeg to extract the first 10 seconds
                subprocess.run([
                    "ffmpeg", "-y", "-i", audio, "-ss", "0", "-t", "10",
                    "-acodec", "pcm_s16le", temp_audio_path
                ], check=True, capture_output=True)
                # Transcribe with the fastest model for the preview
                result = transcribe_audio(temp_audio_path, "tiny")
                segments = segment_lyrics(result)
            finally:
                # Clean up the temp file even if extraction or transcription fails
                if os.path.exists(temp_audio_path):
                    os.unlink(temp_audio_path)

            if not segments:
                return (gr.update(visible=False), None, "No speech detected in first 10 seconds",
                        "⚠️ **No speech detected**\n\nTry with audio that has clear vocals at the beginning.")

            first_segment = segments[0]

            # Format the prompt template
            formatted_prompt = prompt_template.format(
                max_words=max_words,
                max_sentences=max_sentences,
                lyrics=first_segment["text"]
            )

            # Generate the scene prompt
            scene_prompt = generate_scene_prompts(
                [first_segment],
                llm_model=llm_model,
                prompt_template=formatted_prompt,
                style_suffix=style_suffix
            )[0]

            # Generate the image
            if resolution and "x" in resolution.lower():
                width, height = map(int, resolution.lower().split("x"))
            else:
                width, height = 1024, 576
            image = preview_image_generation(
                scene_prompt,
                image_model=image_model,
                width=width,
                height=height
            )

            # Create the info text
            duration = first_segment['end'] - first_segment['start']
            info_text = f"""
✅ **Preview Generated Successfully**

**Detected Lyrics:** "{first_segment['text'][:100]}{'...' if len(first_segment['text']) > 100 else ''}"

**Scene Duration:** {duration:.1f} seconds

**Generated Description:** {scene_prompt[:150]}{'...' if len(scene_prompt) > 150 else ''}

**Image Resolution:** {width}x{height}
"""
            return gr.update(visible=True), image, scene_prompt, info_text

        except subprocess.CalledProcessError:
            return (gr.update(visible=False), None, "Audio processing failed",
                    "❌ **Audio Processing Error**\n\nFFmpeg failed to process the audio file. Please check the format.")
        except Exception as e:
            print(f"Preview error: {e}")
            return (gr.update(visible=False), None, f"Preview failed: {str(e)}",
                    f"❌ **Preview Error**\n\n{str(e)}\n\nPlease check your audio file and model settings.")
    # Bind the generate button to the processing function
    def on_generate(
        audio, whisper_model, llm_model, image_model, video_model,
        template_name, resolution, fps, seed, prompt_template,
        max_words, max_sentences, style_suffix, image_mode, strength,
        crossfade_duration
    ):
        if not audio:
            return (None, None, gr.update(value="**No audio file provided**\n\nPlease upload an audio file to start generation.", visible=True),
                    [], "Ready to start...", "", "",
                    "*Scene images will appear here after generation*")
        try:
            # Start generation
            start_time = time.time()
            final_path, work_dir = process_audio(
                audio, whisper_model, llm_model, image_model, video_model,
                template_name, resolution, fps, int(seed), prompt_template,
                max_words, max_sentences, style_suffix, image_mode, strength,
                crossfade_duration
            )
            generation_time = time.time() - start_time

            # Load prompts from file for display
            prompts_file = os.path.join(work_dir, "prompts.txt")
            prompts_markdown = ""
            try:
                with open(prompts_file, 'r', encoding='utf-8') as pf:
                    content = pf.read()
                # Bold each "Segment N: ..." line for Markdown display
                prompts_lines = content.strip().splitlines()
                prompts_markdown = "\n\n".join([f"**{line}**" for line in prompts_lines])
            except Exception:
                prompts_markdown = "Scene prompts not available"

            # Build the generation summary (compute file size up front so it is
            # also available for the video info text below)
            file_size = os.path.getsize(final_path) / (1024 * 1024)  # MB
            segment_summary = ""
            try:
                # Get the audio duration via ffprobe
                import subprocess
                duration_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                                "-of", "default=noprint_wrappers=1:nokey=1", audio]
                audio_duration = float(subprocess.check_output(duration_cmd, text=True).strip())
                segment_summary = f"""📊 **Generation Summary:**
• Audio Duration: {audio_duration:.1f} seconds
• Processing Time: {generation_time/60:.1f} minutes
• Final Video Size: {file_size:.1f} MB
• Resolution: {resolution}
• Template: {template_name}"""
            except Exception:
                segment_summary = f"Generation completed in {generation_time/60:.1f} minutes"

            # Load generated images for the gallery
            images = []
            try:
                import glob
                image_files = glob.glob(os.path.join(work_dir, "*_img.png"))
                for img_file in sorted(image_files):
                    try:
                        images.append(Image.open(img_file))
                    except Exception:
                        pass
            except Exception as e:
                print(f"Error loading images for gallery: {e}")

            # Create the video info text (local name avoids shadowing the video_info component)
            video_info_text = f"✅ Video generated successfully!\nFile: {os.path.basename(final_path)}\nSize: {file_size:.1f} MB"
            gallery_info_text = f"**{len(images)} scene images generated**" if images else "No images available"

            return (final_path, final_path, gr.update(value=prompts_markdown, visible=True),
                    images, f"✅ Generation complete! ({generation_time/60:.1f} minutes)",
                    video_info_text, segment_summary, gallery_info_text)
        except Exception as e:
            error_msg = str(e)
            print(f"Generation error: {e}")
            import traceback
            traceback.print_exc()
            return (None, None, gr.update(value=f"**❌ Generation Failed**\n\n{error_msg}", visible=True),
                    [], f"❌ Error: {error_msg}", "", "",
                    "*Scene images will appear here after generation*")
    preview_btn.click(
        on_preview,
        inputs=[
            audio_input, whisper_dropdown, llm_dropdown, image_dropdown,
            prompt_template_input, max_words_input, max_sentences_input,
            style_suffix_input, res_dropdown
        ],
        outputs=[preview_row, preview_img, preview_prompt, preview_info]
    )

    generate_btn.click(
        on_generate,
        inputs=[
            audio_input, whisper_dropdown, llm_dropdown, image_dropdown, video_dropdown,
            template_dropdown, res_dropdown, fps_input, seed_input, prompt_template_input,
            max_words_input, max_sentences_input, style_suffix_input,
            image_mode_input, strength_slider, crossfade_slider
        ],
        outputs=[output_video, download_file, prompt_text, image_gallery, status_text,
                 video_info, segment_info, gallery_info]
    )

if __name__ == "__main__":
    # Uncomment for custom hosting options:
    # demo.launch(server_name="0.0.0.0", server_port=7860)
    demo.launch()