ItsMpilo committed on
Commit
7a4d04a
·
verified ·
1 Parent(s): 2647079

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +535 -0
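The commit message notes that the file was pushed with `huggingface_hub`. A minimal sketch of such an upload (the repo id below is a placeholder, not taken from this commit):

```python
# Hypothetical reproduction of an "Upload app.py with huggingface_hub" commit.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="app.py",                 # local file to push
    path_in_repo="app.py",                    # destination path inside the repo
    repo_id="your-username/your-space",       # placeholder repo id
    repo_type="space",                        # this commit targets a Space, not a model repo
    commit_message="Upload app.py with huggingface_hub",
)
```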
app.py ADDED
@@ -0,0 +1,535 @@
1
+ I'll create an audio-driven video generation app that supports both single-person and two-person (conversational) modes. It builds on a diffusers video pipeline and conditions each generated segment on coarse features extracted from the uploaded audio (energy, a tempo proxy, and spectral centroid); true lip synchronization is out of scope for this simplified demo.
2
+
3
+ ```python
4
+ # app.py
5
+ import gradio as gr
6
+ import numpy as np
7
+ import spaces
8
+ import torch
9
+ from diffusers import DiffusionPipeline, DDIMScheduler
10
+ from diffusers.utils import export_to_video
11
+ import time
12
+ from typing import List, Tuple, Optional
13
+ import tempfile
14
+ import os
15
+ from PIL import Image
16
+
17
+ # Configuration
18
+ MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
19
+ MAX_DURATION = 120 # 2 minutes in seconds
20
+ AUDIO_SAMPLE_RATE = 16000
21
+
22
+ class VideoGenerator:
23
+ def __init__(self):
24
+ self.pipe = None
25
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
26
+ self._load_model()
27
+
28
+ @spaces.GPU(duration=1500)
29
+ def _load_model(self):
30
+ """Load and compile the video generation model with AoT optimization"""
31
+ print("Loading video generation model...")
32
+ self.pipe = DiffusionPipeline.from_pretrained(
33
+ MODEL_ID,
34
+ torch_dtype=torch.float16,
35
+ variant="fp16"
36
+ )
37
+ self.pipe.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
38
+ self.pipe = self.pipe.to(self.device)
39
+
40
+ # AoT Compilation for performance boost
41
+ print("Applying AoT compilation...")
42
+ with spaces.aoti_capture(self.pipe.unet) as call:
43
+ # Create dummy inputs for compilation
44
+ dummy_prompt = "person talking"
45
+ dummy_image = Image.new('RGB', (512, 512), color='white')
46
+ self.pipe(
47
+ prompt=dummy_prompt,
48
+ image=dummy_image,
49
+ num_inference_steps=1,
50
+ height=512,
51
+ width=512,
52
+ num_frames=4
53
+ )
54
+
55
+ # Export and compile the UNet
56
+ exported = torch.export.export(
57
+ self.pipe.unet,
58
+ args=call.args,
59
+ kwargs=call.kwargs,
60
+ )
61
+ compiled_unet = spaces.aoti_compile(exported)
62
+
63
+ # Apply compiled model back to pipeline
64
+ spaces.aoti_apply(compiled_unet, self.pipe.unet)
65
+ print("Model loaded and compiled successfully!")
66
+
67
+ def generate_video_segment(
68
+ self,
69
+ prompt: str,
70
+ reference_image: Optional[np.ndarray],
71
+ audio_features: dict,
72
+ duration: int,
73
+ fps: int = 24
74
+ ) -> List[np.ndarray]:
75
+ """Generate a video segment with audio-driven animation"""
76
+ if self.pipe is None:
77
+ raise gr.Error("Model not loaded. Please wait...")
78
+
79
+ num_frames = int(duration * fps)
80
+
81
+ # Prepare initial frame from reference image or create default
82
+ if reference_image is not None:
83
+ initial_frame = Image.fromarray(reference_image)
84
+ else:
85
+ initial_frame = Image.new('RGB', (512, 512), color='white')
86
+
87
+ # Generate video frames with audio conditioning
88
+ print(f"Generating {duration}s video with {num_frames} frames...")
89
+
90
+ frames = []
91
+ for i in range(0, num_frames, 8): # Generate in chunks of 8 frames
92
+ chunk_frames = min(8, num_frames - i)
93
+
94
+ # Audio-driven conditioning (simplified - in production use actual audio features)
95
+ audio_conditioning = {
96
+ "tempo": audio_features.get("tempo", 120),
97
+ "energy": audio_features.get("energy", 0.5),
98
+ "pitch": audio_features.get("pitch", 0.5)
99
+ }
100
+
101
+ # Generate frames with diffusion pipeline
102
+ output = self.pipe(
103
+ prompt=f"{prompt}, {audio_conditioning['tempo']} BPM tempo, realistic face, lip sync",
104
+ image=initial_frame,
105
+ num_inference_steps=25,
106
+ height=512,
107
+ width=512,
108
+ num_frames=chunk_frames,
109
+ guidance_scale=7.5,
110
+ generator=torch.Generator().manual_seed(42 + i)
111
+ )
112
+
113
+ # Extract frames
114
+ for j in range(chunk_frames):
115
+ frame = output.frames[0][j]
116
+ frame_array = np.array(frame)
117
+ frames.append(frame_array)
118
+
119
+ return frames
120
+
121
+ # Initialize global generator
122
+ generator = VideoGenerator()
123
+
124
+ def extract_audio_features(audio_data: Tuple[int, np.ndarray]) -> dict:
125
+ """Extract basic features from audio for conditioning"""
126
+ if audio_data is None:
+ return {"tempo": 120, "energy": 0.5, "pitch": 0.5}
127
+ sample_rate, audio = audio_data
+ # Gradio delivers PCM audio (often int16, possibly stereo); convert to mono float in [-1, 1]
+ if np.issubdtype(audio.dtype, np.integer):
+ audio = audio.astype(np.float32) / 32768.0
+ if audio.ndim > 1:
+ audio = audio.mean(axis=1)
128
+
129
+ if audio.size == 0:
+ return {"tempo": 120, "energy": 0.5, "pitch": 0.5}
130
+
131
+ # Calculate energy (RMS)
132
+ energy = np.sqrt(np.mean(audio**2))
133
+ energy_normalized = min(1.0, energy / 0.1) # Normalize
134
+
135
+ # Rough tempo proxy from the zero-crossing rate (not a real pitch or beat tracker)
136
+ zero_crossings = np.where(np.diff(np.sign(audio)))[0]
137
+ estimated_freq = len(zero_crossings) / (len(audio) / sample_rate) * 60 # crossings per minute
138
+ tempo = np.clip(estimated_freq, 60, 200)
139
+
140
+ # Simple spectral centroid for pitch estimation
141
+ fft = np.fft.fft(audio)
142
+ magnitude = np.abs(fft[:len(fft)//2])
143
+ freqs = np.fft.fftfreq(len(fft), 1/sample_rate)[:len(fft)//2]
144
+ spectral_centroid = np.sum(freqs * magnitude) / (np.sum(magnitude) + 1e-10)
145
+ pitch_normalized = min(1.0, spectral_centroid / 2000)
146
+
147
+ return {
148
+ "tempo": tempo,
149
+ "energy": energy_normalized,
150
+ "pitch": pitch_normalized
151
+ }
152
+
153
+ @spaces.GPU(duration=180)
154
+ def generate_conversational_video(
155
+ audio_1: Tuple[int, np.ndarray],
156
+ prompt_1: str,
157
+ audio_2: Optional[Tuple[int, np.ndarray]] = None,
158
+ prompt_2: Optional[str] = None,
159
+ reference_image_1: Optional[np.ndarray] = None,
160
+ reference_image_2: Optional[np.ndarray] = None,
161
+ duration: int = 30,
162
+ mode: str = "single",
163
+ fps: int = 24,
164
+ progress=gr.Progress()
165
+ ) -> str:
166
+ """Generate conversational video from audio inputs"""
167
+
168
+ try:
169
+ progress(0.1, desc="Processing audio inputs...")
170
+
171
+ # Extract features from audio(s)
172
+ audio_features_1 = extract_audio_features(audio_1)
173
+ # Keep this defined even when no second audio is supplied (avoids a NameError below)
+ audio_features_2 = extract_audio_features(audio_2) if audio_2 is not None else None
175
+
176
+ progress(0.2, desc="Initializing video generation...")
177
+
178
+ # Generate video segments based on mode
179
+ if mode == "single":
180
+ progress(0.3, desc="Generating single-person video...")
181
+ frames = generator.generate_video_segment(
182
+ prompt=prompt_1,
183
+ reference_image=reference_image_1,
184
+ audio_features=audio_features_1,
185
+ duration=duration,
186
+ fps=fps
187
+ )
188
+ else: # multi-person conversation
189
+ progress(0.25, desc="Generating person 1 video...")
190
+ frames_1 = generator.generate_video_segment(
191
+ prompt=f"Person 1: {prompt_1}",
192
+ reference_image=reference_image_1,
193
+ audio_features=audio_features_1,
194
+ duration=duration//2,
195
+ fps=fps
196
+ )
197
+
198
+ progress(0.5, desc="Generating person 2 video...")
199
+ frames_2 = generator.generate_video_segment(
200
+ prompt=f"Person 2: {prompt_2 or 'Responding'}",
201
+ reference_image=reference_image_2,
202
+ audio_features=audio_features_2 or {"tempo": 120, "energy": 0.5, "pitch": 0.5},
203
+ duration=duration//2,
204
+ fps=fps
205
+ )
206
+
207
+ progress(0.7, desc="Combining conversation...")
208
+ # Alternate frames between the two speakers (a simple A/B cut every frame; a real edit would switch on utterance boundaries)
209
+ frames = []
210
+ for i in range(min(len(frames_1), len(frames_2))):
211
+ frames.extend([frames_1[i], frames_2[i]])
212
+
213
+ progress(0.9, desc="Rendering video...")
214
+
215
+ # Create temporary file for video
216
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
217
+ video_path = tmp_file.name
218
+
219
+ # Export frames to video
220
+ export_to_video(frames, video_path, fps=fps)
221
+
222
+ progress(1.0, desc="Video generation complete!")
223
+ return video_path
224
+
225
+ except Exception as e:
226
+ raise gr.Error(f"Video generation failed: {str(e)}")
227
+
228
+ def create_reference_image_from_prompt(prompt: str, seed: int = 42) -> np.ndarray:
229
+ """Create a reference image from text prompt"""
230
+ @spaces.GPU(duration=30)
231
+ def generate_image():
232
+ # Use a simple image generation for reference
233
+ from diffusers import StableDiffusionPipeline
234
+
235
+ img_pipe = StableDiffusionPipeline.from_pretrained(
236
+ "runwayml/stable-diffusion-v1-5",
237
+ torch_dtype=torch.float16
238
+ ).to("cuda")
239
+
240
+ image = img_pipe(
241
+ prompt=f"portrait of {prompt}, photorealistic, neutral expression",
242
+ num_inference_steps=20,
243
+ guidance_scale=7.5,
244
+ generator=torch.Generator().manual_seed(seed)
245
+ ).images[0]
246
+
247
+ return np.array(image)
248
+
249
+ return generate_image()
250
+
251
+ # Gradio Interface
252
+ with gr.Blocks(
253
+ title="Audio-Driven Conversational Video Generator",
254
+ description="Generate realistic conversational videos from audio inputs with up to 2 minutes duration",
255
+ theme=gr.themes.Soft(),
256
+ css="""
257
+ .header { text-align: center; margin-bottom: 2rem; }
258
+ .mode-toggle { margin: 1rem 0; }
259
+ .person-section { border: 1px solid #e0e0e0; border-radius: 8px; padding: 1rem; margin: 1rem 0; }
260
+ .warning { background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 4px; padding: 0.75rem; margin: 0.5rem 0; }
261
+ .success { background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 4px; padding: 0.75rem; margin: 0.5rem 0; }
+ .info { background-color: #d1ecf1; border: 1px solid #bee5eb; border-radius: 4px; padding: 0.75rem; margin: 0.5rem 0; }
262
+ """
263
+ ) as demo:
264
+
265
+ gr.HTML("""
266
+ <div class="header">
267
+ <h1>🎬 Audio-Driven Conversational Video Generator</h1>
268
+ <p>Generate realistic talking videos from audio with support for single and multi-person conversations</p>
269
+ <p><strong>Built with anycoder</strong> - <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Advanced AI Video Generation</a></p>
270
+ </div>
271
+ """)
272
+
273
+ with gr.Row():
274
+ mode = gr.Radio(
275
+ choices=["single", "multi-person"],
276
+ value="single",
277
+ label="Generation Mode",
278
+ info="Choose between single person or conversational video"
279
+ )
280
+
281
+ duration = gr.Slider(
282
+ minimum=5,
283
+ maximum=MAX_DURATION,
284
+ value=30,
285
+ step=5,
286
+ label="Duration (seconds)",
287
+ info="Video length up to 2 minutes"
288
+ )
289
+
290
+ fps = gr.Slider(
291
+ minimum=12,
292
+ maximum=30,
293
+ value=24,
294
+ step=1,
295
+ label="FPS",
296
+ info="Frames per second for output video"
297
+ )
298
+
299
+ # Person 1 inputs
300
+ with gr.Group(elem_classes="person-section"):
301
+ gr.Markdown("### πŸ‘€ Person 1")
302
+
303
+ with gr.Row():
304
+ audio_1 = gr.Audio(
305
+ sources=["upload", "microphone"],
306
+ type="numpy",
307
+ label="Audio Input 1",
308
+ info="Upload audio file or record directly"
309
+ )
310
+
311
+ ref_img_1 = gr.Image(
312
+ sources=["upload"],
313
+ type="numpy",
314
+ label="Reference Image 1 (Optional)",
315
+ info="Upload a reference image for the first person"
316
+ )
317
+
318
+ prompt_1 = gr.Textbox(
319
+ label="Prompt for Person 1",
320
+ placeholder="Describe the first person (e.g., 'young woman, professional attire')",
321
+ value="friendly person speaking naturally"
322
+ )
323
+
324
+ with gr.Row():
325
+ generate_ref_1 = gr.Button("Generate Reference Image 1", size="sm")
326
+ use_placeholder_1 = gr.Button("Use Default Avatar 1", size="sm")
327
+
328
+ # Person 2 inputs (for multi-person mode)
329
+ with gr.Group(elem_classes="person-section", visible=False) as person_2_section:
330
+ gr.Markdown("### πŸ‘₯ Person 2")
331
+
332
+ with gr.Row():
333
+ audio_2 = gr.Audio(
334
+ sources=["upload", "microphone"],
335
+ type="numpy",
336
+ label="Audio Input 2",
337
+ info="Upload or record second person's audio"
338
+ )
339
+
340
+ ref_img_2 = gr.Image(
341
+ sources=["upload"],
342
+ type="numpy",
343
+ label="Reference Image 2 (Optional)",
344
+ info="Upload a reference image for the second person"
345
+ )
346
+
347
+ prompt_2 = gr.Textbox(
348
+ label="Prompt for Person 2",
349
+ placeholder="Describe the second person",
350
+ value="friendly person responding"
351
+ )
352
+
353
+ with gr.Row():
354
+ generate_ref_2 = gr.Button("Generate Reference Image 2", size="sm")
355
+ use_placeholder_2 = gr.Button("Use Default Avatar 2", size="sm")
356
+
357
+ # Generation controls
358
+ with gr.Row():
359
+ generate_btn = gr.Button(
360
+ "πŸŽ₯ Generate Video",
361
+ variant="primary",
362
+ size="lg"
363
+ )
364
+
365
+ stop_btn = gr.Button("⏹ Stop Generation", variant="stop", size="lg", visible=False)
366
+
367
+ # Output
368
+ video_output = gr.Video(
369
+ label="Generated Conversational Video",
370
+ autoplay=True,
371
+ show_label=True,
372
+ show_share_button=True,
373
+ show_download_button=True
374
+ )
375
+
376
+ # Status and info
377
+ status_info = gr.HTML(
378
+ value='<div class="info">πŸ”§ Model loading... This may take a few minutes for initial setup.</div>',
379
+ label="Status"
380
+ )
381
+
382
+ # Example gallery
383
+ gr.Examples(
384
+ examples=[
385
+ [
386
+ "single",
387
+ 30,
388
+ 24,
389
+ None, # Will use default audio
390
+ "professional presenter in business attire",
391
+ None,
392
+ None
393
+ ],
394
+ [
395
+ "multi-person",
396
+ 60,
397
+ 24,
398
+ None,
399
+ "casual young woman",
400
+ None,
401
+ "casual young man"
402
+ ]
403
+ ],
404
+ inputs=[mode, duration, fps, audio_1, prompt_1, audio_2, prompt_2],
405
+ cache_examples=False
406
+ )
407
+
408
+ # Event handlers
409
+ def toggle_mode(selected_mode):
410
+ """Show/hide person 2 section based on mode"""
411
+ if selected_mode == "multi-person":
412
+ return gr.update(visible=True), gr.update(value="🎥 Generate Conversation")
413
+ else:
414
+ return gr.update(visible=False), gr.update(value="🎥 Generate Video")
415
+
416
+ mode.change(
417
+ toggle_mode,
418
+ inputs=[mode],
419
+ outputs=[person_2_section, generate_btn]
420
+ )
421
+
422
+ # Generate reference images
423
+ generate_ref_1.click(
424
+ create_reference_image_from_prompt,
425
+ inputs=[prompt_1],
426
+ outputs=[ref_img_1]
427
+ ).then(
428
+ lambda: gr.update(value='<div class="success">βœ… Reference image generated for Person 1</div>'),
429
+ outputs=[status_info]
430
+ )
431
+
432
+ generate_ref_2.click(
433
+ create_reference_image_from_prompt,
434
+ inputs=[prompt_2],
435
+ outputs=[ref_img_2]
436
+ ).then(
437
+ lambda: gr.update(value='<div class="success">βœ… Reference image generated for Person 2</div>'),
438
+ outputs=[status_info]
439
+ )
440
+
441
+ # Use default avatars
442
+ def create_default_avatar(person_id: int):
443
+ """Create a simple default avatar"""
444
+ color_map = {1: "#FFE4E1", 2: "#E1F4FF"}
445
+ avatar = Image.new('RGB', (256, 256), color=color_map.get(person_id, "#FFFFFF"))
446
+
447
+ # Add simple face features
448
+ from PIL import ImageDraw
449
+ draw = ImageDraw.Draw(avatar)
450
+
451
+ # Simple face outline
452
+ draw.ellipse([50, 50, 206, 206], outline="#000000", width=3)
453
+ # Eyes
454
+ draw.ellipse([80, 90, 110, 120], fill="#000000")
455
+ draw.ellipse([146, 90, 176, 120], fill="#000000")
456
+ # Smile
457
+ draw.arc([100, 130, 156, 160], 0, 180, fill="#000000", width=2)
458
+
459
+ return np.array(avatar)
460
+
461
+ use_placeholder_1.click(
462
+ lambda: create_default_avatar(1),
463
+ outputs=[ref_img_1]
464
+ )
465
+
466
+ use_placeholder_2.click(
467
+ lambda: create_default_avatar(2),
468
+ outputs=[ref_img_2]
469
+ )
470
+
471
+ # Main generation function
472
+ def start_generation(*args):
473
+ """Start video generation with loading indicator"""
474
+ return (
475
+ gr.update(visible=False), # Hide generate button
476
+ gr.update(visible=True), # Show stop button
477
+ gr.update(value='<div class="warning">βš™οΈ Generating video... This may take several minutes depending on duration.</div>'),
478
+ None # Clear previous video
479
+ )
480
+
481
+ def stop_generation():
482
+ """Handle stop generation"""
483
+ return (
484
+ gr.update(visible=True), # Show generate button
485
+ gr.update(visible=False), # Hide stop button
486
+ gr.update(value='<div class="info">πŸ”§ Generation stopped. Ready for new video.</div>')
487
+ )
488
+
489
+ generate_btn.click(
490
+ start_generation,
491
+ outputs=[generate_btn, stop_btn, status_info, video_output]
492
+ ).then(
493
+ generate_conversational_video,
494
+ inputs=[
495
+ audio_1, prompt_1, audio_2, prompt_2,
496
+ ref_img_1, ref_img_2, duration, mode, fps
497
+ ],
498
+ outputs=[video_output]
499
+ ).then(
500
+ lambda: gr.update(value='<div class="success">βœ… Video generation complete! You can now download your video.</div>'),
501
+ outputs=[status_info]
502
+ ).then(
503
+ lambda: [gr.update(visible=True), gr.update(visible=False)],
504
+ outputs=[generate_btn, stop_btn]
505
+ )
506
+
507
+ stop_btn.click(
508
+ stop_generation,
509
+ outputs=[generate_btn, stop_btn, status_info]
510
+ )
511
+
512
+ # Update status on model load
513
+ demo.load(
514
+ lambda: gr.update(value='<div class="success">βœ… Model loaded successfully! Ready to generate videos.</div>'),
515
+ outputs=[status_info]
516
+ )
517
+
518
+ if __name__ == "__main__":
519
+ demo.queue(max_size=10, default_concurrency_limit=1)
520
+ demo.launch() # share=True is unnecessary on Hugging Face Spaces
521
+ ```
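One caveat on the code above: the checkpoint in `MODEL_ID` (`stabilityai/stable-video-diffusion-img2vid-xt-1-1`) is an image-to-video model, and diffusers loads it as an image-conditioned pipeline with no text input, so the `prompt=` arguments used in `_load_model` and `generate_video_segment` would likely be rejected. A minimal sketch of the image-only call such a pipeline expects (parameter values are illustrative, not tuned):

```python
# Sketch of an SVD-style image-to-video call in diffusers: no text prompt,
# conditioning comes from the input image plus motion/noise settings.
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video
from PIL import Image

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

image = Image.open("reference.png").convert("RGB").resize((1024, 576))
frames = pipe(
    image,
    num_frames=25,            # the XT checkpoints target roughly 25 frames per clip
    decode_chunk_size=8,      # decode latents in chunks to limit VRAM use
    motion_bucket_id=127,     # higher values ask for more motion
    noise_aug_strength=0.02,  # how much noise is added to the conditioning image
    generator=torch.Generator("cuda").manual_seed(42),
).frames[0]
export_to_video(frames, "svd_clip.mp4", fps=7)
```

Adapting the app to this interface would mean dropping the text prompts (or switching to a text-to-video pipeline that accepts them) and feeding the reference image straight into the call.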
522
+
523
+ ```text
524
+ # requirements.txt
525
+ gradio
526
+ torch
527
+ diffusers
528
+ transformers
529
+ accelerate
530
+ numpy
531
+ pillow
532
+ opencv-python
533
+ spaces
534
+ torchvision
535
+ ```
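For a quick, GPU-free check of `extract_audio_features`, the same arithmetic can be exercised on a synthetic tone; only `numpy` is needed and the numbers in the comments are approximate:

```python
# Standalone sanity check mirroring extract_audio_features on a 2 s, 440 Hz tone.
import numpy as np

sample_rate = 16000
t = np.linspace(0, 2.0, 2 * sample_rate, endpoint=False)
audio = 0.2 * np.sin(2 * np.pi * 440 * t)               # moderate-volume A4

energy = np.sqrt(np.mean(audio ** 2))                    # RMS of a 0.2-amplitude sine ~ 0.14
energy_normalized = min(1.0, energy / 0.1)               # saturates at 1.0 here

zero_crossings = np.where(np.diff(np.sign(audio)))[0]    # ~880 crossings per second at 440 Hz
crossings_per_minute = len(zero_crossings) / (len(audio) / sample_rate) * 60
tempo = np.clip(crossings_per_minute, 60, 200)           # clips to 200 for a pure tone

fft = np.fft.fft(audio)
magnitude = np.abs(fft[: len(fft) // 2])
freqs = np.fft.fftfreq(len(fft), 1 / sample_rate)[: len(fft) // 2]
spectral_centroid = np.sum(freqs * magnitude) / (np.sum(magnitude) + 1e-10)
pitch_normalized = min(1.0, spectral_centroid / 2000)    # ~0.22 for a 440 Hz tone

print({"tempo": float(tempo), "energy": float(energy_normalized), "pitch": float(pitch_normalized)})
```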