Spaces:

toninio19
/

keysync-demo

Running

App Files Community

Antoni Bigata committed on Apr 30

Commit

4ef25d2

1 Parent(s): 4fd1a69

requirements

Browse files

Files changed (1) hide show

app.py +227 -231

app.py CHANGED Viewed

@@ -614,257 +614,253 @@ def process_video(video_input, audio_input, max_num_seconds):
         audio_input = DEFAULT_AUDIO_PATH
         print(f"Using default audio: {DEFAULT_AUDIO_PATH}")
-    try:
-        # Calculate hashes for cache keys
-        video_path_hash = video_input
-        audio_path_hash = audio_input
-        # Check if we need to recompute video embeddings
-        video_cache_hit = cache["video"]["path"] == video_path_hash
-        audio_cache_hit = cache["audio"]["path"] == audio_path_hash
-        if video_cache_hit and audio_cache_hit:
-            print("Using cached video and audio computations")
-            # Make copies of cached data to avoid modifying cache
-            video_embedding = cache["video"]["embedding"].clone()
-            video_frames = cache["video"]["frames"].clone()
-            video_landmarks = cache["video"]["landmarks"].copy()
-            raw_audio = cache["audio"]["raw_audio"].clone()
-            raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
-            hubert_embedding = cache["audio"]["hubert_embedding"].clone()
-            wavlm_embedding = cache["audio"]["wavlm_embedding"].clone()
-            # Ensure all data is truncated to the same length if needed
-            min_len = min(
-                len(video_frames),
-                len(raw_audio),
-                len(hubert_embedding),
-                len(wavlm_embedding),
             )
-            video_frames = video_frames[:min_len]
             video_embedding = video_embedding[:min_len]
             video_landmarks = video_landmarks[:min_len]
-            raw_audio = raw_audio[:min_len]
-            hubert_embedding = hubert_embedding[:min_len]
-            wavlm_embedding = wavlm_embedding[:min_len]
-            raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
         else:
-            # Process video if needed
-            if not video_cache_hit:
-                print("Computing video embeddings and landmarks")
-                video_reader = decord.VideoReader(video_input)
-                decord.bridge.set_bridge("torch")
-                if not audio_cache_hit:
-                    # Need to process audio to determine min_len
-                    raw_audio = get_raw_audio(audio_input, 16000)
-                    if len(raw_audio) == 0 or len(video_reader) == 0:
-                        raise ValueError("Empty audio or video input")
-                    min_len = min(len(raw_audio), len(video_reader))
-                    # Store full audio in cache
-                    cache["audio"]["path"] = audio_path_hash
-                    cache["audio"]["raw_audio"] = raw_audio.clone()
-                    # Create truncated copy for processing
-                    raw_audio = raw_audio[:min_len]
-                    raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
-                else:
-                    # Use cached audio - make a copy
-                    if cache["audio"]["raw_audio"] is None:
-                        raise ValueError("Cached audio is None")
-                    raw_audio = cache["audio"]["raw_audio"].clone()
-                    if len(raw_audio) == 0 or len(video_reader) == 0:
-                        raise ValueError("Empty cached audio or video input")
-                    min_len = min(len(raw_audio), len(video_reader))
-                    # Create truncated copy for processing
-                    raw_audio = raw_audio[:min_len]
-                    raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
-                # Compute video embeddings and landmarks - store full version in cache
-                video_embedding, video_frames = compute_video_embedding(
-                    video_reader, len(video_reader)
-                )
-                video_landmarks = extract_video_landmarks(video_frames)
-                # Update video cache with full versions
-                cache["video"]["path"] = video_path_hash
-                cache["video"]["embedding"] = video_embedding
-                cache["video"]["frames"] = video_frames
-                cache["video"]["landmarks"] = video_landmarks
-                # Create truncated copies for processing
-                video_embedding = video_embedding[:min_len]
                 video_frames = video_frames[:min_len]
                 video_landmarks = video_landmarks[:min_len]
             else:
-                # Use cached video data - make copies
-                print("Using cached video computations")
-                if (
-                    cache["video"]["embedding"] is None
-                    or cache["video"]["frames"] is None
-                    or cache["video"]["landmarks"] is None
-                ):
-                    raise ValueError("One or more video cache entries are None")
-                if not audio_cache_hit:
-                    # New audio with cached video
-                    raw_audio = get_raw_audio(audio_input, 16000)
-                    if len(raw_audio) == 0:
-                        raise ValueError("Empty audio input")
-                    # Store full audio in cache
-                    cache["audio"]["path"] = audio_path_hash
-                    cache["audio"]["raw_audio"] = raw_audio.clone()
-                    # Make copies of video data
-                    video_embedding = cache["video"]["embedding"].clone()
-                    video_frames = cache["video"]["frames"].clone()
-                    video_landmarks = cache["video"]["landmarks"].copy()
-                    # Determine truncation length and create truncated copies
-                    min_len = min(len(raw_audio), len(video_frames))
-                    raw_audio = raw_audio[:min_len]
-                    raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
-                    video_frames = video_frames[:min_len]
-                    video_embedding = video_embedding[:min_len]
-                    video_landmarks = video_landmarks[:min_len]
-                else:
-                    # Both video and audio are cached - should not reach here
-                    # as it's handled in the first if statement
-                    pass
-            # Process audio if needed
-            if not audio_cache_hit:
-                print("Computing audio embeddings")
-                # Compute audio embeddings with the truncated audio
-                hubert_embedding = compute_hubert_embedding(raw_audio_reshape)
-                wavlm_embedding = compute_wavlm_embedding(raw_audio_reshape)
-                # Update audio cache with full embeddings
-                # Note: raw_audio was already cached above
-                cache["audio"]["hubert_embedding"] = hubert_embedding.clone()
-                cache["audio"]["wavlm_embedding"] = wavlm_embedding.clone()
-            else:
-                # Use cached audio data - make copies
-                if (
-                    cache["audio"]["hubert_embedding"] is None
-                    or cache["audio"]["wavlm_embedding"] is None
-                ):
-                    raise ValueError(
-                        "One or more audio embedding cache entries are None"
-                    )
-                hubert_embedding = cache["audio"]["hubert_embedding"].clone()
-                wavlm_embedding = cache["audio"]["wavlm_embedding"].clone()
-                # Make sure embeddings match the truncated video length if needed
-                if "min_len" in locals() and (
-                    min_len < len(hubert_embedding) or min_len < len(wavlm_embedding)
-                ):
-                    hubert_embedding = hubert_embedding[:min_len]
-                    wavlm_embedding = wavlm_embedding[:min_len]
-        # Apply max_num_seconds limit if specified
-        if max_num_seconds > 0:
-            # Convert seconds to frames (assuming 25 fps)
-            max_frames = int(max_num_seconds * 25)
-            # Truncate all data to max_frames
-            video_embedding = video_embedding[:max_frames]
-            video_frames = video_frames[:max_frames]
-            video_landmarks = video_landmarks[:max_frames]
-            hubert_embedding = hubert_embedding[:max_frames]
-            wavlm_embedding = wavlm_embedding[:max_frames]
-            raw_audio = raw_audio[:max_frames]
-            raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
-        # Validate shapes before proceeding
-        assert video_embedding.shape[0] == hubert_embedding.shape[0], (
-            f"Video embedding length ({video_embedding.shape[0]}) doesn't match Hubert embedding length ({hubert_embedding.shape[0]})"
-        )
-        assert video_embedding.shape[0] == wavlm_embedding.shape[0], (
-            f"Video embedding length ({video_embedding.shape[0]}) doesn't match WavLM embedding length ({wavlm_embedding.shape[0]})"
-        )
-        assert video_embedding.shape[0] == video_landmarks.shape[0], (
-            f"Video embedding length ({video_embedding.shape[0]}) doesn't match landmarks length ({video_landmarks.shape[0]})"
-        )
-        print(f"Hubert embedding shape: {hubert_embedding.shape}")
-        print(f"WavLM embedding shape: {wavlm_embedding.shape}")
-        print(f"Video embedding shape: {video_embedding.shape}")
-        print(f"Video landmarks shape: {video_landmarks.shape}")
-        # Create pipeline inputs for models
-        (
-            interpolation_chunks,
-            keyframe_chunks,
-            audio_interpolation_chunks,
-            audio_keyframe_chunks,
-            emb_cond,
-            masks_keyframe_chunks,
-            masks_interpolation_chunks,
-            to_remove,
-            audio_interpolation_idx,
-            audio_keyframe_idx,
-        ) = create_pipeline_inputs(
-            hubert_embedding,
-            wavlm_embedding,
-            14,
-            video_embedding,
-            video_landmarks,
-            overlap=1,
-            add_zero_flag=True,
-            mask_arms=None,
-            nose_index=28,
-        )
-        complete_video = sample(
-            audio_keyframe_chunks,
-            keyframe_chunks,
-            masks_keyframe_chunks,
-            to_remove,
-            audio_keyframe_idx,
-            14,
-            "cuda",
-            emb_cond,
-            [],
-            3,
-            3,
-            audio_interpolation_idx,
-            audio_interpolation_chunks,
-            masks_interpolation_chunks,
-            interpolation_chunks,
-            keyframe_model,
-            interpolation_model,
-        )
-        complete_audio = rearrange(
-            raw_audio[: complete_video.shape[0]], "f s -> () (f s)"
-        )
-        # 4. Convert frames to video and combine with audio
-        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video:
-            output_path = temp_video.name
-        print("Saving video to", output_path)
-        save_audio_video(complete_video, audio=complete_audio, save_path=output_path)
-        torch.cuda.empty_cache()
-        return output_path
-    except Exception as e:
-        raise e
-        print(f"Error processing video: {str(e)}")
-        return None
 def get_max_duration(video_input, audio_input):

         audio_input = DEFAULT_AUDIO_PATH
         print(f"Using default audio: {DEFAULT_AUDIO_PATH}")
+    # try:
+    # Calculate hashes for cache keys
+    video_path_hash = video_input
+    audio_path_hash = audio_input
+    # Check if we need to recompute video embeddings
+    video_cache_hit = cache["video"]["path"] == video_path_hash
+    audio_cache_hit = cache["audio"]["path"] == audio_path_hash
+    if video_cache_hit and audio_cache_hit:
+        print("Using cached video and audio computations")
+        # Make copies of cached data to avoid modifying cache
+        video_embedding = cache["video"]["embedding"].clone()
+        video_frames = cache["video"]["frames"].clone()
+        video_landmarks = cache["video"]["landmarks"].copy()
+        raw_audio = cache["audio"]["raw_audio"].clone()
+        raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
+        hubert_embedding = cache["audio"]["hubert_embedding"].clone()
+        wavlm_embedding = cache["audio"]["wavlm_embedding"].clone()
+        # Ensure all data is truncated to the same length if needed
+        min_len = min(
+            len(video_frames),
+            len(raw_audio),
+            len(hubert_embedding),
+            len(wavlm_embedding),
+        )
+        video_frames = video_frames[:min_len]
+        video_embedding = video_embedding[:min_len]
+        video_landmarks = video_landmarks[:min_len]
+        raw_audio = raw_audio[:min_len]
+        hubert_embedding = hubert_embedding[:min_len]
+        wavlm_embedding = wavlm_embedding[:min_len]
+        raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
+    else:
+        # Process video if needed
+        if not video_cache_hit:
+            print("Computing video embeddings and landmarks")
+            video_reader = decord.VideoReader(video_input)
+            decord.bridge.set_bridge("torch")
+            if not audio_cache_hit:
+                # Need to process audio to determine min_len
+                raw_audio = get_raw_audio(audio_input, 16000)
+                if len(raw_audio) == 0 or len(video_reader) == 0:
+                    raise ValueError("Empty audio or video input")
+                min_len = min(len(raw_audio), len(video_reader))
+                # Store full audio in cache
+                cache["audio"]["path"] = audio_path_hash
+                cache["audio"]["raw_audio"] = raw_audio.clone()
+                # Create truncated copy for processing
+                raw_audio = raw_audio[:min_len]
+                raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
+            else:
+                # Use cached audio - make a copy
+                if cache["audio"]["raw_audio"] is None:
+                    raise ValueError("Cached audio is None")
+                raw_audio = cache["audio"]["raw_audio"].clone()
+                if len(raw_audio) == 0 or len(video_reader) == 0:
+                    raise ValueError("Empty cached audio or video input")
+                min_len = min(len(raw_audio), len(video_reader))
+                # Create truncated copy for processing
+                raw_audio = raw_audio[:min_len]
+                raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
+            # Compute video embeddings and landmarks - store full version in cache
+            video_embedding, video_frames = compute_video_embedding(
+                video_reader, len(video_reader)
             )
+            video_landmarks = extract_video_landmarks(video_frames)
+            # Update video cache with full versions
+            cache["video"]["path"] = video_path_hash
+            cache["video"]["embedding"] = video_embedding
+            cache["video"]["frames"] = video_frames
+            cache["video"]["landmarks"] = video_landmarks
+            # Create truncated copies for processing
             video_embedding = video_embedding[:min_len]
+            video_frames = video_frames[:min_len]
             video_landmarks = video_landmarks[:min_len]
         else:
+            # Use cached video data - make copies
+            print("Using cached video computations")
+            if (
+                cache["video"]["embedding"] is None
+                or cache["video"]["frames"] is None
+                or cache["video"]["landmarks"] is None
+            ):
+                raise ValueError("One or more video cache entries are None")
+            if not audio_cache_hit:
+                # New audio with cached video
+                raw_audio = get_raw_audio(audio_input, 16000)
+                if len(raw_audio) == 0:
+                    raise ValueError("Empty audio input")
+                # Store full audio in cache
+                cache["audio"]["path"] = audio_path_hash
+                cache["audio"]["raw_audio"] = raw_audio.clone()
+                # Make copies of video data
+                video_embedding = cache["video"]["embedding"].clone()
+                video_frames = cache["video"]["frames"].clone()
+                video_landmarks = cache["video"]["landmarks"].copy()
+                # Determine truncation length and create truncated copies
+                min_len = min(len(raw_audio), len(video_frames))
+                raw_audio = raw_audio[:min_len]
+                raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
                 video_frames = video_frames[:min_len]
+                video_embedding = video_embedding[:min_len]
                 video_landmarks = video_landmarks[:min_len]
             else:
+                # Both video and audio are cached - should not reach here
+                # as it's handled in the first if statement
+                pass
+        # Process audio if needed
+        if not audio_cache_hit:
+            print("Computing audio embeddings")
+            # Compute audio embeddings with the truncated audio
+            hubert_embedding = compute_hubert_embedding(raw_audio_reshape)
+            wavlm_embedding = compute_wavlm_embedding(raw_audio_reshape)
+            # Update audio cache with full embeddings
+            # Note: raw_audio was already cached above
+            cache["audio"]["hubert_embedding"] = hubert_embedding.clone()
+            cache["audio"]["wavlm_embedding"] = wavlm_embedding.clone()
+        else:
+            # Use cached audio data - make copies
+            if (
+                cache["audio"]["hubert_embedding"] is None
+                or cache["audio"]["wavlm_embedding"] is None
+            ):
+                raise ValueError("One or more audio embedding cache entries are None")
+            hubert_embedding = cache["audio"]["hubert_embedding"].clone()
+            wavlm_embedding = cache["audio"]["wavlm_embedding"].clone()
+            # Make sure embeddings match the truncated video length if needed
+            if "min_len" in locals() and (
+                min_len < len(hubert_embedding) or min_len < len(wavlm_embedding)
+            ):
+                hubert_embedding = hubert_embedding[:min_len]
+                wavlm_embedding = wavlm_embedding[:min_len]
+    # Apply max_num_seconds limit if specified
+    if max_num_seconds > 0:
+        # Convert seconds to frames (assuming 25 fps)
+        max_frames = int(max_num_seconds * 25)
+        # Truncate all data to max_frames
+        video_embedding = video_embedding[:max_frames]
+        video_frames = video_frames[:max_frames]
+        video_landmarks = video_landmarks[:max_frames]
+        hubert_embedding = hubert_embedding[:max_frames]
+        wavlm_embedding = wavlm_embedding[:max_frames]
+        raw_audio = raw_audio[:max_frames]
+        raw_audio_reshape = rearrange(raw_audio, "f s -> (f s)")
+    # Validate shapes before proceeding
+    assert video_embedding.shape[0] == hubert_embedding.shape[0], (
+        f"Video embedding length ({video_embedding.shape[0]}) doesn't match Hubert embedding length ({hubert_embedding.shape[0]})"
+    )
+    assert video_embedding.shape[0] == wavlm_embedding.shape[0], (
+        f"Video embedding length ({video_embedding.shape[0]}) doesn't match WavLM embedding length ({wavlm_embedding.shape[0]})"
+    )
+    assert video_embedding.shape[0] == video_landmarks.shape[0], (
+        f"Video embedding length ({video_embedding.shape[0]}) doesn't match landmarks length ({video_landmarks.shape[0]})"
+    )
+    print(f"Hubert embedding shape: {hubert_embedding.shape}")
+    print(f"WavLM embedding shape: {wavlm_embedding.shape}")
+    print(f"Video embedding shape: {video_embedding.shape}")
+    print(f"Video landmarks shape: {video_landmarks.shape}")
+    # Create pipeline inputs for models
+    (
+        interpolation_chunks,
+        keyframe_chunks,
+        audio_interpolation_chunks,
+        audio_keyframe_chunks,
+        emb_cond,
+        masks_keyframe_chunks,
+        masks_interpolation_chunks,
+        to_remove,
+        audio_interpolation_idx,
+        audio_keyframe_idx,
+    ) = create_pipeline_inputs(
+        hubert_embedding,
+        wavlm_embedding,
+        14,
+        video_embedding,
+        video_landmarks,
+        overlap=1,
+        add_zero_flag=True,
+        mask_arms=None,
+        nose_index=28,
+    )
+    complete_video = sample(
+        audio_keyframe_chunks,
+        keyframe_chunks,
+        masks_keyframe_chunks,
+        to_remove,
+        audio_keyframe_idx,
+        14,
+        "cuda",
+        emb_cond,
+        [],
+        3,
+        3,
+        audio_interpolation_idx,
+        audio_interpolation_chunks,
+        masks_interpolation_chunks,
+        interpolation_chunks,
+        keyframe_model,
+        interpolation_model,
+    )
+    complete_audio = rearrange(raw_audio[: complete_video.shape[0]], "f s -> () (f s)")
+    # 4. Convert frames to video and combine with audio
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video:
+        output_path = temp_video.name
+    print("Saving video to", output_path)
+    save_audio_video(complete_video, audio=complete_audio, save_path=output_path)
+    torch.cuda.empty_cache()
+    return output_path
+    # except Exception as e:
+    #     raise e
+    #     print(f"Error processing video: {str(e)}")
+    #     return None
 def get_max_duration(video_input, audio_input):