# Spaces: amphion / AnyAccomp
# Running on Zero
# File size: 6,562 Bytes

import spaces # added
import torch
import json
from anyaccomp.inference_utils import Sing2SongInferencePipeline
import os
import random
import librosa
import numpy as np
import soundfile as sf
import gradio as gr
import time
from huggingface_hub import snapshot_download

# Hub repository to fetch, and the directory that contains this script.
repo_id = "amphion/anyaccomp"
base_dir = os.path.dirname(os.path.abspath(__file__))

# Download the repository snapshot into the script directory.
print(f"Downloading model files from {repo_id}...")
model_dir = snapshot_download(local_dir=base_dir, repo_id=repo_id)
print(f"Model files downloaded to: {model_dir}")

# Config files are resolved against base_dir, checkpoints against model_dir.
CFG_PATH = os.path.join(base_dir, "config/flow_matching.json")
VOCODER_CFG_PATH = os.path.join(base_dir, "config/vocoder.json")
CHECKPOINT_PATH = os.path.join(model_dir, "pretrained/flow_matching")
VOCODER_CHECKPOINT_PATH = os.path.join(model_dir, "pretrained/vocoder")

# Where generated audio is written, and where the bundled examples live.
INFER_DST = os.path.join(base_dir, "output_gradio")
EXAMPLE_DIR = os.path.join(base_dir, "example/gradio")

# Use CUDA when torch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Create the output root and its two sub-directories up front.
acc_dst = os.path.join(INFER_DST, "accompaniment")
mixture_dst = os.path.join(INFER_DST, "mixture")
for _output_dir in (INFER_DST, acc_dst, mixture_dst):
    os.makedirs(_output_dir, exist_ok=True)

# Build the inference pipeline once at import time. A failure is reported on
# stdout and leaves `inference_pipeline` as None so the UI can still start;
# the request handler checks for None and surfaces an error to the user.
print("Initializing AnyAccomp InferencePipeline...")
try:
    inference_pipeline = Sing2SongInferencePipeline(
        CHECKPOINT_PATH,
        CFG_PATH,
        VOCODER_CHECKPOINT_PATH,
        VOCODER_CFG_PATH,
        device=DEVICE,
    )
    inference_pipeline.sample_rate = 24000
    print("Model loaded successfully.")
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    inference_pipeline = None

@spaces.GPU # added
def sing2song_inference(vocal_filepath, n_timesteps, cfg_scale, seed):
    """Generate an accompaniment for an audio file and mix it with the input.

    Args:
        vocal_filepath: Path to the input audio file, or ``None`` when nothing
            was uploaded.
        n_timesteps: Number of sampling steps; converted to ``int`` and passed
            to ``reverse_diffusion``.
        cfg_scale: Value passed as ``cfg`` to ``reverse_diffusion``.
        seed: Random seed. ``-1`` or ``None`` draws a fresh random seed;
            otherwise it must lie in ``[0, 2**32 - 1]``.

    Returns:
        A tuple ``(accompaniment_path, mixture_path, status_message)`` where
        both paths point to WAV files written under the output directories.

    Raises:
        gr.Error: If the model failed to load, no file was given, the seed is
            out of range, the audio cannot be read, its duration is outside
            3-30 seconds, or processing fails.
    """
    if inference_pipeline is None:
        raise gr.Error(
            "Model could not be loaded. Please check paths and environment configuration."
        )

    if vocal_filepath is None:
        raise gr.Error("Please upload a vocal audio file.")

    # Sample rate used for loading the input and writing both outputs.
    sample_rate = 24000
    # np.random.seed only accepts values in [0, 2**32 - 1].
    max_seed = 2**32 - 1

    if seed is None or seed == -1:
        seed = random.randint(0, max_seed)

    seed = int(seed)
    if not 0 <= seed <= max_seed:
        # Reject early with a readable message instead of letting
        # np.random.seed raise a bare ValueError.
        raise gr.Error(
            f"Seed must be -1 (random) or an integer between 0 and {max_seed}."
        )
    print(f"Using seed: {seed}")

    # Seed every random number generator in use so a given seed is repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

    # Only the file read is guarded here: the range check below must stay
    # outside the try block, otherwise its gr.Error would be caught and
    # re-wrapped as a misleading "Cannot read audio file" message.
    try:
        duration = librosa.get_duration(path=vocal_filepath)
    except Exception as e:
        raise gr.Error(f"Cannot read audio file or get duration: {e}") from e

    if not (3 <= duration <= 30):
        raise gr.Error("Audio duration must be between 3 and 30 seconds.")

    try:
        vocal_audio, _ = librosa.load(vocal_filepath, sr=sample_rate, mono=True)
        # Add a leading batch dimension and move the waveform to the device.
        vocal_tensor = torch.tensor(vocal_audio).unsqueeze(0).to(DEVICE)

        vocal_mel = inference_pipeline.encode_vocal(vocal_tensor)

        # torch.cuda.amp.autocast is deprecated; torch.autocast with
        # device_type="cuda" is the equivalent supported spelling.
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            mel = inference_pipeline.model.reverse_diffusion(
                vocal_mel=vocal_mel,
                n_timesteps=int(n_timesteps),
                cfg=cfg_scale,
            )

        # Cast back to float32 before generating the waveform.
        mel = mel.float()
        wav = inference_pipeline._generate_audio(mel)
        wav = wav.squeeze().detach().cpu().numpy()

        # Pad or trim the accompaniment to the input length so they can be summed.
        wav = librosa.util.fix_length(data=wav, size=len(vocal_audio))
        mixture_wav = wav + vocal_audio

        # Output name: "<input stem>_<unix timestamp>.wav", shared by both files.
        timestamp = int(time.time())
        original_filename = os.path.basename(vocal_filepath)
        base_filename = f"{os.path.splitext(original_filename)[0]}_{timestamp}.wav"

        accompaniment_path = os.path.join(acc_dst, base_filename)
        mixture_path = os.path.join(mixture_dst, base_filename)

        sf.write(accompaniment_path, wav, sample_rate)
        sf.write(mixture_path, mixture_wav, sample_rate)

        return accompaniment_path, mixture_path, "Status: Complete!"

    except Exception as e:
        import traceback

        # Keep the full traceback in the server log; the user only sees the message.
        traceback.print_exc()
        raise gr.Error(f"An error occurred during processing: {e}") from e


def randomize_seed():
    """Return a random integer seed in the inclusive range [0, 2**32 - 1]."""
    lowest_seed = 0
    highest_seed = 2**32 - 1
    return random.randint(lowest_seed, highest_seed)


# Paths of the bundled example clips offered under the upload widget.
example1_path, example2_path, example3_path = (
    os.path.join(EXAMPLE_DIR, clip_name)
    for clip_name in ("example1.mp3", "example2.wav", "example3.wav")
)

# Two-column layout: inputs and parameters first, results second.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # AnyAccomp: GENERALIZABLE ACCOMPANIMENT GENERATION
        Upload a 3-30 second vocal or instrument track (.wav or .mp3) and the model will generate an accompaniment for it.
        """
    )

    with gr.Row():
        # First column: audio input, examples and generation parameters.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Upload or Select Audio")
            vocal_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Vocal or Instrument Audio",
            )

            gr.Examples(
                label="Or click an example to start",
                inputs=[vocal_input],
                examples=[[example1_path], [example2_path], [example3_path]],
            )

            gr.Markdown("### 2. Adjust Parameters (Optional)")
            with gr.Accordion("Advanced Settings", open=True):
                n_timesteps_slider = gr.Slider(
                    label="Inference Steps (n_timesteps)",
                    minimum=10,
                    maximum=100,
                    step=1,
                    value=50,
                )
                cfg_slider = gr.Slider(
                    label="CFG Scale",
                    minimum=1.0,
                    maximum=10.0,
                    step=0.1,
                    value=3.0,
                )

                with gr.Row():
                    seed_input = gr.Number(
                        label="Seed (-1 for random)",
                        value=-1,
                        precision=0,
                    )
                    random_seed_btn = gr.Button("🎲")

        # Second column: status line, the two audio outputs and the trigger.
        with gr.Column(scale=1):
            gr.Markdown("### 3. Generate and Listen to the Result")

            status_text = gr.Markdown("Status: Ready")

            accompaniment_output = gr.Audio(
                type="filepath",
                label="Generated Accompaniment",
            )
            mixture_output = gr.Audio(
                type="filepath",
                label="Mixture (Vocal + Accompaniment)",
            )

            submit_btn = gr.Button("Generate Accompaniment", variant="primary")

    # Event wiring: the main button runs inference, the dice button fills the
    # seed field with a new random value.
    generation_inputs = [vocal_input, n_timesteps_slider, cfg_slider, seed_input]
    generation_outputs = [accompaniment_output, mixture_output, status_text]
    submit_btn.click(
        fn=sing2song_inference,
        inputs=generation_inputs,
        outputs=generation_outputs,
    )

    random_seed_btn.click(fn=randomize_seed, inputs=None, outputs=seed_input)


# Launch the Gradio demo only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()