Spaces:
Build error
Build error
| import gradio as gr | |
| import requests | |
| import json | |
| # from volcenginesdkarkruntime import Ark | |
| import torch | |
| import torchaudio | |
| from einops import rearrange | |
| import argparse | |
| import json | |
| import os | |
| import spaces | |
| from tqdm import tqdm | |
| import random | |
| import numpy as np | |
| import sys | |
| import base64 | |
| from diffrhythm.infer.infer_utils import ( | |
| get_reference_latent, | |
| get_lrc_token, | |
| get_style_prompt, | |
| prepare_model, | |
| get_negative_style_prompt | |
| ) | |
| from diffrhythm.infer.infer import inference | |
# Largest value representable by a signed 32-bit integer; used as the upper
# bound when a random seed is drawn in infer_music.
MAX_SEED = np.iinfo(np.int32).max

# Device name handed to prepare_model at import time.
device = "cuda"

# prepare_model yields the four objects the inference code works with.
cfm, tokenizer, muq, vae = prepare_model(device)

# Rebind cfm to the torch.compile wrapper so later calls go through it.
cfm = torch.compile(cfm)
def infer_music(lrc, ref_audio_path, seed=42, randomize_seed=False, steps=32, file_type='wav', max_frames=2048, device='cuda'):
    """Generate a song from timestamped lyrics and a reference audio file.

    Args:
        lrc: Lyrics text passed to ``get_lrc_token``.
        ref_audio_path: Reference audio passed to ``get_style_prompt``.
        seed: Seed given to ``torch.manual_seed`` (ignored when
            ``randomize_seed`` is true).
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        steps: Number of sampling steps forwarded to ``inference``.
        file_type: Output format forwarded to ``inference``.
        max_frames: Forwarded as ``duration`` and used to size the
            reference latent.
        device: Device name forwarded to the prompt/latent helpers.

    Returns:
        Whatever ``inference`` returns for the generated song.
    """
    chosen_seed = random.randint(0, MAX_SEED) if randomize_seed else seed
    torch.manual_seed(chosen_seed)

    # Sway sampling is only switched on (coefficient -1) below 32 steps.
    if steps < 32:
        sway_sampling_coef = -1
    else:
        sway_sampling_coef = None

    # Conditioning inputs, built in the same order as before.
    lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
    style_prompt = get_style_prompt(muq, ref_audio_path)
    negative_style_prompt = get_negative_style_prompt(device)
    latent_prompt = get_reference_latent(device, max_frames)

    return inference(
        cfm_model=cfm,
        vae_model=vae,
        cond=latent_prompt,
        text=lrc_prompt,
        duration=max_frames,
        style_prompt=style_prompt,
        negative_style_prompt=negative_style_prompt,
        steps=steps,
        sway_sampling_coef=sway_sampling_coef,
        start_time=start_time,
        file_type=file_type,
    )
| import re | |
| from transformers import pipeline | |
# Hub ids of the chat models; only zephyr_model is actually loaded below.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Text-generation pipeline shared by prepare_lyrics_with_llm.
pipe = pipeline(
    "text-generation",
    model=zephyr_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
def prepare_lyrics_with_llm(theme, tags, lyrics):
    """Ask the text-generation pipeline for timestamped lyrics.

    The prompt follows the ``<|system|> / <|user|> / <|assistant|>`` turn
    layout and asks for lines of the form ``[mm:ss.xx]Lyrics``.

    Args:
        theme: Theme of the song, inserted into the prompt.
        tags: Music style tags, inserted into the prompt.
        lyrics: Optional user lyrics to be formatted. Empty or ``None``
            means the model is asked to write original lyrics.

    Returns:
        The model's reply with leading newlines removed.
    """
    language = "en"

    # Normalise "nothing supplied" to the literal "None" that the system
    # instructions refer to, instead of sending an empty field.
    lyrics_field = (lyrics or "").strip() or "None"

    # The user's lyrics are deliberately NOT interpolated into the rules
    # below: doing so duplicated the lyrics inside the instructions and
    # produced broken sentences ("If  is provided") for empty input.
    standard_sys = f"""
Please generate a complete song with lyrics in {language}, following the {tags} style and centered around the theme "{theme}". If lyrics are provided, format them accordingly. If lyrics is None, generate original lyrics based on the given theme and style. Strictly adhere to the following requirements:
### Mandatory Formatting Rules
1. Only output the formatted lyrics—do not include any explanations, introductions, or additional messages.
2. Only include timestamps and lyrics. Do not use brackets, side notes, or section markers (e.g., chorus, instrumental, outro).
3. **Each line must start with a timestamp**, following the format [mm:ss.xx]Lyrics content, with no spaces between the timestamp and lyrics. The lyrics should be continuous and complete.
4. The total song length must not exceed 1 minute 30 seconds.
5. Timestamps should be naturally distributed. **The first lyric must not start at [00:00.00]**—there should always be an intro with no lyrics, and the first lyric should start around 8 to 10 seconds into the song. Do not start timestamps at [00:00.00].
6. The intro time should always be left blank (with no lyrics) before the first lyric, ensuring the song naturally begins after an intro section.
7. **Every single line must begin with a timestamp.** No line should be missing a timestamp.
### Prohibited Examples (Do Not Include)
- Incorrect: [01:30.00](Piano solo)
- Incorrect: [00:45.00][Chorus]
- Incorrect: Lyrics without a timestamp at the beginning of the line.
"""

    # Close the user turn and explicitly open the assistant turn so the
    # model answers instead of continuing the user's message.
    prompt = (
        f"<|system|>\n{standard_sys}</s>\n"
        f"<|user|>\ntheme: {theme}\ntags: {tags}\nlyrics: {lyrics_field}</s>\n"
        "<|assistant|>\n"
    )

    # return_full_text=False makes the pipeline return only the completion,
    # so the prompt can never leak into the lyrics when cleanup fails.
    outputs = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        return_full_text=False,
    )
    generated = outputs[0]["generated_text"]

    # Defensive: if the model still echoes chat markers, keep only what
    # follows the last assistant tag.
    cleaned_text = generated.rsplit("<|assistant|>", 1)[-1]
    print(f"SUGGESTED Lyrics: {cleaned_text}")
    return cleaned_text.lstrip("\n")
| from gradio_client import Client | |
def generate_audio_ref(tags):
    """Request a reference audio clip from the ``declare-lab/mustango`` Space.

    Args:
        tags: Style tags sent as the ``prompt`` argument of ``/predict``.

    Returns:
        The value returned by ``Client.predict`` (also printed to stdout).
    """
    mustango = Client("declare-lab/mustango")
    request = {
        "prompt": tags,
        "steps": 200,
        "guidance": 3,
        "api_name": "/predict",
    }
    reference = mustango.predict(**request)
    print(reference)
    return reference
def general_process(theme, tags, lyrics):
    """Run the full pipeline: lyrics, reference audio, then the song.

    Args:
        theme: Song theme from the UI.
        tags: Music style tags from the UI.
        lyrics: Optional user lyrics from the UI.

    Returns:
        A tuple ``(audio_ref, lyrics_result, generated_song)``.
        ``generated_song`` is ``None`` when either the lyrics or the
        reference audio could not be produced.
    """
    gr.Info("Generating Lyrics")
    lyrics_result = prepare_lyrics_with_llm(theme, tags, lyrics)

    gr.Info("Generating audio ref")
    audio_ref = generate_audio_ref(tags)

    # Bind the result up front: previously it was only assigned inside the
    # branch below, so a missing prerequisite raised UnboundLocalError at
    # the return statement.
    generated_song = None
    if lyrics_result and audio_ref:
        gr.Info("Generating Song")
        generated_song = infer_music(lyrics_result, audio_ref)
    else:
        gr.Warning("Lyrics or audio reference is missing; skipping song generation")

    return audio_ref, lyrics_result, generated_song
# UI definition: three text inputs, a submit button and three outputs,
# stacked in a single column.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("# Simpler Diff Rythm")

        # Inputs
        theme_song = gr.Textbox(label="Theme")
        style_tags = gr.Textbox(label="Music style tags")
        lyrics = gr.Textbox(label="Lyrics optional")
        submit_btn = gr.Button("Submit")

        # Outputs, filled in the order general_process returns them
        audio_ref = gr.Audio(label="Audio ref used")
        generated_lyrics = gr.Textbox(label="Generated Lyrics")
        song_result = gr.Audio(label="Your generated Song")

        submit_btn.click(
            fn=general_process,
            inputs=[theme_song, style_tags, lyrics],
            outputs=[audio_ref, generated_lyrics, song_result],
        )

# queue() returns the same Blocks object, so enabling the queue and
# launching can be done as two separate statements.
demo.queue()
demo.launch(show_api=False, show_error=True, ssr_mode=False)