from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from functools import lru_cache
import torch
import io
import soundfile as sf
import requests
import numpy as np

app = FastAPI(title="SpeechT5 TTS API")

# Adjustable parameters
NORMALIZATION_LEVEL = 0.1  # peak amplitude after normalization (0.0-1.0)
SMOOTHING_WINDOW = 7       # base moving-average window, in samples
BIT_DEPTH = "32f"          # "16" for 16-bit PCM, anything else writes 32-bit float

# Load models once at startup
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Map speaker ID to x-vector speaker embedding URL
SPEAKER_EMBEDDINGS = {
    0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin",  # Normal
    1: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin",  # US female 1
    2: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_clb_arctic-wav-arctic_a0001.bin",  # US female 2
    3: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_bdl_arctic-wav-arctic_a0003.bin",  # US male 1
    4: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_rms_arctic-wav-arctic_a0003.bin",  # US male 2
    5: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_jmk_arctic-wav-arctic_a0002.bin",  # Canadian male
    6: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_b0002.bin",  # Scottish male
    7: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_ksp_arctic-wav-arctic_a0007.bin",  # Indian male
}


@lru_cache(maxsize=None)
def load_speaker_embedding(url: str) -> torch.Tensor:
    """Download a 512-dim x-vector and return it as a (1, 512) tensor.

    Cached so each embedding is fetched only once per process instead of
    on every request.
    """
    response = requests.get(url)
    response.raise_for_status()
    # Copy into a writable buffer; torch.frombuffer on the immutable
    # `bytes` object would emit a non-writable-tensor warning.
    data = np.frombuffer(response.content, dtype=np.float32).copy()
    return torch.from_numpy(data).unsqueeze(0)


def smooth_audio_nodejs(audio: np.ndarray, window_size: int) -> np.ndarray:
    """Centered moving-average filter (port of the Node.js smoothing routine)."""
    smoothed = np.copy(audio)
    half = window_size // 2
    for i in range(len(audio)):
        start = max(0, i - half)
        end = min(len(audio), i + half + 1)
        smoothed[i] = np.mean(audio[start:end])
    return smoothed


def apply_fade_out(audio: np.ndarray, fade_samples: int = 256) -> np.ndarray:
    """Apply a linear fade-out (in place) over the last `fade_samples` samples."""
    if fade_samples > 0:  # guard: very short clips can yield fade_samples == 0
        fade = np.linspace(1.0, 0.0, fade_samples, dtype=audio.dtype)
        audio[-fade_samples:] *= fade
    return audio


@app.get("/speak")
def speak(
    text: str = Query(..., description="Text to convert to speech"),
    speaker: int = Query(1, ge=0, le=7, description="Speaker ID (0-7)"),
):
    speaker_embedding = load_speaker_embedding(SPEAKER_EMBEDDINGS[speaker])

    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    audio = speech.numpy().astype(np.float32)

    # --- Normalize to the configured peak level ---
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = (audio / peak) * NORMALIZATION_LEVEL

    # --- Smooth audio (slightly larger window than the base setting) ---
    audio = smooth_audio_nodejs(audio, SMOOTHING_WINDOW * 2)

    # --- Fade out to remove clicks at the end ---
    fade_samples = min(512, len(audio) // 10)  # 512 samples ~= 32 ms at 16 kHz
    audio = apply_fade_out(audio, fade_samples)

    # --- Bit depth ---
    if BIT_DEPTH == "16":
        pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
    else:
        pcm = audio

    # --- Write WAV ---
    buf = io.BytesIO()
    subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
    sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")
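

# Usage sketch: one way to run the service and exercise the /speak endpoint.
# Assumptions not in the original script: the file is saved as `main.py`, and
# `uvicorn` is installed; both names are illustrative.
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   # Fetch speech for a phrase with the Scottish male voice (speaker 6):
#   curl -G "http://localhost:8000/speak" \
#        --data-urlencode "text=Hello from SpeechT5" \
#        --data-urlencode "speaker=6" \
#        -o hello.wav

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)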