wasmdashai committed on
Commit
42d241e
·
verified ·
1 Parent(s): c01701d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -38
app.py CHANGED
@@ -1,24 +1,79 @@
1
- from fastapi import FastAPI, HTTPException
2
- from pydantic import BaseModel
3
- from transformers import AutoTokenizer, VitsModel
4
  import torch
5
- import numpy as np
6
  import os
 
7
  import noisereduce as nr
 
 
8
 
9
  # قراءة التوكن من Secrets
10
- token = os.getenv("acees-token")
11
 
12
- # تخزين النماذج
13
  models = {}
14
 
15
- # اختيار الجهاز
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
 
18
  # دالة إزالة الضوضاء
19
  def remove_noise_nr(audio_data, sr=16000):
20
  return nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # تحميل النموذج + التوكن
23
  def get_model(name_model):
24
  global models
@@ -35,35 +90,42 @@ def get_model(name_model):
35
  tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
36
  return models[name_model], tokenizer
37
 
38
- # نموذج البيانات للـ POST
39
- class TTSRequest(BaseModel):
40
- text: str
41
- name_model: str = "wasmdashai/vits-ar-sa-huba-v2"
42
- speaking_rate: int = 16000
43
-
44
- # إنشاء التطبيق
45
- app = FastAPI(title="VITS TTS API")
46
-
47
- # مسار صحة الخدمة
48
- @app.get("/")
49
- def home():
50
- return {"message": "FastAPI VITS TTS service is running"}
51
-
52
- # مسار تحويل النص إلى كلام
53
- @app.post("/predict/")
54
- def modelspeech(req: TTSRequest):
55
- try:
56
- model, tokenizer = get_model(req.name_model)
57
- inputs = tokenizer(req.text, return_tensors="pt").to(device)
58
- model.speaking_rate = req.speaking_rate
59
- with torch.no_grad():
60
- outputs = model(**inputs)
61
- waveform = outputs.waveform[0].cpu().numpy()
62
- audio = remove_noise_nr(waveform)
63
- return {
64
- "sampling_rate": model.config.sampling_rate,
65
- "audio": audio.tolist() # تحويل numpy array إلى قائمة
66
- }
67
- except Exception as e:
68
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
69
 
 
1
+ from logging import error
2
+ import gradio as gr
3
+ import spaces
4
  import torch
5
+ from transformers import AutoTokenizer, VitsModel
6
  import os
7
+ import numpy as np
8
  import noisereduce as nr
9
+ import torch.nn as nn
10
+ from typing import Optional, Iterator
11
 
12
# Hugging Face access token, read from the Space secrets. The secret must be
# registered under exactly this name in Settings -> Repository secrets.
token = os.environ.get("acees-token")

# Cache of already-loaded models, indexed by model name.
models = dict()

# Run on CUDA when it is available, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
20
 
21
+
22
  # دالة إزالة الضوضاء
23
def remove_noise_nr(audio_data, sr=16000):
    """Denoise an audio signal with the noisereduce library.

    Args:
        audio_data: Audio samples, forwarded to ``nr.reduce_noise`` as ``y``.
        sr: Sampling rate handed to noisereduce (defaults to 16000).

    Returns:
        The value returned by ``nr.reduce_noise`` for the given signal.
    """
    denoised = nr.reduce_noise(y=audio_data, sr=sr, hop_length=256)
    return denoised
25
 
26
+
27
+ # دالة inference (streaming / non-streaming)
28
+ def _inference_forward_stream(
29
+ self,
30
+ input_ids: Optional[torch.Tensor] = None,
31
+ attention_mask: Optional[torch.Tensor] = None,
32
+ speaker_embeddings: Optional[torch.Tensor] = None,
33
+ chunk_size: int = 32,
34
+ is_streaming: bool = True
35
+ ) -> Iterator[torch.Tensor]:
36
+
37
+ padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
38
+ text_encoder_output = self.text_encoder(input_ids=input_ids, padding_mask=padding_mask, attention_mask=attention_mask)
39
+ hidden_states = text_encoder_output[0].transpose(1, 2)
40
+ input_padding_mask = padding_mask.transpose(1, 2)
41
+
42
+ log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
43
+ length_scale = 1.0 / self.speaking_rate
44
+ duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
45
+ predicted_lengths = torch.clamp_min(torch.sum(duration, [1,2]), 1).long()
46
+
47
+ indices = torch.arange(predicted_lengths.max(), device=predicted_lengths.device)
48
+ output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
49
+ output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
50
+
51
+ attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
52
+ batch_size, _, output_length, input_length = attn_mask.shape
53
+ cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
54
+ indices = torch.arange(output_length, device=duration.device)
55
+ valid_indices = indices.unsqueeze(0) < cum_duration
56
+ valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
57
+ padded_indices = valid_indices - nn.functional.pad(valid_indices, [0,0,1,0,0,0])[:, :-1]
58
+ attn = padded_indices.unsqueeze(1).transpose(2,3) * attn_mask
59
+
60
+ prior_means = text_encoder_output[1]
61
+ prior_log_variances = text_encoder_output[2]
62
+ prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
63
+ latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
64
+ spectrogram = latents * output_padding_mask
65
+
66
+ if is_streaming:
67
+ for i in range(0, spectrogram.size(-1), chunk_size):
68
+ with torch.no_grad():
69
+ wav = self.decoder(spectrogram[:,:,i:i+chunk_size], speaker_embeddings)
70
+ yield wav.squeeze().cpu().numpy()
71
+ else:
72
+ with torch.no_grad():
73
+ wav = self.decoder(spectrogram, speaker_embeddings)
74
+ yield wav.squeeze().cpu().numpy()
75
+
76
+
77
  # تحميل النموذج + التوكن
78
  def get_model(name_model):
79
  global models
 
90
  tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
91
  return models[name_model], tokenizer
92
 
93
+
94
# Default text to synthesise when the caller supplies none.
TXT = "السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي"
96
+
97
+
98
+ # دالة تحويل النص إلى كلام
99
def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=16000):
    """Synthesise speech for ``text`` and return it denoised.

    Args:
        text: Text to convert to speech.
        name_model: Model identifier passed to ``get_model``.
        speaking_rate: Value assigned to ``model.speaking_rate`` before
            inference.
            NOTE(review): the UI slider feeds values in 0.1-1 and the
            streaming helper in this file divides durations by this value,
            so the 16000 default looks like a sampling-rate mix-up; confirm
            before relying on the default.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is the
        noise-reduced audio as returned by ``remove_noise_nr``.
    """
    model, tokenizer = get_model(name_model)
    # Tokenised inputs are moved to whichever device was selected at import.
    inputs = tokenizer(text, return_tensors="pt").to(device)
    model.speaking_rate = speaking_rate
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].cpu().numpy()
    # Denoise at the model's real sampling rate rather than the helper's
    # hard-coded 16000 Hz default, so both returned values stay consistent.
    sampling_rate = model.config.sampling_rate
    return sampling_rate, remove_noise_nr(waveform, sr=sampling_rate)
107
+
108
+
109
+ # واجهة Gradio
110
# Gradio UI: a text box, a model selector and a slider are passed, in that
# order, to modelspeech; its return value feeds the audio output.
_available_models = [
    "wasmdashai/vits-ar-sa-huba-v1",
    "wasmdashai/vits-ar-sa-huba-v2",
    "wasmdashai/vits-ar-sa-A",
    "wasmdashai/vits-ar-ye-sa",
    "wasmdashai/vits-ar-sa-M-v1",
    "wasmdashai/vits-en-v1",
]
model_choices = gr.Dropdown(
    choices=_available_models,
    value="wasmdashai/vits-ar-sa-huba-v2",
    label="اختر النموذج",
)

# Third positional input of modelspeech (speaking_rate).
_rate_slider = gr.Slider(0.1, 1, step=0.1, value=0.8)

demo = gr.Interface(
    fn=modelspeech,
    inputs=["text", model_choices, _rate_slider],
    outputs=["audio"],
)

# Enable request queuing, then serve on all interfaces at port 7860.
demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860)
131