wasmdashai committed on
Commit 43c1b9a · verified · 1 Parent(s): 619239d

Update app.py

Files changed (1)
  1. app.py +114 -0
app.py CHANGED
@@ -0,0 +1,114 @@
+ from logging import error
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import AutoTokenizer, VitsModel
+ import os
+ import numpy as np
+ import noisereduce as nr
+ import torch.nn as nn
+ from typing import Optional, Iterator
+
+ token= os.getenv("HF_TOKEN")
+ token ="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"
+
+ # Cache of loaded models, keyed by model name
+ models = {}
+
+ # Noise-removal helper (spectral gating via noisereduce)
+ def remove_noise_nr(audio_data, sr=16000):
+     return nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
+
+ # Streaming inference helper: re-implements the VITS forward pass so the
+ # decoder can emit the waveform chunk by chunk instead of all at once.
+ def _inference_forward_stream(
+     self,
+     input_ids: Optional[torch.Tensor] = None,
+     attention_mask: Optional[torch.Tensor] = None,
+     speaker_embeddings: Optional[torch.Tensor] = None,
+     chunk_size: int = 32,
+     is_streaming: bool = True,
+ ) -> Iterator[torch.Tensor]:
+     padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
+
+     text_encoder_output = self.text_encoder(input_ids=input_ids, padding_mask=padding_mask, attention_mask=attention_mask)
+     hidden_states = text_encoder_output[0].transpose(1, 2)
+     input_padding_mask = padding_mask.transpose(1, 2)
+
+     # Predict per-token durations and the resulting output length
+     log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
+     length_scale = 1.0 / self.speaking_rate
+     duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
+     predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
+
+     # Build the output attention mask
+     indices = torch.arange(predicted_lengths.max(), device=predicted_lengths.device)
+     output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
+     output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
+
+     # Expand token durations into a monotonic alignment between input tokens and output frames
+     attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
+     batch_size, _, output_length, input_length = attn_mask.shape
+     cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
+     indices = torch.arange(output_length, device=duration.device)
+     valid_indices = indices.unsqueeze(0) < cum_duration
+     valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
+     padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
+     attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
+
+     prior_means = text_encoder_output[1]
+     prior_log_variances = text_encoder_output[2]
+     # Expand the prior statistics to the predicted output length using the alignment matrix
+     prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
+     prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
+     # Sample latents from the prior and run the flow in reverse to obtain the spectrogram
+     prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
+     latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
+     spectrogram = latents * output_padding_mask
+
+     if is_streaming:
+         # Decode and yield the waveform one spectrogram chunk at a time
+         for i in range(0, spectrogram.size(-1), chunk_size):
+             with torch.no_grad():
+                 wav = self.decoder(spectrogram[:, :, i:i + chunk_size], speaker_embeddings)
+             yield wav.squeeze().cpu().numpy()
+     else:
+         # Decode the full spectrogram in a single pass
+         with torch.no_grad():
+             wav = self.decoder(spectrogram, speaker_embeddings)
+         yield wav.squeeze().cpu().numpy()
+
+ def get_model(name_model):
+     # Return a cached VITS model (loading it on first use) together with its tokenizer
+     global models
+     if name_model in models:
+         tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
+         return models[name_model], tokenizer
+
+     models[name_model] = VitsModel.from_pretrained(name_model, token=token).cuda()
+     # Re-apply weight normalization to the decoder and flow convolutions
+     models[name_model].decoder.apply_weight_norm()
+     for flow in models[name_model].flow.flows:
+         torch.nn.utils.weight_norm(flow.conv_pre)
+         torch.nn.utils.weight_norm(flow.conv_post)
+
+     tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
+     return models[name_model], tokenizer
+
+ # Default demo sentence (an Arabic greeting)
+ TXT = "السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي"
+ def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=1.0):
+     # speaking_rate is a rate multiplier; the Gradio slider below passes values in 0.1-1.0
+     model, tokenizer = get_model(name_model)
+     inputs = tokenizer(text, return_tensors="pt").to("cuda")
+     model.speaking_rate = speaking_rate
+     with torch.no_grad():
+         outputs = model(**inputs)
+     waveform = outputs.waveform[0].cpu().numpy()
+     # wav = list(_inference_forward_stream(model, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, speaker_embeddings=None, is_streaming=False))[0]
+     return model.config.sampling_rate, remove_noise_nr(waveform, sr=model.config.sampling_rate)
+
+
+ model_choices = gr.Dropdown(
+     choices=[
+         "wasmdashai/vits-ar-sa-huba-v1",
+         "wasmdashai/vits-ar-sa-huba-v2",
+         "wasmdashai/vits-ar-sa-A",
+         "wasmdashai/vits-ar-ye-sa",
+         "wasmdashai/vits-ar-sa-M-v1",
+         "wasmdashai/vits-en-v1"
+     ],
+     label="اختر النموذج",  # "Choose the model"
+     value="wasmdashai/vits-ar-sa-huba-v2"
+ )
+
+ # Inputs: text box, model dropdown, speaking-rate slider; output: audio
+ demo = gr.Interface(fn=modelspeech, inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)], outputs=["audio"])
+ demo.queue()
+ demo.launch(share=True, debug=True)
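
The streaming helper _inference_forward_stream is defined in this commit but never called on the active path (its use inside modelspeech is commented out). As an illustration only, the sketch below shows one way it could drive a streaming Gradio output; the wrapper name modelspeech_stream and the gr.Audio(streaming=True) output are assumptions for the example, not part of this commit, and the code presumes the definitions from app.py above are importable and a CUDA device is available.

    # Illustrative sketch only (not part of the commit): stream audio chunks to the browser.
    # Assumes get_model, _inference_forward_stream, gr and torch from app.py are in scope.
    import numpy as np

    def modelspeech_stream(text, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.8):
        model, tokenizer = get_model(name_model)
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        model.speaking_rate = speaking_rate
        sr = model.config.sampling_rate
        # Yield (sample_rate, samples) tuples; Gradio plays them as they arrive.
        for chunk in _inference_forward_stream(
            model,
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            speaker_embeddings=None,
            chunk_size=32,
            is_streaming=True,
        ):
            yield sr, np.asarray(chunk)

    stream_demo = gr.Interface(
        fn=modelspeech_stream,
        inputs="text",
        outputs=gr.Audio(streaming=True),
    )
    # stream_demo.queue().launch()

Whether streaming actually helps here depends on the decoder's behavior across chunk boundaries: decoding fixed 32-frame spectrogram windows independently can introduce audible seams, which may be why the non-streaming path is the one used in the commit.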