Moustafa1111111111 committed on
Commit
6864301
·
1 Parent(s): 000b27a

Added TTS model, Dockerfile, and app server

Browse files
Dockerfile ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the XTTS v2 text-to-speech server (uvicorn on port 7860).
#
# The base uses Debian bookworm instead of buster: buster is end-of-life and its
# apt repositories were moved to the archive, so `apt-get update` fails there.
# NOTE(review): confirm every apt package below resolves on bookworm.
FROM python:3.9-slim-bookworm

# Install system dependencies (toolchain for building native wheels, wget for model files)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        pkg-config \
        libblis-dev \
        python3-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Copy TTS and install it
COPY TTS /app/TTS
WORKDIR /app/TTS
ENV BLIS_ARCH="generic"
# Coqui's model manager is believed to check COQUI_TOS_AGREED before downloading
# XTTS by name (TODO confirm). The original COQUI_TTS_AGREED spelling is kept as
# well in case anything else reads it.
ENV COQUI_TOS_AGREED=1
ENV COQUI_TTS_AGREED=1
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir -r requirements.txt --timeout=300
RUN pip install --no-cache-dir -e . --timeout=300

# Go back to main app dir
WORKDIR /app

# Download XTTS model files.
# URLs are quoted because they contain `?`, which the shell treats as a glob character.
RUN set -eux; \
    mkdir -p /app/models/xtts_v2; \
    for f in config.json model.pth vocab.json dvae.pth speakers_xtts.pth; do \
        wget -O "/app/models/xtts_v2/$f" \
            "https://huggingface.co/coqui/XTTS-v2/resolve/main/$f?download=true"; \
    done

# Install app requirements before copying application code, so that code-only
# changes reuse the cached dependency layer.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r /app/requirements.txt --timeout=300

# Add speaker reference and other files
COPY audio/speaker_reference.wav /app/audio/speaker_reference.wav
COPY Web_Page /app/Web_Page
COPY local_server_new.py /app/

# Expose default HF port
EXPOSE 7860

# Run the server directly
CMD ["python", "-m", "uvicorn", "local_server_new:app", "--host", "0.0.0.0", "--port", "7860"]
Web_Page/index.html ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Declare the encoding explicitly so non-ASCII text (e.g. Arabic input) is handled consistently. -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Text to Speech</title>
  <link rel="stylesheet" href="style.css">
</head>
<body>
  <h1>Text to Speech</h1>
  <!-- dir="auto" lets the browser choose LTR or RTL from the text that is typed. -->
  <textarea id="inputText" rows="5" cols="50" dir="auto" aria-label="Text to convert to speech"></textarea><br><br>
  <button id="convertButton" type="button">Convert to Speech</button>
  <!-- Progress and error messages are written into this element by script.js. -->
  <div id="status" role="status" aria-live="polite"></div>
  <!-- Both children start hidden; script.js reveals them once audio is available. -->
  <div id="audioOutput" style="margin-top: 20px;">
    <a id="downloadLink" href="#" download="output.wav" style="display: none;">Download Audio</a>
    <audio id="audioPlayer" controls style="display: none;"></audio>
  </div>

  <script src="script.js"></script>
</body>
</html>
Web_Page/script.js ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Wires the "Convert to Speech" button: posts the textarea content to the TTS
 * API and, on success, reveals a download link and an audio player pointing at
 * the generated file.
 */
document.addEventListener('DOMContentLoaded', () => {
  // Single source of truth for the API origin (previously repeated three times).
  // NOTE(review): the Dockerfile in this commit starts uvicorn on port 7860 —
  // confirm that 5000 is the intended port for the environment this page runs in.
  const API_BASE = 'http://localhost:5000';

  const convertButton = document.getElementById('convertButton');
  const inputText = document.getElementById('inputText');
  const statusDiv = document.getElementById('status');
  const downloadLink = document.getElementById('downloadLink');
  const audioPlayer = document.getElementById('audioPlayer');

  // Writes a status message and toggles an `error` class so a stylesheet can
  // present failures differently from progress/success messages.
  const setStatus = (message, isError = false) => {
    statusDiv.textContent = message;
    statusDiv.classList.toggle('error', isError);
  };

  convertButton.addEventListener('click', async () => {
    const text = inputText.value;

    // Nothing to synthesize: skip the round trip.
    if (text.trim() === '') {
      setStatus('Please enter some text to convert.', true);
      return;
    }

    setStatus('Processing...');
    downloadLink.style.display = 'none';
    audioPlayer.style.display = 'none';
    // The server writes every result to the same file, so overlapping requests
    // would overwrite each other; block re-clicks until this one settles.
    convertButton.disabled = true;

    try {
      const response = await fetch(`${API_BASE}/text-to-speech/`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ text }),
      });

      // An HTTP error may carry a non-JSON body; report it as a server error
      // instead of letting response.json() throw and be labelled a network error.
      if (!response.ok) {
        setStatus(`Error: server responded with HTTP ${response.status}`, true);
        return;
      }

      const data = await response.json();

      if (data.status === 'success') {
        // The audio path is identical for every request; a changing query
        // parameter stops the browser from replaying a cached older clip.
        const audioUrl = new URL(data.url, API_BASE);
        audioUrl.searchParams.set('t', String(Date.now()));

        setStatus('Speech generated successfully!');
        downloadLink.href = audioUrl.href;
        downloadLink.style.display = 'block';
        audioPlayer.src = audioUrl.href;
        audioPlayer.style.display = 'block';
      } else {
        setStatus(`Error: ${data.message}`, true);
      }
    } catch (error) {
      setStatus(`Network error: ${error}`, true);
    } finally {
      convertButton.disabled = false;
    }
  });
});
Web_Page/style.css ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Page layout: a single centered column. */
body {
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
  background-color: #f4f7f6;
  margin: 40px;
  display: flex;
  flex-direction: column;
  align-items: center;
  color: #333;
}

h1 {
  color: #2c3e50;
  margin-bottom: 30px;
  text-align: center;
  font-size: 2.5em;
}

/* Text input */
textarea {
  padding: 15px;
  border: 1px solid #ccc;
  border-radius: 8px;
  font-size: 1em;
  margin-bottom: 20px;
  width: 80%;
  max-width: 600px;
  box-sizing: border-box;
  resize: vertical; /* Allows vertical resizing */
  box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.1);
}

/* Primary action button */
button {
  background-color: #3498db;
  color: white;
  padding: 12px 25px;
  border: none;
  border-radius: 8px;
  cursor: pointer;
  font-size: 1.1em;
  transition: background-color 0.3s ease;
  box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.15);
}

button:hover {
  background-color: #2980b9;
}

/* Make a disabled button look inert instead of clickable. */
button:disabled,
button:disabled:hover {
  background-color: #95a5a6;
  cursor: not-allowed;
}

/* Visible keyboard focus for the interactive controls. */
button:focus-visible,
textarea:focus-visible,
#downloadLink:focus-visible {
  outline: 3px solid #2c3e50;
  outline-offset: 2px;
}

/* Status line: green by default. */
#status {
  margin-top: 20px;
  font-weight: bold;
  color: #27ae60;
}

/* Optional modifier: red text whenever an `error` class is present on #status. */
#status.error {
  color: #c0392b;
}

#audioOutput {
  margin-top: 30px;
  text-align: center;
}

/* Download link styled as a secondary button. */
#downloadLink {
  display: inline-block;
  background-color: #2ecc71;
  color: white;
  padding: 10px 20px;
  border-radius: 5px;
  text-decoration: none;
  font-size: 1em;
  transition: background-color 0.3s ease;
  box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.15);
}

#downloadLink:hover {
  background-color: #27ae60;
}

#audioPlayer {
  margin-top: 10px;
}
local_server_new.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from fastapi.responses import FileResponse
5
+ import logging
6
+ import torch
7
+ import os
8
+ from TTS.api import TTS
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
10
+ from langdetect import detect
11
+
12
+ # Allowlist XttsConfig so torch.load doesn't raise UnpicklingError
13
+ from torch.serialization import add_safe_globals
14
+ from TTS.tts.configs.xtts_config import XttsConfig
15
+ add_safe_globals([XttsConfig])
16
+
17
+ # ✅ Monkey-patch torch.load to always use weights_only=False
18
# Keep a handle on the real loader so the wrapper below can delegate to it.
_original_torch_load = torch.load


def patched_torch_load(*args, **kwargs):
    """Delegate to the original ``torch.load`` with ``weights_only`` defaulting to False.

    Callers that do not mention ``weights_only`` get full (pickle-based) loading.
    A caller that passes ``weights_only`` explicitly keeps its own choice, so code
    that deliberately asks for safe loading is not silently downgraded.

    SECURITY: ``weights_only=False`` unpickles arbitrary objects; only checkpoint
    files from trusted sources should reach this code path.

    Args:
        *args: Positional arguments forwarded unchanged to ``torch.load``.
        **kwargs: Keyword arguments forwarded to ``torch.load``.

    Returns:
        Whatever the original ``torch.load`` returns for the given arguments.
    """
    kwargs.setdefault("weights_only", False)
    return _original_torch_load(*args, **kwargs)


# Install the wrapper process-wide.
torch.load = patched_torch_load
23
+
24
# Configure the root logger to emit DEBUG-level records and above.
logging.basicConfig(level=logging.DEBUG)

# Initialize FastAPI
app = FastAPI()
# CORS is fully open: any origin, HTTP method and request header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
34
+
35
+ # Load TTS model from local files
36
def _select_device():
    """Return "cuda" when a CUDA device is available, otherwise "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


# Prefer the model files stored under /app/models/xtts_v2; if anything in that
# path fails, load the model by its registry name instead.
try:
    model_dir = "/app/models/xtts_v2"
    config_path = os.path.join(model_dir, "config.json")
    # model_path is given the directory, not a file — presumably what TTS expects
    # when config_path is supplied (TODO confirm).
    tts = TTS(model_path=model_dir, config_path=config_path).to(_select_device())
    print("XTTS v2 model loaded successfully from local files.")
except Exception as load_error:
    print(f"Error loading XTTS v2 model from local files: {load_error}")
    print("Falling back to loading by model name (license might be required).")
    tts = TTS("tts_models/multilingual/multi-dataset-xtts_v2").to(_select_device())
46
+
47
+ # Load sentiment models
48
# Arabic sentiment: the tokenizer and the classifier are loaded from the SAME
# checkpoint so that token ids match the model's vocabulary. Previously the
# tokenizer came from a different checkpoint than the model.
# NOTE(review): this looks like a base checkpoint without a fine-tuned sentiment
# head — confirm, and swap in a sentiment-tuned checkpoint whose label order is
# negative/neutral/positive if one is available.
arabic_model_name = "UBC-NLP/MARBERT"
sentiment_tokenizer = AutoTokenizer.from_pretrained(arabic_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(arabic_model_name)

# English sentiment: ready-made pipeline around an SST-2 fine-tuned DistilBERT.
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
52
+
53
+ # Input class for POST body
54
class Message(BaseModel):
    """JSON request body accepted by the text-to-speech endpoint."""

    # The text to analyse and synthesize.
    text: str
56
+
57
+ # Language detection
58
def detect_language_safely(text):
    """Return a language code for ``text`` without ever raising.

    Any character in the Unicode range U+0600–U+06FF short-circuits to ``"ar"``.
    Otherwise the result of ``detect(text)`` is returned; if detection fails for
    any reason (for example empty or non-text input) the function falls back to
    ``"en"``.

    Args:
        text: The input to classify; expected to be a string.

    Returns:
        ``"ar"``, the code produced by ``detect``, or ``"en"`` as the fallback.
    """
    if isinstance(text, str) and any("\u0600" <= ch <= "\u06FF" for ch in text):
        return "ar"
    try:
        return detect(text)
    except Exception:
        # The Arabic check already ran above, so "en" is the only fallback left.
        return "en"
65
+
66
+ # Sentiment to emotion mapping
67
def map_sentiment_to_emotion(sentiment, language="en"):
    """Translate a sentiment label into an emotion name.

    Matching is case-insensitive and substring-based for every language, so
    ``"POSITIVE"``, ``"positive"`` and ``"Positive"`` all map the same way.
    Anything that is not a string (including ``None``) maps to ``"neutral"``.

    Args:
        sentiment: Sentiment label such as ``"POSITIVE"`` or ``"negative"``.
        language: Accepted for backward compatibility; the mapping no longer
            depends on it.

    Returns:
        ``"happy"`` for positive labels, ``"sad"`` for negative labels,
        ``"neutral"`` otherwise.
    """
    label = sentiment.lower() if isinstance(sentiment, str) else ""
    if "positive" in label:
        return "happy"
    if "negative" in label:
        return "sad"
    return "neutral"
71
+
72
+ # Simple Arabic sentiment analysis
73
# Keyword lexicons for the quick Arabic sentiment pass (built once, not per call).
_AR_POSITIVE_WORDS = ("سعيد", "فرح", "ممتاز", "رائع", "جيد", "حب", "جميل", "نجاح", "أحسنت", "شكرا")
_AR_NEGATIVE_WORDS = ("حزين", "غاضب", "سيء", "فشل", "خطأ", "مشكلة", "صعب", "لا أحب", "سخيف", "مؤسف")
# Class index -> label used for the model fallback.
_AR_SENTIMENT_LABELS = ("negative", "neutral", "positive")


def arabic_sentiment_analysis(text):
    """Classify Arabic ``text`` as "positive", "negative" or "neutral".

    A keyword count decides first. Only when positive and negative counts tie
    is the module-level tokenizer/model pair consulted; if that step fails for
    any reason the result is "neutral".

    Args:
        text: The Arabic text to classify.

    Returns:
        One of "positive", "negative" or "neutral".
    """
    lowered = text.lower()
    neg_count = sum(1 for word in _AR_NEGATIVE_WORDS if word in lowered)

    # Blank out matched negative phrases before counting positives: the negative
    # phrase "لا أحب" contains the positive word "حب" and would otherwise be
    # counted on both sides, cancelling itself.
    remainder = lowered
    for word in _AR_NEGATIVE_WORDS:
        remainder = remainder.replace(word, " ")
    pos_count = sum(1 for word in _AR_POSITIVE_WORDS if word in remainder)

    if pos_count > neg_count:
        return "positive"
    if neg_count > pos_count:
        return "negative"

    # Tie (including "no keyword found"): defer to the model.
    try:
        inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():  # inference only; no gradients needed
            outputs = sentiment_model(**inputs)
        sentiment_class = torch.argmax(outputs.logits).item()
        # NOTE(review): assumes the model's class order is negative/neutral/positive — confirm.
        return _AR_SENTIMENT_LABELS[sentiment_class]
    except Exception:
        logging.exception("Arabic sentiment model failed; returning neutral")
        return "neutral"
91
+
92
+ # Main TTS endpoint
93
def _infer_emotion(text, language):
    """Best-effort emotion for ``text``; returns "neutral" if analysis fails.

    English goes through the English sentiment pipeline. Every other detected
    language is routed through the Arabic analysis, as before.
    """
    try:
        if language == "en":
            sentiment_result = sentiment_analyzer(text)[0]
            return map_sentiment_to_emotion(sentiment_result["label"])
        sentiment_result = arabic_sentiment_analysis(text)
        return map_sentiment_to_emotion(sentiment_result, language="ar")
    except Exception:
        # Sentiment is optional: log the failure instead of hiding it, then carry on.
        logging.exception("Sentiment analysis failed; using neutral emotion")
        return "neutral"


@app.post("/text-to-speech/")
def text_to_speech(msg: Message):
    """Synthesize ``msg.text`` to ``output.wav`` and report where to fetch it.

    Steps: detect the language, derive an emotion from sentiment analysis, then
    call the TTS model with the reference speaker recording.

    Args:
        msg: Request body carrying the text to synthesize.

    Returns:
        ``{"status": "success", "audio_file": ..., "url": "/audio"}`` on success,
        or ``{"status": "error", "message": ...}`` when the text is blank or
        synthesis raises.
    """
    text = msg.text
    if not text or not text.strip():
        return {"status": "error", "message": "Text must not be empty."}

    language = detect_language_safely(text)
    emotion = _infer_emotion(text, language)

    # NOTE(review): a single fixed filename is shared by all requests and by the
    # /audio route, so concurrent requests overwrite each other's output.
    output_filename = "output.wav"
    try:
        tts.tts_to_file(
            text=text,
            file_path=output_filename,
            emotion=emotion,  # NOTE(review): confirm the loaded model honours this argument
            speaker_wav="/app/audio/speaker_reference.wav",
            language=language
        )
        return {
            "status": "success",
            "audio_file": output_filename,
            "url": "/audio"
        }
    except Exception as e:
        logging.exception("Speech synthesis failed")
        return {"status": "error", "message": str(e)}
128
+
129
+ # ✅ Serve the audio file
130
@app.get("/audio")
def get_audio():
    """Serve the most recently generated ``output.wav``.

    Returns:
        A WAV file response for ``output.wav``.

    Raises:
        HTTPException: 404 when no audio file has been generated yet.
    """
    # Imported here so this route does not depend on edits to the file's import block.
    from fastapi import HTTPException

    if not os.path.isfile("output.wav"):
        # Without this check the missing file only surfaces as a server error
        # while the response is being sent.
        raise HTTPException(status_code=404, detail="No audio has been generated yet.")

    return FileResponse(
        "output.wav",
        media_type="audio/wav",
        filename="output.wav",
        # The file is rewritten in place for every request; forbid caching so
        # clients never replay an older clip.
        headers={"Cache-Control": "no-store"},
    )
requirements.txt ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core deps
2
+ numpy==1.22.0;python_version<="3.10"
3
+ numpy>=1.24.3;python_version>"3.10"
4
+ cython>=0.29.30
5
+ scipy>=1.11.2
6
+ langdetect
7
+ torch>=2.1
8
+ torchaudio==2.6.0
9
+ soundfile>=0.12.0
10
+ librosa>=0.10.0
11
+ scikit-learn>=1.3.0
12
+ numba==0.55.1;python_version<"3.9"
13
+ numba>=0.57.0;python_version>="3.9"
14
+ inflect>=7.5.0
15
+ tqdm>=4.67.1
16
+ anyascii>=0.3.2
17
+ pyyaml>=6.0.2
18
+ fsspec>=2025.3.2
19
+ aiohttp>=3.8.1
20
+ packaging>=24.2
21
+ mutagen==1.47.0
22
+ # deps for examples
23
+ flask>=3.1.0
24
+ # deps for inference
25
+ pysbd>=0.3.4
26
+ # deps for notebooks
27
+ umap-learn>=0.5.7
28
+ pandas>=1.4,<2.0
29
+ # deps for training
30
+ matplotlib>=3.8.4
31
+ # coqui stack
32
+ trainer>=0.0.36
33
+ # config management
34
+ coqpit>=0.0.17
35
+ # chinese g2p deps
36
+ jieba==0.42.1
37
+ pypinyin==0.54.0
38
+ # korean
39
+ hangul-romanize==0.1.0
40
+ # gruut+supported langs
41
+ gruut[de,es,fr]==2.2.3
42
+ gruut-ipa==0.13.0
43
+ gruut_lang_de==2.0.1
44
+ gruut_lang_en==2.0.1
45
+ gruut_lang_es==2.0.1
46
+ gruut_lang_fr==2.0.2
47
+ # deps for korean
48
+ jamo==0.4.1
49
+ nltk==3.9.1
50
+ g2pkk>=0.1.2
51
+ # deps for bangla
52
+ bangla==0.0.1
53
+ bnnumerizer==0.0.2
54
+ bnunicodenormalizer==0.1.7
55
+ # deps for tortoise
56
+ einops==0.8.1
57
+ transformers==4.51.2
58
+ # deps for bark
59
+ encodec==0.1.1
60
+ # deps for XTTS
61
+ unidecode>=1.3.8
62
+ num2words==0.5.14
63
+ spacy[ja]>=3
64
+ # Additional dependencies from TTS requirements
65
+ absl-py==2.2.2
66
+ aiohappyeyeballs==2.6.1
67
+ aiosignal==1.3.2
68
+ annotated-types==0.7.0
69
+ anyio==4.9.0
70
+ async-timeout==5.0.1
71
+ attrs==25.3.0
72
+ audioread==3.0.1
73
+ babel==2.17.0
74
+ blinker==1.9.0
75
+ catalogue==2.0.10
76
+ certifi==2025.1.31
77
+ cffi==1.17.1
78
+ charset-normalizer==3.4.1
79
+ click==8.1.8
80
+ cloudpathlib==0.21.0
81
+ colorama==0.4.6
82
+ comtypes==1.4.10
83
+ confection==0.1.5
84
+ contourpy==1.2.1
85
+ cycler==0.12.1
86
+ cymem==2.0.11
87
+ Cython==3.0.12
88
+ dateparser==1.1.8
89
+ decorator==5.2.1
90
+ docopt==0.6.2
91
+ fastapi==0.109.2
92
+ filelock==3.18.0
93
+ fonttools==4.57.0
94
+ frozenlist==1.5.0
95
+ grpcio==1.71.0
96
+ h11==0.14.0
97
+ huggingface-hub==0.30.2
98
+ idna==3.10
99
+ importlib_metadata==8.6.1
100
+ importlib_resources==6.5.2
101
+ itsdangerous==2.2.0
102
+ Jinja2==3.1.6
103
+ joblib==1.4.2
104
+ jsonlines==1.2.0
105
+ kiwisolver==1.4.7
106
+ langcodes==3.5.0
107
+ language_data==1.3.0
108
+ lazy_loader==0.4
109
+ llvmlite==0.43.0
110
+ marisa-trie==1.2.1
111
+ Markdown==3.8
112
+ markdown-it-py==3.0.0
113
+ MarkupSafe==3.0.2
114
+ mdurl==0.1.2
115
+ more-itertools==10.6.0
116
+ mpmath==1.3.0
117
+ msgpack==1.1.0
118
+ multidict==6.4.3
119
+ murmurhash==1.0.12
120
+ networkx==2.8.8
121
+ num2words==0.5.14
122
+ packaging
123
+ pandas>=1.4,<2.0
124
+ pillow==11.1.0
125
+ platformdirs==4.3.7
126
+ pooch==1.8.2
127
+ preshed==3.0.9
128
+ propcache==0.3.1
129
+ protobuf==6.30.2
130
+ psutil==7.0.0
131
+ pycparser==2.22
132
+ pydantic==1.10.21
133
+ pydantic_core==2.33.1
134
+ Pygments==2.19.1
135
+ pynndescent==0.5.13
136
+ pyparsing==3.2.3
137
+ python-crfsuite==0.9.11
138
+ python-dateutil==2.9.0.post0
139
+ pyttsx3==2.98
140
+ pytz==2025.2
141
+ regex==2024.11.6
142
+ requests==2.32.3
143
+ rich==14.0.0
144
+ safetensors==0.5.3
145
+ shellingham==1.5.4
146
+ six==1.17.0
147
+ smart-open==7.1.0
148
+ sniffio==1.3.1
149
+ soxr==0.5.0.post1
150
+ spacy-legacy==3.0.12
151
+ spacy-loggers==1.0.5
152
+ SpeechRecognition==3.14.2
153
+ srsly==2.5.1
154
+ starlette==0.36.3
155
+ SudachiDict-core==20250129
156
+ SudachiPy==0.6.10
157
+ sympy==1.13.1
158
+ tensorboard==2.19.0
159
+ tensorboard-data-server==0.7.2
160
+ thinc==8.3.4
161
+ threadpoolctl==3.6.0
162
+ tokenizers==0.21.1
163
+ typeguard==4.4.2
164
+ typer==0.15.2
165
+ typing-inspection==0.4.0
166
+ typing_extensions==4.13.2
167
+ tzdata==2025.2
168
+ tzlocal==5.3.1
169
+ Unidecode==1.3.8
170
+ urllib3==2.4.0
171
+ uvicorn==0.34.0
172
+ wasabi==1.1.3
173
+ weasel==0.4.1
174
+ Werkzeug==3.1.3
175
+ wrapt==1.17.2
176
+ yarl==1.19.0
177
+ zipp==3.21.0
178
+ # Force rebuild