Mgolo committed
Commit 595752f · verified · 1 Parent(s): b19fe5a

Update app.py

Files changed (1)
  1. app.py +224 -128
app.py CHANGED
@@ -1,99 +1,127 @@
 import gradio as gr
-from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
 import torch
-import tempfile
-import os
 import whisper
 import fitz  # PyMuPDF
 import docx
 from bs4 import BeautifulSoup
 import markdown2
 import chardet
-import re

-# Device setup
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

-# Global model holders
-translator = None
-whisper_model = None

-# Model configurations
-MODELS = {
     ("English", "Wolof"): {"model_name": "LocaleNLP/localenlp-eng-wol-0.03", "tag": ">>wol<<"},
     ("Wolof", "English"): {"model_name": "LocaleNLP/localenlp-wol-eng-0.03", "tag": ">>eng<<"},
     ("English", "Hausa"): {"model_name": "LocaleNLP/localenlp-eng-hau-0.01", "tag": ">>hau<<"},
     ("Hausa", "English"): {"model_name": "LocaleNLP/localenlp-hau-eng-0.01", "tag": ">>eng<<"},
-    ("English", "Darija"): {"model_name": "LocaleNLP/english_darija", "tag": ">>dar<<"},
 }

-HF_TOKEN = os.getenv("hffff")
-
-def load_model(input_lang, output_lang):
-    global translator
-    key = (input_lang, output_lang)
-    if key not in MODELS:
-        raise ValueError("Language pair not supported.")
-    cfg = MODELS[key]
-    if translator is None or translator.model.config._name_or_path != cfg["model_name"]:
-        model = AutoModelForSeq2SeqLM.from_pretrained(cfg["model_name"], token=HF_TOKEN).to(device)
-        tokenizer = MarianTokenizer.from_pretrained(cfg["model_name"], token=HF_TOKEN)
-        translator = pipeline("translation", model=model, tokenizer=tokenizer, device=0 if device.type=='cuda' else -1)
-    return translator, cfg["tag"]
-
-def load_whisper_model():
-    global whisper_model
-    if whisper_model is None:
-        whisper_model = whisper.load_model("base")
-    return whisper_model
-
-def transcribe_audio(audio_file):
-    model = load_whisper_model()
-    if isinstance(audio_file, str):
-        audio_path = audio_file
-    else:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            tmp.write(audio_file.read())
-            audio_path = tmp.name
-    result = model.transcribe(audio_path)
-    if not isinstance(audio_file, str):
-        os.remove(audio_path)
-    return result["text"]

-def extract_text_from_file(uploaded_file):
-    if isinstance(uploaded_file, str):
-        file_path = uploaded_file
-        file_type = file_path.split('.')[-1].lower()
-        with open(file_path, "rb") as f:
-            content = f.read()
-    else:
-        file_type = uploaded_file.name.split('.')[-1].lower()
-        content = uploaded_file.read()

-    if file_type == "pdf":
         with fitz.open(stream=content, filetype="pdf") as doc:
-            return "\n".join([page.get_text() for page in doc])
-    elif file_type == "docx":
-        doc = docx.Document(file_path if isinstance(uploaded_file, str) else uploaded_file)
-        return "\n".join([para.text for para in doc.paragraphs])
     else:
-        encoding = chardet.detect(content)['encoding']
-        content = content.decode(encoding, errors='ignore') if encoding else content
-        if file_type in ("html", "htm"):
-            return BeautifulSoup(content, "html.parser").get_text()
-        elif file_type == "md":
-            html = markdown2.markdown(content)
-            return BeautifulSoup(html, "html.parser").get_text()
-        elif file_type == "srt":
-            return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", content)
-        elif file_type in ("txt", "text"):
-            return content
-        else:
-            raise ValueError("Unsupported file type")
-
-def translate_text(text, input_lang, output_lang):
-    translator, tag = load_model(input_lang, output_lang)
-    paragraphs = text.split("\n")
     translated_output = []

     with torch.no_grad():
@@ -101,70 +129,138 @@ def translate_text(text, input_lang, output_lang):
             if not para.strip():
                 translated_output.append("")
                 continue
-            sentences = [s.strip() for s in para.split('. ') if s.strip()]
-            formatted = [f"{tag} {s}" for s in sentences]
-            results = translator(formatted,
-                                 max_length=5000,
-                                 num_beams=5,
-                                 early_stopping=True,
-                                 no_repeat_ngram_size=3,
-                                 repetition_penalty=1.5,
-                                 length_penalty=1.2)
-            translated_sentences = [r['translation_text'].capitalize() for r in results]
-            translated_output.append('. '.join(translated_sentences))
     return "\n".join(translated_output)

-def process_input(input_mode, input_lang, text, audio_file, file_obj):
-    if input_mode == "Audio" and input_lang != "English":
-        raise ValueError("Audio input must be in English.")
-    if input_mode == "Text":
-        return text
-    elif input_mode == "Audio" and audio_file is not None:
-        return transcribe_audio(audio_file)
-    elif input_mode == "File" and file_obj is not None:
-        return extract_text_from_file(file_obj)
     return ""

-# Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("## LocaleNLP Multi-language Translator")
-    gr.Markdown("Translate between English, Wolof, and Hausa. Now, audio input only accepts English.")

     with gr.Row():
-        input_mode = gr.Radio(choices=["Text", "Audio", "File"], label="Input type", value="Text")
-        input_lang = gr.Dropdown(choices=["English", "Wolof", "Hausa"], label="Input language", value="English")
-        output_lang = gr.Dropdown(choices=["English", "Wolof", "Hausa","Darija"], label="Output language", value="Wolof")

-    input_text = gr.Textbox(label="Enter text", lines=10, visible=True)
-    audio_input = gr.Audio(label="Upload audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
-    file_input = gr.File(file_types=['.pdf', '.docx', '.html', '.htm', '.md', '.srt', '.txt'], label="Upload document", visible=False)

     extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
     translate_button = gr.Button("Translate")
     output_text = gr.Textbox(label="Translated Text", lines=10, interactive=False)

-    def update_visibility(mode):
-        return {
-            input_text: gr.update(visible=(mode=="Text")),
-            audio_input: gr.update(visible=(mode=="Audio")),
-            file_input: gr.update(visible=(mode=="File")),
-            extracted_text: gr.update(value="", visible=True),
-            output_text: gr.update(value="")
-        }
     input_mode.change(fn=update_visibility, inputs=input_mode, outputs=[input_text, audio_input, file_input, extracted_text, output_text])

-    def handle_process(mode, lang_in, text, audio, file_obj):
-        try:
-            extracted = process_input(mode, lang_in, text, audio, file_obj)
-            return extracted, ""
-        except Exception as e:
-            return "", f"Error: {str(e)}"
-    translate_button.click(fn=handle_process, inputs=[input_mode, input_lang, input_text, audio_input, file_input], outputs=[extracted_text, output_text])
-
-    def handle_translate(text, lang_in, lang_out):
-        if not text.strip():
-            return "No input text to translate."
-        return translate_text(text, lang_in, lang_out)
-    translate_button.click(fn=handle_translate, inputs=[extracted_text, input_lang, output_lang], outputs=output_text)
-
-demo.launch()
 
+import os
+import re
+import tempfile
+import logging
+from typing import Optional, Dict, Tuple, Any
+from pathlib import Path
+
 import gradio as gr
 import torch
 import whisper
 import fitz  # PyMuPDF
 import docx
 from bs4 import BeautifulSoup
 import markdown2
 import chardet
+from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
+

+# -------------------------------
+# Configuration & Logging Setup
+# -------------------------------

+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# Language Pair Models
+MODELS: Dict[Tuple[str, str], Dict[str, str]] = {
     ("English", "Wolof"): {"model_name": "LocaleNLP/localenlp-eng-wol-0.03", "tag": ">>wol<<"},
     ("Wolof", "English"): {"model_name": "LocaleNLP/localenlp-wol-eng-0.03", "tag": ">>eng<<"},
     ("English", "Hausa"): {"model_name": "LocaleNLP/localenlp-eng-hau-0.01", "tag": ">>hau<<"},
     ("Hausa", "English"): {"model_name": "LocaleNLP/localenlp-hau-eng-0.01", "tag": ">>eng<<"},
+    ("English", "Darija"): {"model_name": "LocaleNLP/english_darija", "tag": ">>dar<<"}
 }

+SUPPORTED_LANGUAGES = ["English", "Wolof", "Hausa", "Darija"]
+INPUT_MODES = ["Text", "Audio", "File"]
+SUPPORTED_FILE_TYPES = [".pdf", ".docx", ".html", ".htm", ".md", ".srt", ".txt"]
+
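Since MODELS is keyed by (source, target) tuples, direction matters: only the pairs listed above are reachable. An illustrative check against the table as defined here:

```python
# Direction-sensitive lookup: each (source, target) tuple must appear
# explicitly in MODELS; load_translation_model rejects anything else.
assert ("English", "Wolof") in MODELS
assert ("Wolof", "English") in MODELS
assert ("Darija", "English") not in MODELS  # only English -> Darija is defined
```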
+# -------------------------------
+# Model Manager
+# -------------------------------
+
+class ModelManager:
+    """Manages loading and caching of translation and transcription models."""
+
+    def __init__(self):
+        self.translation_pipeline = None
+        self.whisper_model = None
+
+    def load_translation_model(self, src_lang: str, tgt_lang: str) -> Tuple[Any, str]:
+        key = (src_lang, tgt_lang)
+        if key not in MODELS:
+            raise ValueError(f"Unsupported language pair: {src_lang} -> {tgt_lang}")
+
+        config = MODELS[key]
+        model_name = config["model_name"]
+        lang_tag = config["tag"]
+
+        if self.translation_pipeline is None or self.translation_pipeline.model.config._name_or_path != model_name:
+            logger.info(f"Loading translation model: {model_name}")
+            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=HF_TOKEN).to(DEVICE)
+            tokenizer = MarianTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+            self.translation_pipeline = pipeline(
+                "translation",
+                model=model,
+                tokenizer=tokenizer,
+                device=0 if DEVICE.type == "cuda" else -1
+            )
+        return self.translation_pipeline, lang_tag
+
+    def load_whisper_model(self) -> Any:
+        if self.whisper_model is None:
+            logger.info("Loading Whisper base model...")
+            self.whisper_model = whisper.load_model("base")
+        return self.whisper_model
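The pipeline's `model.config._name_or_path` doubles as the cache key, so repeated requests for the same pair reuse the loaded pipeline while a different pair forces a reload. A minimal usage sketch, assuming the LocaleNLP checkpoints are reachable with a valid HF_TOKEN:

```python
# Minimal sketch of the caching behaviour (checkpoint access assumed).
manager = ModelManager()
pipe, tag = manager.load_translation_model("English", "Wolof")  # loads the model
pipe, tag = manager.load_translation_model("English", "Wolof")  # reuses the cached pipeline
pipe, tag = manager.load_translation_model("English", "Hausa")  # new model name -> reloads
```

Note that only one pipeline is held at a time, so alternating between pairs reloads on every switch.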

+# -------------------------------
+# File Processing Utilities
+# -------------------------------
+
+def extract_text_from_file(file_path: str) -> str:
+    """Extracts text from various file types."""
+    ext = Path(file_path).suffix.lower()
+    content = Path(file_path).read_bytes()
+
+    if ext == ".pdf":
         with fitz.open(stream=content, filetype="pdf") as doc:
+            return "\n".join(page.get_text() for page in doc)
+
+    elif ext == ".docx":
+        doc = docx.Document(file_path)
+        return "\n".join(p.text for p in doc.paragraphs)
+
+    elif ext in (".html", ".htm"):
+        return BeautifulSoup(content.decode("utf-8", errors="ignore"), "html.parser").get_text()
+
+    elif ext == ".md":
+        html = markdown2.markdown(content.decode("utf-8", errors="ignore"))
+        return BeautifulSoup(html, "html.parser").get_text()
+
+    elif ext == ".srt":
+        decoded = content.decode("utf-8", errors="ignore")
+        return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", decoded)
+
+    elif ext in (".txt", ".text"):
+        encoding = chardet.detect(content)["encoding"]
+        return content.decode(encoding or "utf-8", errors="ignore")
+
     else:
+        raise ValueError(f"Unsupported file type: {ext}")
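A quick, self-contained check of the `.srt` branch, showing what the timestamp-stripping regex leaves behind:

```python
# The regex removes the cue number and timestamp line of each SRT block,
# keeping only the subtitle text.
import re

srt = "1\n00:00:01,000 --> 00:00:03,000\nHello there.\n"
print(re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", srt))
# -> "Hello there.\n"
```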
+
+
+# -------------------------------
+# Translation Logic
+# -------------------------------
+
+def translate_text(text: str, src_lang: str, tgt_lang: str, model_manager: ModelManager) -> str:
+    """Translates input text using the specified language pair."""
+    pipe, tag = model_manager.load_translation_model(src_lang, tgt_lang)
+    paragraphs = text.splitlines()
     translated_output = []

     with torch.no_grad():

             if not para.strip():
                 translated_output.append("")
                 continue
+            sentences = [s.strip() for s in para.split(". ") if s.strip()]
+            formatted = [f"{tag} {sentence}" for sentence in sentences]
+            results = pipe(
+                formatted,
+                max_length=5000,
+                num_beams=5,
+                early_stopping=True,
+                no_repeat_ngram_size=3,
+                repetition_penalty=1.5,
+                length_penalty=1.2
+            )
+            translated_sentences = [r["translation_text"].capitalize() for r in results]
+            translated_output.append(". ".join(translated_sentences))
+
     return "\n".join(translated_output)
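The `>>xx<<` prefix follows the Marian multilingual convention of selecting the target language with a tag token. An illustration of the naive `". "`-based split and tagging performed above:

```python
# Sentence splitting and tag prefixing as done inside translate_text.
# Note the split drops each sentence-ending period; ". ".join() restores
# them between sentences when the translations are reassembled.
tag = ">>wol<<"
para = "Hello world. How are you?"
sentences = [s.strip() for s in para.split(". ") if s.strip()]
print([f"{tag} {s}" for s in sentences])
# -> ['>>wol<< Hello world', '>>wol<< How are you?']
```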

+
+# -------------------------------
+# Audio Transcription
+# -------------------------------
+
+def transcribe_audio(file_path: str, model_manager: ModelManager) -> str:
+    """Transcribes an audio file using Whisper."""
+    model = model_manager.load_whisper_model()
+    result = model.transcribe(file_path)
+    return result["text"]
+
+
+# -------------------------------
+# Main Processing Function
+# -------------------------------
+
+def process_input(
+    mode: str,
+    src_lang: str,
+    text_input: str,
+    audio_path: Optional[str],
+    file_obj: Optional[Any]
+) -> str:
+    """Processes input based on the selected mode."""
+    if mode == "Text":
+        return text_input
+    elif mode == "Audio":
+        if src_lang != "English":
+            raise ValueError("Audio input must be in English.")
+        if not audio_path:
+            raise ValueError("No audio file uploaded.")
+        return transcribe_audio(audio_path, model_manager)
+    elif mode == "File":
+        if not file_obj:
+            raise ValueError("No file uploaded.")
+        return extract_text_from_file(file_obj.name)
     return ""

+
+# -------------------------------
+# Gradio UI Logic
+# -------------------------------
+
+model_manager = ModelManager()
+
+
+def update_visibility(mode: str) -> Dict[Any, Any]:
+    """Update visibility of input components based on the selected mode."""
+    return {
+        input_text: gr.update(visible=(mode == "Text")),
+        audio_input: gr.update(visible=(mode == "Audio")),
+        file_input: gr.update(visible=(mode == "File")),
+        extracted_text: gr.update(value="", visible=True),
+        output_text: gr.update(value="")
+    }
+
+
+def handle_process(
+    mode: str,
+    src_lang: str,
+    text_input: str,
+    audio_path: Optional[str],
+    file_obj: Optional[Any]
+) -> Tuple[str, str]:
+    """Handles the initial processing of input."""
+    try:
+        extracted = process_input(mode, src_lang, text_input, audio_path, file_obj)
+        return extracted, ""
+    except Exception as e:
+        logger.error(f"Processing error: {e}")
+        return "", f"Error: {str(e)}"
+
+
+def handle_translate(extracted_text: str, src_lang: str, tgt_lang: str) -> str:
+    """Handles translation of the extracted text."""
+    if not extracted_text.strip():
+        return "No input text to translate."
+    try:
+        return translate_text(extracted_text, src_lang, tgt_lang, model_manager)
+    except Exception as e:
+        logger.error(f"Translation error: {e}")
+        return f"Translation error: {str(e)}"
+
+
+# -------------------------------
+# Gradio Interface
+# -------------------------------
+
+with gr.Blocks(title="LocaleNLP Translator") as demo:
+    gr.Markdown("## 🌍 LocaleNLP Multi-language Translator")
+    gr.Markdown("Supports translation between English, Wolof, Hausa, and Darija. Audio input must be in English.")

     with gr.Row():
+        input_mode = gr.Radio(choices=INPUT_MODES, label="Input Type", value="Text")
+        input_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES[:-1], label="Input Language", value="English")
+        output_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, label="Output Language", value="Wolof")

+    input_text = gr.Textbox(label="Enter Text", lines=10, visible=True)
+    audio_input = gr.Audio(label="Upload Audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
+    file_input = gr.File(file_types=SUPPORTED_FILE_TYPES, label="Upload Document", visible=False)

     extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
     translate_button = gr.Button("Translate")
     output_text = gr.Textbox(label="Translated Text", lines=10, interactive=False)

     input_mode.change(fn=update_visibility, inputs=input_mode, outputs=[input_text, audio_input, file_input, extracted_text, output_text])

+    translate_button.click(
+        fn=handle_process,
+        inputs=[input_mode, input_lang, input_text, audio_input, file_input],
+        outputs=[extracted_text, output_text]
+    ).then(
+        fn=handle_translate,
+        inputs=[extracted_text, input_lang, output_lang],
+        outputs=output_text
+    )
+
+if __name__ == "__main__":
+    demo.launch()
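The chained `.click(...).then(...)` replaces the old pattern of binding two independent handlers to the same button: `handle_translate` now runs only after `handle_process` has finished writing `extracted_text`. A minimal standalone sketch of the same chaining pattern, using hypothetical components that are not part of the app:

```python
# Hypothetical two-step chain: the second handler fires after the first
# completes and reads the component the first one populated.
import gradio as gr

with gr.Blocks() as chain_demo:
    box_a = gr.Textbox(label="A")
    box_b = gr.Textbox(label="B")
    go = gr.Button("Go")
    go.click(fn=lambda: "step 1", outputs=box_a).then(
        fn=lambda a: f"{a} -> step 2", inputs=box_a, outputs=box_b
    )
```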