Spaces:

LocaleNLP
/

LocaleNLP_Translator

Running

App Files Files Community

LocaleNLP_Translator / app.py

Mgolo

Create app.py

6bd1e51 verified 3 months ago

raw

history blame

8.27 kB

	import gradio as gr
	from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
	import torch
	import unicodedata
	import re
	import whisper
	import tempfile
	import os

	import nltk
	nltk.download('punkt')
	from nltk.tokenize import sent_tokenize

	import fitz # PyMuPDF
	import docx
	from bs4 import BeautifulSoup
	import markdown2
	import chardet

	# Device setup: use the GPU when CUDA is available, otherwise the CPU.
	device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

	# Model configuration.
	# Keys are "<source>_<target>" in lower case; each entry names the Hugging
	# Face checkpoint to load and the tag that translate() prepends to every
	# source sentence.
	MODELS = {
	    "english_wolof": {
	        "model_name": "LocaleNLP/localenlp-eng-wol-0.03",
	        "target_tag": ">>wol<<",
	    },
	    "wolof_english": {
	        "model_name": "LocaleNLP/localenlp-wol-eng-0.03",
	        "target_tag": ">>eng<<",
	    },
	    "english_hausa": {
	        "model_name": "LocaleNLP/localenlp-eng-hau-0.01",
	        "target_tag": ">>hau<<",
	    },
	    "hausa_english": {
	        "model_name": "LocaleNLP/localenlp-hau-eng-0.01",
	        "target_tag": ">>eng<<",
	    },
	}

	# Lazily populated caches, filled in by the loader functions below.
	translator = None      # dict holding the active pipeline and its target tag
	current_model = None   # MODELS key that `translator` was built for
	whisper_model = None   # Whisper model used for audio transcription

	# Access token passed to from_pretrained(); None when the variable is unset.
	HF_TOKEN = os.environ.get("HF_TOKEN")

	def load_translation_model(input_lang, output_lang):
	    """Return the translator for a language pair, loading it when needed.

	    The pair is looked up in MODELS under "<input>_<output>" (lower-cased).
	    The loaded translator is cached in the module-level ``translator`` and
	    ``current_model`` globals and rebuilt only when a different pair is
	    requested.

	    Args:
	        input_lang: Source language name, e.g. "English".
	        output_lang: Target language name, e.g. "Wolof".

	    Returns:
	        A dict with keys "pipeline" (the translation pipeline) and
	        "target_tag" (the tag configured for the pair).

	    Raises:
	        ValueError: If the language pair has no entry in MODELS.
	    """
	    global translator, current_model

	    model_key = "_".join((input_lang.lower(), output_lang.lower()))
	    config = MODELS.get(model_key)
	    if config is None:
	        raise ValueError(f"Translation from {input_lang} to {output_lang} is not supported")

	    # Cache hit: the requested pair is already loaded.
	    if current_model == model_key and translator is not None:
	        return translator

	    checkpoint = config["model_name"]
	    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, token=HF_TOKEN).to(device)
	    tokenizer = MarianTokenizer.from_pretrained(checkpoint, token=HF_TOKEN)

	    # The pipeline takes a device index: 0 for CUDA, -1 for CPU.
	    pipeline_device = 0 if device.type == 'cuda' else -1
	    translator = {
	        "pipeline": pipeline("translation", model=model, tokenizer=tokenizer,
	                             device=pipeline_device),
	        "target_tag": config["target_tag"],
	    }
	    current_model = model_key
	    return translator

	def load_whisper_model():
	    """Return the Whisper "base" model, loading it on the first call only.

	    The instance is kept in the module-level ``whisper_model`` global.
	    """
	    global whisper_model
	    if whisper_model is not None:
	        return whisper_model
	    whisper_model = whisper.load_model("base")
	    return whisper_model

	def transcribe_audio(audio_file):
	    """Transcribe an audio input to text with the shared Whisper model.

	    Args:
	        audio_file: Either a filesystem path (str) or an object with a
	            ``read()`` method returning the audio bytes.

	    Returns:
	        The "text" entry of the Whisper transcription result.
	    """
	    model = load_whisper_model()

	    if isinstance(audio_file, str):
	        return model.transcribe(audio_file)["text"]

	    # File-like input: spool the bytes to a temporary file so the model can
	    # be given a path. delete=False keeps the file after it is closed; the
	    # finally clause removes it even when reading or transcription fails.
	    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
	    try:
	        with tmp:
	            tmp.write(audio_file.read())
	        return model.transcribe(tmp.name)["text"]
	    finally:
	        os.remove(tmp.name)

	def extract_text_from_file(uploaded_file):
	    """Extract plain text from an uploaded document.

	    Supported extensions: pdf, docx, html/htm, md, srt, txt/text. The type is
	    taken from the text after the last "." in the file name.

	    Args:
	        uploaded_file: Either a filesystem path (str) or an object exposing
	            ``name`` and ``read()``.

	    Returns:
	        The extracted text as a str.

	    Raises:
	        ValueError: If the file extension is not one of the supported types.
	    """
	    import io  # local import: only needed for the in-memory DOCX stream

	    if isinstance(uploaded_file, str):
	        file_name = uploaded_file
	        with open(uploaded_file, "rb") as f:
	            content = f.read()
	    else:
	        file_name = uploaded_file.name
	        content = uploaded_file.read()
	    file_type = file_name.split('.')[-1].lower()

	    if file_type == "pdf":
	        with fitz.open(stream=content, filetype="pdf") as doc:
	            return "\n".join(page.get_text() for page in doc)

	    if file_type == "docx":
	        # Parse from the bytes already read: a file-like upload has been
	        # consumed by read() above, so handing it to Document() directly
	        # would present an exhausted stream.
	        document = docx.Document(io.BytesIO(content))
	        return "\n".join(para.text for para in document.paragraphs)

	    # Text formats: always end up with a str. When detection yields no
	    # encoding, or one Python does not know, fall back to UTF-8 instead of
	    # passing raw bytes to the str-based processing below.
	    if isinstance(content, bytes):
	        encoding = chardet.detect(content)['encoding'] or "utf-8"
	        try:
	            content = content.decode(encoding, errors='ignore')
	        except LookupError:
	            content = content.decode("utf-8", errors='ignore')

	    if file_type in ("html", "htm"):
	        return BeautifulSoup(content, "html.parser").get_text()
	    if file_type == "md":
	        html = markdown2.markdown(content)
	        return BeautifulSoup(html, "html.parser").get_text()
	    if file_type == "srt":
	        # Drop the cue index and timestamp lines; \r? also covers subtitle
	        # files saved with CRLF line endings.
	        return re.sub(r"\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\r?\n", "", content)
	    if file_type in ("txt", "text"):
	        return content
	    raise ValueError("Unsupported file type")

	def translate(text, input_lang, output_lang):
	    """Translate text paragraph by paragraph, sentence by sentence.

	    The text is split on newlines into paragraphs and each paragraph on
	    ". " into sentences. Every sentence is prefixed with the target tag of
	    the language pair and sent through the translation pipeline. Translated
	    sentences are rejoined with ". " and paragraphs with newlines, so blank
	    lines in the input are preserved.

	    Args:
	        text: The source text.
	        input_lang: Source language name, e.g. "English".
	        output_lang: Target language name, e.g. "Hausa".

	    Returns:
	        The translated text with the input's paragraph layout.

	    Raises:
	        ValueError: Propagated from load_translation_model for an
	            unsupported language pair.
	    """
	    bundle = load_translation_model(input_lang, output_lang)
	    lang_tag = bundle["target_tag"]
	    translation_pipeline = bundle["pipeline"]

	    translated_output = []
	    with torch.no_grad():
	        for para in text.split("\n"):
	            sentences = [s.strip() for s in para.split('. ') if s.strip()]
	            if not sentences:
	                # Blank paragraph, or one with nothing left after splitting
	                # (e.g. ". "): keep the line but skip the pipeline call
	                # rather than sending it an empty batch.
	                translated_output.append("")
	                continue

	            formatted = [f"{lang_tag} {s}" for s in sentences]
	            results = translation_pipeline(formatted,
	                                           max_length=5000,
	                                           num_beams=5,
	                                           early_stopping=True,
	                                           no_repeat_ngram_size=3,
	                                           repetition_penalty=1.5,
	                                           length_penalty=1.2)
	            # Upper-case only the first character. str.capitalize() would
	            # also lower-case the rest and damage proper nouns and acronyms.
	            translated_sentences = [
	                r['translation_text'][:1].upper() + r['translation_text'][1:]
	                for r in results
	            ]
	            translated_output.append('. '.join(translated_sentences))

	    return "\n".join(translated_output)

	def process_input(input_mode, text, audio_file, file_obj, input_lang):
	    """Turn the active input widget's value into plain text.

	    Args:
	        input_mode: "Text", "Audio" or "File".
	        text: Typed text, returned as-is in "Text" mode.
	        audio_file: Audio input, transcribed in "Audio" mode.
	        file_obj: Uploaded document, parsed in "File" mode.
	        input_lang: Accepted for the caller's signature; not used here.

	    Returns:
	        The text to translate, or "" when the mode is not recognised or the
	        selected mode's input is None.
	    """
	    if input_mode == "Text":
	        return text
	    if input_mode == "Audio" and audio_file is not None:
	        return transcribe_audio(audio_file)
	    if input_mode == "File" and file_obj is not None:
	        return extract_text_from_file(file_obj)
	    return ""

	def translate_and_return(text, input_lang, output_lang):
	    """Translate ``text``, or return a notice when it is empty or whitespace.

	    Args:
	        text: The text to translate.
	        input_lang: Source language name.
	        output_lang: Target language name.

	    Returns:
	        The translation, or "No input text to translate." for blank input.
	    """
	    has_content = bool(text.strip())
	    if has_content:
	        return translate(text, input_lang, output_lang)
	    return "No input text to translate."

	def update_input_lang_dropdown(input_mode):
	    """Return the input-language dropdown update for the selected mode.

	    In "Audio" mode the dropdown is set to "English" and made
	    non-interactive; in every other mode it is made interactive.
	    """
	    audio_selected = input_mode == "Audio"
	    if not audio_selected:
	        return gr.Dropdown(interactive=True)
	    return gr.Dropdown(value="English", interactive=False)

	# Gradio UI components
	with gr.Blocks() as demo:
	    gr.Markdown("## LocaleNLP Translator")
	    gr.Markdown("Translate between English, Wolof, and Hausa using Localenlp models.")

	    with gr.Row():
	        input_mode = gr.Radio(choices=["Text", "Audio", "File"], label="Select input mode", value="Text")

	    with gr.Row():
	        input_lang = gr.Dropdown(choices=["English", "Wolof", "Hausa"], label="Input Language", value="English")
	        output_lang = gr.Dropdown(choices=["English", "Wolof", "Hausa"], label="Output Language", value="Hausa")

	    # Only the widget that matches the selected input mode is visible.
	    input_text = gr.Textbox(label="Enter text", lines=10, visible=True)
	    audio_input = gr.Audio(label="Upload audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
	    file_input = gr.File(file_types=['.pdf', '.docx', '.html', '.htm', '.md', '.srt', '.txt'], label="Upload document", visible=False)

	    extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
	    translate_button = gr.Button("Translate")
	    output_text = gr.Textbox(label="Translated Text", lines=10, interactive=False)

	    def update_visibility(mode):
	        """Show the widget for ``mode`` and clear both result boxes."""
	        return {
	            input_text: gr.update(visible=(mode == "Text")),
	            audio_input: gr.update(visible=(mode == "Audio")),
	            file_input: gr.update(visible=(mode == "File")),
	            extracted_text: gr.update(value="", visible=True),
	            output_text: gr.update(value=""),
	        }

	    input_mode.change(fn=update_visibility, inputs=input_mode, outputs=[input_text, audio_input, file_input, extracted_text, output_text])
	    input_mode.change(fn=update_input_lang_dropdown, inputs=input_mode, outputs=input_lang)

	    def handle_process(mode, text, audio, file_obj, in_lang):
	        """Extract the source text; return (extracted, error_message)."""
	        try:
	            extracted = process_input(mode, text, audio, file_obj, in_lang)
	            return extracted, ""
	        except Exception as e:
	            return "", f"Error: {str(e)}"

	    def handle_translate(text, in_lang, out_lang):
	        """Translate already-extracted text."""
	        return translate_and_return(text, in_lang, out_lang)

	    def handle_click(mode, text, audio, file_obj, in_lang, out_lang):
	        """Extract, then translate, inside a single click handler.

	        Two separate click listeners would both receive the component values
	        from the moment of the click, so the translate step would see the
	        previous contents of ``extracted_text`` and both would write to
	        ``output_text``. Running the steps in sequence here translates the
	        freshly extracted text.

	        Returns:
	            (extracted_text value, output_text value); the second element is
	            an "Error: ..." message when either step fails.
	        """
	        extracted, error = handle_process(mode, text, audio, file_obj, in_lang)
	        if error:
	            return extracted, error
	        try:
	            return extracted, handle_translate(extracted, in_lang, out_lang)
	        except Exception as e:
	            # UI boundary: report the failure in the output box, in the same
	            # format handle_process uses.
	            return extracted, f"Error: {str(e)}"

	    translate_button.click(
	        fn=handle_click,
	        inputs=[input_mode, input_text, audio_input, file_input, input_lang, output_lang],
	        outputs=[extracted_text, output_text],
	    )

	demo.launch()