from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch
from gtts import gTTS
import gradio as gr
import tempfile
# Fine-tuning notes (kept for reference; nothing in this commented block runs in the app).
#
# The checkpoint served by this app was fine-tuned on the rahular/itihasa
# English-Sanskrit corpus, starting from the pre-trained English-to-Hindi model
# Helsinki-NLP/opus-mt-en-hi. The Colab training pipeline, cleaned up:
#
# import torch
# from datasets import load_dataset
# from transformers import (
#     AutoTokenizer,
#     AutoModelForSeq2SeqLM,
#     DataCollatorForSeq2Seq,
#     Trainer,
#     TrainingArguments,
# )
#
# model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# (An earlier draft also tried AutoModelForCausalLM with a fallback to
# M2M100ForConditionalGeneration; that experiment was not used.)
#
# raw_datasets = load_dataset("rahular/itihasa")
# print(raw_datasets)
# print(raw_datasets["train"].column_names)
#
# Quick tokenizer sanity checks:
# tokenizer("Hello, this is a sentence!")
# with tokenizer.as_target_tokenizer():
#     print(tokenizer(["कोन्वस्मिन् साम्प्रतं लोके गुणवान् कश्च वीर्यवान्। धर्मज्ञश्च कृतज्ञश्च सत्यवाक्यो दृढत्नतः॥"]))
#
# max_input_length = 128
# max_target_length = 128
# source_lang = "en"
# target_lang = "sn"
#
# def preprocess_function(examples):
#     inputs = [ex[source_lang] for ex in examples["translation"]]
#     targets = [ex[target_lang] for ex in examples["translation"]]
#     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
#     # Tokenize the Sanskrit targets with the target-side tokenizer
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(targets, max_length=max_target_length, truncation=True)
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs
#
# tokenized_datasets = raw_datasets.map(
#     preprocess_function,
#     batched=True,
#     remove_columns=raw_datasets["train"].column_names,
# )
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
#
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     report_to=["none"],
# )
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
# )
# trainer.train()
# model.save_pretrained("/content/drive/My Drive/my_model")
# tokenizer.save_pretrained("/content/drive/My Drive/my_tokenizer")
#
# A TensorFlow/Keras variant was also tried, reloading the checkpoint saved to
# Drive and batching with prepare_tf_dataset:
#
# from transformers import TFAutoModelForSeq2SeqLM, AdamWeightDecay
#
# model_checkpoint = "/content/drive/My Drive/my_model"
# tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/my_tokenizer")
# model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")
#
# train_dataset = model.prepare_tf_dataset(
#     tokenized_datasets["train"],
#     shuffle=True,
#     batch_size=8,
#     collate_fn=data_collator,
# )
# validation_dataset = model.prepare_tf_dataset(
#     tokenized_datasets["validation"],
#     shuffle=False,
#     batch_size=8,
#     collate_fn=data_collator,
# )
# optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
# model.compile(optimizer=optimizer)
# Fine-tuned English-to-Sanskrit checkpoint served by this app.
# (Alternative checkpoint id, not loaded: "Rask6723/IT_GR7_En-Sn")
model_name = "SweUmaVarsh/m2m100-en-sa-translation"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
def translate_and_speak(text):
    # Prepend an English language tag; stock M2M100 would normally set
    # tokenizer.src_lang and pass forced_bos_token_id instead of a text prefix.
    input_text = "en " + text
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    generated_tokens = model.generate(**encoded, max_length=128, num_beams=5, early_stopping=True)
    output = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    # Strip any leftover language tags from the decoded output
    for tag in ["__en__", "__sa__", "en", "sa"]:
        output = output.replace(tag, "")
    sanskrit_text = output.strip()
    # Convert to speech; gTTS has no Sanskrit voice, so the Hindi voice reads the Devanagari text
    tts = gTTS(sanskrit_text, lang='hi')
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts.save(fp.name)
        audio_path = fp.name
    return sanskrit_text, audio_path
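# Optional local smoke test of translate_and_speak, left commented out so it does not
# run (and call gTTS) on every app start; the sample sentence is illustrative only.
# sanskrit, audio = translate_and_speak("Knowledge is the greatest wealth.")
# print(sanskrit)
# print("audio written to:", audio)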
iface = gr.Interface(
    fn=translate_and_speak,
    inputs=gr.Textbox(label="Enter English Text"),
    outputs=[gr.Textbox(label="Sanskrit Translation"), gr.Audio(label="Sanskrit Speech")],
    title="Final Year Project: English to Sanskrit Translator (IT 'A' 2021–2025)",
    description="Enter a sentence in English to get its Sanskrit translation and audio output.",
)

iface.launch()
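# Note: the plain launch() above is enough on Hugging Face Spaces. When running this
# script locally, launching with Gradio's share flag instead gives a temporary public
# URL (optional):
# iface.launch(share=True)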