import os
import pickle
import warnings
from time import time

# Silence library warnings before the heavy third-party imports below run.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import streamlit as st
from ftlangdetect import detect
import iso639
import streamlit.components.v1 as components
from transformers import (
    # GPT2Config,
    # GPT2Tokenizer,
    # GPT2Model,
    BertTokenizer,
    BertModel,
)
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def _load_bert(checkpoint):
    """Load the BERT tokenizer and encoder for *checkpoint* exactly once.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loaded objects avoids re-reading the checkpoint each time.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertModel.from_pretrained(checkpoint)
    return tokenizer, model


bert_tokenizer, bert_model = _load_bert('bert-base-multilingual-uncased')

# Maps the integer label predicted by an emotion classifier to its name.
class_names = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

# GPT-2 support is currently disabled. To enable it, import GPT2Model and
# GPT2Tokenizer from transformers and initialise these two names, e.g.:
#   gpt2_model = GPT2Model.from_pretrained("gpt2")
#   gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#   gpt2_tokenizer.padding_side = "left"
#   gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
gpt2_tokenizer = None
gpt2_model = None


def tokenize_sample(texts, tokenizer="bert", max_length=128):
    """Tokenize *texts* into PyTorch tensors.

    Args:
        texts: A string or list of strings to tokenize.
        tokenizer: ``"gpt2"`` selects the GPT-2 tokenizer (padding every
            sample to ``max_length``); any other value selects the BERT
            tokenizer (padding to the longest sample in the batch).
        max_length: Sequences longer than this are truncated.

    Returns:
        The tokenizer output, built with ``return_tensors='pt'``.

    Raises:
        RuntimeError: If ``"gpt2"`` is requested while the GPT-2 tokenizer
            has not been loaded.
    """
    if tokenizer == "gpt2":
        if gpt2_tokenizer is None:
            raise RuntimeError("GPT-2 tokenizer is not loaded; GPT-2 support is disabled.")
        return gpt2_tokenizer(texts, padding="max_length", truncation=True,
                              return_tensors='pt', max_length=max_length)
    return bert_tokenizer(texts, padding=True, truncation=True,
                          return_tensors='pt', max_length=max_length)


def get_embeddings(text, model_type="bert"):
    """Return one embedding vector per input text as a NumPy array.

    The embedding is the last hidden state at sequence position 0, which is
    the [CLS] token for BERT.

    Args:
        text: A string or list of strings.
        model_type: ``"gpt2"`` to use the GPT-2 model, anything else for BERT.

    Returns:
        A 2-D NumPy array with one row per input text.

    Raises:
        RuntimeError: If ``"gpt2"`` is requested while GPT-2 is not loaded.
    """
    tokenized_text = tokenize_sample(text, model_type)
    if model_type == "gpt2":
        if gpt2_model is None:
            raise RuntimeError("GPT-2 model is not loaded; GPT-2 support is disabled.")
        # NOTE(review): GPT-2 has no [CLS] token, so position 0 may be a
        # padding token under left padding -- confirm before enabling.
        model = gpt2_model
    else:
        model = bert_model
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**tokenized_text)
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()


# Directory that contains the "models" folder with the pickled models.
path_to_models = "."
# Deployment alternative (Railway volume):
# path_to_models = os.environ['RAILWAY_VOLUME_MOUNT_PATH']+"/storage"

# Display name of each emotion classifier -> path of its pickled model.
emotion_classifier_map = {
    "Naive Bayes": f"{path_to_models}/models/naive_bayes_model.sav",
    "Logistic Regression": f"{path_to_models}/models/logistic_regression_model.sav",
    "KNN": f"{path_to_models}/models/knn_model.sav",
    "KMeans": f"{path_to_models}/models/kmeans_model.sav",
    "SVM": f"{path_to_models}/models/svm_model.sav",
    "Decision Tree": f"{path_to_models}/models/decision_tree_model.sav",
    "Random Forest": f"{path_to_models}/models/random_forest_model.sav",
}

# Language name -> path of its pickled summarization bundle.
summarizer_map = {
    "Bengali": f"{path_to_models}/models/bengali_summarization_model.sav",
}


@st.cache_resource
def _load_pickled_model(path):
    """Unpickle the object stored at *path*, cached across Streamlit reruns.

    Streamlit re-executes the script on every interaction; without the cache
    every model would be unpickled again each time.
    """
    # NOTE: pickle.load can execute arbitrary code -- only point this at
    # model files from a trusted source.
    with open(path, 'rb') as file:
        return pickle.load(file)


summarizer_models = {
    name: _load_pickled_model(path) for name, path in summarizer_map.items()
}
emotion_classfier_models = {
    name: _load_pickled_model(path) for name, path in emotion_classifier_map.items()
}


def get_emotion_prediction(input, model_name):
    """Predict the emotion name for *input* using the classifier *model_name*.

    Args:
        input: Text passed to ``get_embeddings``.
        model_name: A key of ``emotion_classfier_models``.

    Returns:
        The ``class_names`` entry for the first predicted label.

    Raises:
        ValueError: If *model_name* is not a loaded classifier.
    """
    if model_name not in emotion_classfier_models:
        raise ValueError("Model type should be of the types: {}".format(
            ", ".join(emotion_classfier_models)))
    # NOTE(review): "KMeans" is listed among the classifiers; confirm its
    # predicted ids line up with the class_names labels.
    predicted_label = emotion_classfier_models[model_name].predict(get_embeddings(input))[0]
    return class_names[predicted_label]


def decode_sequence(input_seq, max_summary_len, encoder_model, decoder_model,
                    target_word_index, reverse_target_word_index):
    """Greedily decode a summary for an already tokenized input sequence.

    The encoder output and states seed the decoder, which is then run one
    token at a time starting from ``'sostok'``. At each step the highest
    scoring token is chosen and fed back in.

    Args:
        input_seq: Input passed to ``encoder_model.predict``.
        max_summary_len: Decoding stops once ``max_summary_len - 1`` words
            have been produced.
        encoder_model: Model whose ``predict`` returns ``(e_out, e_h, e_c)``.
        decoder_model: Model whose ``predict`` returns
            ``(output_tokens, h, c)``.
        target_word_index: Mapping word -> index; must contain ``'sostok'``.
        reverse_target_word_index: Mapping index -> word.

    Returns:
        The decoded words, each preceded by a single space (so a non-empty
        result starts with a space); ``''`` if nothing was decoded.
    """
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_tokens = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: take the most probable next token.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        try:
            sampled_token = reverse_target_word_index[sampled_token_index]
        except KeyError:
            # An index with no vocabulary entry (e.g. padding index 0) cannot
            # be rendered; treat it as the end of the summary.
            break

        if sampled_token == 'eostok':
            break
        decoded_tokens.append(sampled_token)
        if len(decoded_tokens) >= max_summary_len - 1:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(f' {token}' for token in decoded_tokens)


def summarize_text(text, x_tokenizer, max_text_len, max_summary_len, encoder_model,
                   decoder_model, target_word_index, reverse_target_word_index):
    """Summarize raw *text* with the given tokenizer and encoder/decoder pair.

    The text is converted to an index sequence with *x_tokenizer*,
    post-padded to *max_text_len*, and handed to ``decode_sequence``.

    Returns:
        The string produced by ``decode_sequence``.
    """
    tokenized_sentence = pad_sequences(
        x_tokenizer.texts_to_sequences([text]), maxlen=max_text_len, padding='post')[0]
    return decode_sequence(
        tokenized_sentence.reshape(1, max_text_len), max_summary_len, encoder_model,
        decoder_model, target_word_index, reverse_target_word_index)


def _detect_language(text):
    """Return the language code that ftlangdetect reports for *text*."""
    # fastText rejects input containing newlines (possible with st.text_area),
    # so collapse all whitespace runs into single spaces first.
    return detect(text=" ".join(text.split()), low_memory=False)['lang']


def _unsupported_language_error(lang, langlist):
    """Build the error message shown when *lang* is not in *langlist*."""
    try:
        language_name = iso639.Language.from_part1(lang).name
    except iso639.LanguageNotFoundError:
        # The detector can return labels that are not ISO 639-1 codes;
        # fall back to showing the raw label instead of crashing.
        language_name = lang
    return (f"{language_name} is not supported.\n List of supported languages: "
            f"{', '.join(langlist.values())}")


def main():
    """Render the Streamlit app: a summarizer tab and an emotion-detection tab."""
    list_of_tabs = st.tabs(["Indic Multilingual Text Summarizer",
                            "Indic Multilingual Emotion Detection"])

    with list_of_tabs[0]:
        st.title('Indic Multilingual Text Summarizer')

        # Input text from the user
        input_text = st.text_area('Enter a sentence', key="summarize")

        result = None
        error = None
        langlist = {"bn": "Bengali"}

        if st.button('Summarize'):
            if not input_text.strip():
                error = "Please enter some text."
            else:
                lang = _detect_language(input_text)
                if lang in langlist:
                    bundle = summarizer_models[langlist[lang]]
                    result = summarize_text(
                        input_text,
                        bundle["x_tokenizer"],
                        bundle["max_text_len"],
                        bundle['max_summary_len'],
                        bundle['encoder_model'],
                        bundle['decoder_model'],
                        bundle['target_word_index'],
                        bundle['reverse_target_word_index'],
                    ).replace("start ", "").replace(" end", "")
                else:
                    error = _unsupported_language_error(lang, langlist)

        st.markdown("Current language support: Bengali")

        # Display the result
        if result:
            st.success(f'Summary: {result}')
        if error:
            st.error(f'Error: {error}')

    with list_of_tabs[1]:
        st.title('Indic Multilingual Emotion Detection')

        # Input text from the user
        input_sentence_emotion = st.text_input('Enter a sentence', key="emotion")

        # Model selection
        model_option = st.selectbox('Select the model', list(emotion_classfier_models.keys()))

        result = None
        error = None
        langlist = {"hi": "Hindi"}

        if st.button('Predict Emotion'):
            if not input_sentence_emotion.strip():
                error = "Please enter some text."
            else:
                lang = _detect_language(input_sentence_emotion)
                if lang in langlist:
                    result = get_emotion_prediction(input_sentence_emotion, model_option)
                else:
                    error = _unsupported_language_error(lang, langlist)

        st.markdown("Current language support: Hindi")

        # Display the result
        if result:
            st.success(f'Prediction: {result}')
        if error:
            st.error(f'Error: {error}')

    # Credits
    st.markdown("---")  # Separator
    st.markdown("""## Contributors
- Bishwaraj Paul
**Role** Intern
**Email:** bishwaraj.paul98@gmail.com / bishwaraj.paul@bahash.in
- Dr. Sahinur Rahman Laskar
**Role:** Mentor
Assistant Professor
School of Computer Science, UPES, Dehradun, India
**Email:** sahinurlaskar.nits@gmail.com / sahinur.laskar@ddn.upes.ac.in""")

    # NOTE(review): the footer markup is blank here -- confirm whether HTML
    # content is expected in this string.
    footer = """ """
    components.html(footer)

    # Handling query parameters
    query = st.query_params
    try:
        # Look up the tab from the query
        if "tab" in query:
            index_tab = query["tab"]
            # NOTE(review): the script that should click the requested tab is
            # blank here, so index_tab is currently unused -- confirm.
            js = f""" """
            components.html(js)
    except ValueError:
        # Do nothing if the query parameter does not correspond to any tab.
        pass


if __name__ == '__main__':
    main()