Spaces:

Verathagnus
/

portfolio

Running

App Files Files Community

portfolio / app.py

Verathagnus

Update app.py

b4b5a5e verified about 1 year ago

raw

history blame contribute delete

12.2 kB

	import numpy as np
	import pandas as pd
	from time import time
	from sklearn.model_selection import train_test_split
	from tensorflow.keras.preprocessing.text import Tokenizer
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense
	from tensorflow.keras.models import Model
	from tensorflow.keras.callbacks import EarlyStopping
	import warnings
	warnings.filterwarnings('ignore')

	import pickle
	import streamlit as st
	from ftlangdetect import detect
	import iso639
	import streamlit.components.v1 as components
	import os
	gpt2_tokenizer = None
	gpt2_model = None
	from transformers import (
	# GPT2Config,
	# GPT2Tokenizer,
	# GPT2Model,
	BertTokenizer,
	BertModel)
	import torch

	# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
	# NOTE(review): nothing in the visible code moves the models or their inputs
	# to `device` — confirm whether that is intentional.
	if torch.cuda.is_available():
		device = torch.device("cuda")
	else:
		device = torch.device("cpu")

	# Multilingual BERT tokenizer/model pair used to embed input sentences.
	bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
	bert_model = BertModel.from_pretrained('bert-base-multilingual-uncased')

	# Integer label predicted by the emotion classifiers -> emotion name.
	class_names = dict(enumerate(('sadness', 'joy', 'love', 'anger', 'fear', 'surprise')))
	import os

	# GPT-2 is currently disabled, so both handles stay unset. The commented
	# lines below show how the GPT-2 model and tokenizer were configured.
	gpt2_tokenizer = gpt2_model = None
	# gpt2_model = GPT2Model.from_pretrained("gpt2")
	# gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
	# gpt2_tokenizer.padding_side = "left"
	# gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
	# Preprocessing: tokenize with a short (128-token) maximum length.
	def tokenize_sample(texts, tokenizer="bert"):
		"""Tokenize *texts* into PyTorch tensors, truncated to 128 tokens.

		Args:
			texts: Text (or texts) handed straight to the selected tokenizer.
			tokenizer: ``"gpt2"`` selects the module-level GPT-2 tokenizer and
				pads every sequence to the full 128 tokens; any other value
				selects the BERT tokenizer with ``padding=True``.

		Returns:
			Whatever the selected tokenizer returns for ``return_tensors='pt'``.
		"""
		use_gpt2 = tokenizer == "gpt2"
		encode = gpt2_tokenizer if use_gpt2 else bert_tokenizer
		padding_mode = "max_length" if use_gpt2 else True
		return encode(texts, padding=padding_mode, truncation=True, return_tensors='pt', max_length=128)
	def get_embeddings(text, model_type="bert"):
		"""Embed *text* with the selected transformer and return a NumPy array.

		Args:
			text: Text (or texts) forwarded to ``tokenize_sample``.
			model_type: ``"gpt2"`` runs the module-level GPT-2 model; any other
				value runs the BERT model.

		Returns:
			The hidden state at sequence position 0 of every input, converted
			to a NumPy array (one row per input text).
		"""
		tokenized_text = tokenize_sample(text, model_type)
		model = gpt2_model if model_type == "gpt2" else bert_model
		# Inference only: skip autograd bookkeeping so no graph is built and
		# intermediate activations are freed immediately.
		with torch.no_grad():
			outputs = model(**tokenized_text)
		# Position 0 is the [CLS] token for BERT inputs. `.cpu()` is a no-op on
		# CPU tensors and keeps the NumPy conversion valid if the model is ever
		# moved to a GPU.
		embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
		return embeddings

	# Directory that contains the `models/` folder with the pickled models.
	path_to_models = "."
	# path_to_models = os.environ['RAILWAY_VOLUME_MOUNT_PATH']+"/storage"

	# Classifier display name -> path of its pickled model file.
	emotion_classifier_map = {
		label: f"{path_to_models}/models/{stem}_model.sav"
		for label, stem in (
			("Naive Bayes", "naive_bayes"),
			("Logistic Regression", "logistic_regression"),
			("KNN", "knn"),
			("KMeans", "kmeans"),
			("SVM", "svm"),
			("Decision Tree", "decision_tree"),
			("Random Forest", "random_forest"),
		)
	}

	# Language name -> path of its pickled summarization bundle.
	summarizer_map = dict(
		Bengali=f"{path_to_models}/models/bengali_summarization_model.sav",
	)
	# Deployment debugging aids, kept for reference:
	# print(os.listdir())
	# print(os.environ["RAILWAY_VOLUME_MOUNT_PATH"])
	# print(os.listdir(os.environ["RAILWAY_VOLUME_MOUNT_PATH"]+"/storage"))
	def _load_pickled_models(paths):
		"""Unpickle every file in *paths* and return ``{name: loaded object}``.

		Args:
			paths: Mapping of display name -> path of a pickle file.

		Returns:
			A new dict with the same keys, in the same order, holding the
			unpickled objects.

		Raises:
			OSError: If a file cannot be opened.
		"""
		# SECURITY: pickle.load executes arbitrary code embedded in the file.
		# Only point these maps at model files from a trusted source.
		loaded = {}
		for name, path in paths.items():
			with open(path, 'rb') as handle:
				loaded[name] = pickle.load(handle)
		return loaded

	# Loaded once at import time: summarizers first, then emotion classifiers.
	summarizer_models = _load_pickled_models(summarizer_map)
	# The misspelled name (`classfier`) is referenced elsewhere in the file,
	# so it is kept as is.
	emotion_classfier_models = _load_pickled_models(emotion_classifier_map)
	def get_emotion_prediction(input, model_name):
		"""Return the emotion name the chosen classifier predicts for *input*.

		Args:
			input: Sentence to classify; it is embedded via ``get_embeddings``.
			model_name: Key of ``emotion_classfier_models`` selecting the
				classifier.

		Returns:
			The ``class_names`` entry for the first predicted label.

		Raises:
			ValueError: If *model_name* is not a loaded classifier.
		"""
		# Reject unknown model names up front.
		if model_name not in emotion_classfier_models:
			raise ValueError("Model type should be of the types: {}".format(", ".join(list(emotion_classfier_models.keys()))))
		classifier = emotion_classfier_models[model_name]
		features = get_embeddings(input)
		# One input is classified, so only the first prediction is relevant.
		predicted_label = classifier.predict(features)[0]
		return class_names[predicted_label]

	def decode_sequence(input_seq, max_summary_len, encoder_model, decoder_model, target_word_index, reverse_target_word_index):
		"""Greedily decode a summary for one encoded input sequence.

		Args:
			input_seq: Model input handed to ``encoder_model.predict``.
			max_summary_len: Decoding stops once the summary holds
				``max_summary_len - 1`` words.
			encoder_model: Object whose ``predict`` returns three values
				(presumably encoder outputs plus hidden and cell state).
			decoder_model: Object whose ``predict`` takes
				``[target_seq, e_out, e_h, e_c]`` and returns
				``(output_tokens, h, c)``.
			target_word_index: Maps word -> index; must contain ``'sostok'``.
			reverse_target_word_index: Maps index -> word.

		Returns:
			The decoded words, each preceded by a single space (so a non-empty
			result starts with a space); ``''`` when nothing was decoded.
		"""
		# Encode the input once; the states seed the decoder.
		e_out, e_h, e_c = encoder_model.predict(input_seq)

		# The decoder is fed one token at a time, starting with the start word.
		target_seq = np.zeros((1, 1))
		target_seq[0, 0] = target_word_index['sostok']

		words = []
		word_count = 0
		word_limit = max_summary_len - 1
		# Every ordinary token adds at least one word, so the word limit is
		# reached within this many steps. The explicit cap only matters for a
		# decoder that keeps emitting whitespace-only tokens, which would
		# otherwise never terminate.
		max_steps = max(1, word_limit)
		steps = 0
		while steps < max_steps:
			steps += 1
			output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

			# Greedy choice: the most probable token at the last position.
			sampled_token_index = np.argmax(output_tokens[0, -1, :])
			try:
				sampled_token = reverse_target_word_index[sampled_token_index]
			except (KeyError, IndexError):
				# The index has no word (e.g. a padding index): end the
				# summary here instead of crashing.
				break

			if sampled_token == 'eostok':
				break
			words.append(sampled_token)
			# Same count as splitting the whole sentence, without re-splitting
			# it on every step.
			word_count += len(sampled_token.split())
			if word_count >= word_limit:
				break

			# Feed the sampled token and the new states into the next step.
			target_seq = np.zeros((1, 1))
			target_seq[0, 0] = sampled_token_index
			e_h, e_c = h, c

		return ''.join(' ' + word for word in words)

	def summarize_text(text, x_tokenizer, max_text_len, max_summary_len, encoder_model, decoder_model, target_word_index, reverse_target_word_index):
		"""Tokenize and pad *text*, then decode its summary.

		Args:
			text: Source text to summarize.
			x_tokenizer: Tokenizer providing ``texts_to_sequences``.
			max_text_len: Length the token sequence is padded (at the end) to.
			max_summary_len, encoder_model, decoder_model, target_word_index,
			reverse_target_word_index: Passed through to ``decode_sequence``.

		Returns:
			The string produced by ``decode_sequence``.
		"""
		token_ids = x_tokenizer.texts_to_sequences([text])
		padded_row = pad_sequences(token_ids, maxlen=max_text_len, padding='post')[0]
		# The encoder is given a batch containing this single sequence.
		encoder_input = padded_row.reshape(1, max_text_len)
		return decode_sequence(encoder_input, max_summary_len, encoder_model, decoder_model, target_word_index, reverse_target_word_index)

	def _detect_language_code(text):
		"""Return the language code that ``detect`` reports for *text*."""
		# The text area accepts multi-line input, but the fastText model behind
		# `detect` rejects strings containing a newline, so line breaks are
		# collapsed to spaces first.
		single_line = " ".join(text.splitlines())
		return detect(text=single_line, low_memory=False)['lang']


	def _unsupported_language_message(lang, langlist):
		"""Build the error text shown when *lang* is not a key of *langlist*."""
		try:
			language_name = iso639.Language.from_part1(lang).name
		except iso639.LanguageNotFoundError:
			# The detector can report codes that are not two-letter ISO 639-1
			# codes; show the raw code rather than crashing the page.
			language_name = lang
		return f"{language_name} is not supported.\n List of supported languages: {', '.join(langlist.values())}"


	def _show_outcome(label, result, error):
		"""Display a success box for *result* and/or an error box for *error*."""
		if result:
			st.success(f'{label}: {result}')
		if error:
			st.error(f'Error: {error}')


	def _render_credits():
		"""Render the separator, the contributor list and the HTML footer."""
		st.markdown("---") # Separator
		st.markdown("""## Contributors
	- Bishwaraj Paul
	Role Intern
	Email: bishwaraj.paul98@gmail.com / bishwaraj.paul@bahash.in
	- Dr. Sahinur Rahman Laskar
	Role: Mentor
	Assistant Professor
	School of Computer Science, UPES, Dehradun, India
	Email: sahinurlaskar.nits@gmail.com / sahinur.laskar@ddn.upes.ac.in""")
		footer = """<style>
	.footer-text{
	-webkit-text-size-adjust: 100%;
	-webkit-tap-highlight-color: transparent;
	--blue: #007bff;
	--indigo: #6610f2;
	--purple: #6f42c1;
	--pink: #e83e8c;
	--red: #dc3545;
	--orange: #fd7e14;
	--yellow: #ffc107;
	--green: #28a745;
	--teal: #20c997;
	--cyan: #17a2b8;
	--white: #fff;
	--gray: #6c757d;
	--gray-dark: #343a40;
	--primary: #007bff;
	--secondary: #6c757d;
	--success: #28a745;
	--info: #17a2b8;
	--warning: #ffc107;
	--danger: #dc3545;
	--light: #f8f9fa;
	--dark: #343a40;
	--breakpoint-xs: 0;
	--breakpoint-sm: 576px;
	--breakpoint-md: 768px;
	--breakpoint-lg: 992px;
	--breakpoint-xl: 1200px;
	--font-family-sans-serif: -apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";
	--font-family-monospace: SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;
	font-size: 16px;
	font-weight: 400;
	line-height: 24px;
	letter-spacing: 1px;
	font-family: 'Raleway', sans-serif;
	color: #666;
	box-sizing: border-box;
	text-align: center!important;
	}
	@media (min-width: 576px) {
	.col-sm-12 {
	-webkit-box-flex: 0;
	-ms-flex: 0 0 100%;
	flex: 0 0 100%;
	max-width: 100%;
	}
	}
	.row {
	display: -webkit-box;
	display: -ms-flexbox;
	display: flex;
	-ms-flex-wrap: wrap;
	flex-wrap: wrap;
	margin-right: -15px;
	margin-left: -15px;
	}
	@media (min-width: 1200px) {
	.container {
	max-width: 1140px;
	}
	}
	@media (min-width: 992px) {
	.container {
	max-width: 960px;
	}
	}
	@media (min-width: 768px) {
	.container {
	max-width: 720px;
	}
	}
	@media (min-width: 576px) {
	.container {
	max-width: 540px;
	}
	}
	.container {
	width: 100%;
	padding-right: 15px;
	padding-left: 15px;
	margin-right: auto;
	margin-left: auto;
	}
	.footer-bottom-area {
	padding: 30px 0;
	display: block;
	box-sizing: border-box;
	}
	.footer-bottom-bg {
	background: #222;
	}
	</style>
	<footer class="footer-bottom-area footer-bottom-bg">
	<div class="container">
	<div class="row">
	<div class="col-sm-12">
	<div class="footer-text">
	<p style="color: white; font-style: sans-serif;"><span>Bahash Private Limited</span> ©2024 - All Right Reserved.</p>
	</div>
	</div>
	</div>
	</div>
	</footer>
	"""
		components.html(footer)


	def _activate_tab_from_query():
		"""Click the tab whose element id is given by the ``tab`` query parameter."""
		import json  # local import: only needed to quote the parameter for JavaScript

		query = st.query_params
		if "tab" not in query:
			return
		# The query string is attacker-controlled. json.dumps turns it into a
		# quoted, escaped JavaScript string literal, and escaping "<" keeps a
		# "</script>" inside the value from terminating the script element.
		tab_id_literal = json.dumps(query["tab"]).replace("<", "\\u003c")
		# The null check makes an id that matches no tab a silent no-op in the
		# browser instead of a JavaScript error.
		js = f"""
	<script>
	var tab = window.parent.document.getElementById({tab_id_literal});
	if (tab) {{ tab.click(); }}
	</script>
	"""
		st.components.v1.html(js)


	def main():
		"""Render the Streamlit app: a summarizer tab and an emotion-detection tab.

		Each tab reads a sentence, detects its language on button press and
		either runs the matching model or reports that the language is not
		supported. Credits and a footer follow, and a ``tab`` query parameter
		can pre-select a tab.
		"""
		list_of_tabs = st.tabs(["Indic Multilingual Text Summarizer", "Indic Multilingual Emotion Detection"])

		# --- Tab 1: summarization -------------------------------------------
		with list_of_tabs[0]:
			st.title('Indic Multilingual Text Summarizer')
			text_to_summarize = st.text_area('Enter a sentence', key="summarize")

			result = None
			error = None
			langlist = {"bn": "Bengali"}
			if st.button('Summarize'):
				if not text_to_summarize.strip():
					# Language detection on empty input is meaningless.
					error = "Please enter some text first."
				else:
					lang = _detect_language_code(text_to_summarize)
					if lang in langlist:
						bundle = summarizer_models[langlist[lang]]
						result = summarize_text(text_to_summarize, bundle["x_tokenizer"], bundle["max_text_len"], bundle['max_summary_len'], bundle['encoder_model'], bundle['decoder_model'], bundle['target_word_index'], bundle['reverse_target_word_index']).replace("start ", "").replace(" end", "")
					else:
						error = _unsupported_language_message(lang, langlist)
			st.markdown("Current language support: Bengali")
			_show_outcome('Summary', result, error)

		# --- Tab 2: emotion detection ---------------------------------------
		with list_of_tabs[1]:
			st.title('Indic Multilingual Emotion Detection')
			input_sentence_emotion = st.text_input('Enter a sentence', key="emotion")
			model_option = st.selectbox('Select the model', list(emotion_classfier_models.keys()))

			result = None
			error = None
			langlist = {"hi": "Hindi"}
			if st.button('Predict Emotion'):
				if not input_sentence_emotion.strip():
					error = "Please enter some text first."
				else:
					lang = _detect_language_code(input_sentence_emotion)
					if lang in langlist:
						result = get_emotion_prediction(input_sentence_emotion, model_option)
					else:
						error = _unsupported_language_message(lang, langlist)
			st.markdown("Current language support: Hindi")
			_show_outcome('Prediction', result, error)

		# NOTE(review): the original indentation was lost; the credits are
		# assumed to render below both tabs rather than inside the second one
		# — confirm against the deployed layout.
		_render_credits()

		# Handling query parameters
		_activate_tab_from_query()

	# Build the page only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
		main()