Spaces:
Sleeping
Sleeping
Commit
·
8380741
1
Parent(s):
27b5282
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,11 +7,14 @@ import tensorflow as tf
|
|
| 7 |
from tensorflow.keras.preprocessing.text import Tokenizer
|
| 8 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 9 |
import time
|
|
|
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
clf = load('my_model_filename.pkl')
|
| 12 |
vectorizer = load('tfidf_vectorizer.pkl')
|
| 13 |
scaler = load('scaler.joblib')
|
| 14 |
-
|
| 15 |
tukinazor = load('tokenizer.pkl')
|
| 16 |
rnn_model = load_model('path_to_my_model.h5')
|
| 17 |
bert_model = BertForSequenceClassification.from_pretrained('my_bert_model')
|
|
@@ -19,6 +22,26 @@ tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
|
| 19 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 20 |
bert_model = bert_model.to(device)
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def predict_text(text):
|
| 23 |
sequences = tukinazor.texts_to_sequences([text])
|
| 24 |
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=200, padding='post', truncating='post')
|
|
@@ -27,8 +50,19 @@ def predict_text(text):
|
|
| 27 |
return predicted_class
|
| 28 |
|
| 29 |
|
| 30 |
-
# Запуск приложения
|
| 31 |
def main():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
st.title("Модель классификации отзывов")
|
| 33 |
|
| 34 |
# Ввод текста
|
|
@@ -77,5 +111,14 @@ def main():
|
|
| 77 |
st.write(f"Прогнозируемый класс (BERT): {predictions.item() + 1}")
|
| 78 |
st.write(f"Время вычисления: {elapsed_time:.2f} сек.")
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from tensorflow.keras.preprocessing.text import Tokenizer
|
| 8 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 9 |
import time
|
| 10 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 11 |
|
| 12 |
+
model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
|
| 13 |
+
toxicity_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
|
| 14 |
+
toxicity_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
|
| 15 |
clf = load('my_model_filename.pkl')
|
| 16 |
vectorizer = load('tfidf_vectorizer.pkl')
|
| 17 |
scaler = load('scaler.joblib')
|
|
|
|
| 18 |
tukinazor = load('tokenizer.pkl')
|
| 19 |
rnn_model = load_model('path_to_my_model.h5')
|
| 20 |
bert_model = BertForSequenceClassification.from_pretrained('my_bert_model')
|
|
|
|
| 22 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 23 |
bert_model = bert_model.to(device)
|
| 24 |
|
| 25 |
+
labels = ["не токсичный", "оскорбляющий", "непристойный", "угрожающий", "опасный"]
|
| 26 |
+
def text2toxicity(text, aggregate=True):
    """Score the toxicity of ``text`` with the dedicated toxicity model.

    Uses ``toxicity_tokenizer`` / ``toxicity_model`` (the checkpoint named by
    ``model_checkpoint``), not the review-classification BERT tokenizer.

    Args:
        text: A single string or a list of strings.
        aggregate: If true, collapse the per-label probabilities into a
            single score; if false, return the per-label probabilities.

    Returns:
        aggregate=True: ``1 - p[first label] * (1 - p[last label])`` — a
            scalar for a single string, a 1-D array for a list of strings.
        aggregate=False: a dict mapping each entry of ``labels`` to its
            probability for a single string, or a list of such dicts (one
            per input text) for a list of strings.
    """
    with torch.no_grad():
        inputs = toxicity_tokenizer(
            text, return_tensors='pt', truncation=True, padding=True
        ).to(toxicity_model.device)
        # The model is multi-label, so each logit gets its own sigmoid.
        proba = torch.sigmoid(toxicity_model(**inputs).logits).cpu().numpy()

    # A single string still yields a batch of one; drop the batch axis.
    if isinstance(text, str):
        proba = proba[0]

    if aggregate:
        return 1 - proba.T[0] * (1 - proba.T[-1])

    # Pair labels with per-label probabilities, never with batch rows.
    if proba.ndim == 1:
        return dict(zip(labels, proba))
    return [dict(zip(labels, row)) for row in proba]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
def predict_text(text):
|
| 46 |
sequences = tukinazor.texts_to_sequences([text])
|
| 47 |
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=200, padding='post', truncating='post')
|
|
|
|
| 50 |
return predicted_class
|
| 51 |
|
| 52 |
|
|
|
|
| 53 |
def main():
    """Show the sidebar page selector and render the page the user picked."""
    reviews_page = "Классификация отзывов"
    toxicity_page = "Анализ токсичности"
    chosen_page = st.sidebar.selectbox("Выберите страницу:", [reviews_page, toxicity_page])

    if chosen_page == reviews_page:
        page_reviews_classification()
        return
    if chosen_page == toxicity_page:
        page_toxicity_analysis()


# NOTE(review): this guard executes before page_reviews_classification and
# page_toxicity_analysis are defined (both `def`s appear later in the file),
# so main() will likely fail with NameError when the script is run directly.
# Consider moving the guard to the very end of the file.
if __name__ == "__main__":
    main()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def page_reviews_classification():
|
| 66 |
st.title("Модель классификации отзывов")
|
| 67 |
|
| 68 |
# Ввод текста
|
|
|
|
| 111 |
st.write(f"Прогнозируемый класс (BERT): {predictions.item() + 1}")
|
| 112 |
st.write(f"Время вычисления: {elapsed_time:.2f} сек.")
|
| 113 |
|
| 114 |
+
def page_toxicity_analysis():
    """Render the toxicity page: read text, score it, show the results.

    On button press the text is passed to ``text2toxicity`` with
    ``aggregate=False``; each label's probability is written out, followed
    by the time the scoring took.
    """
    user_input_toxicity = st.text_area("Введите текст для оценки токсичности:")

    if st.button("Оценить токсичность"):
        start_time = time.time()
        probs = text2toxicity(user_input_toxicity, aggregate=False)
        elapsed_time = time.time() - start_time

        for label, prob in probs.items():
            st.write(f"Вероятность {label}: {prob:.4f}")
        # The timing was measured but never displayed; report it the same
        # way the review-classification page does.
        st.write(f"Время вычисления: {elapsed_time:.2f} сек.")
|