Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,825 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Survey_Analysis_v_3_2_86.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
#1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
|
| 11 |
+
#2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert
|
| 12 |
+
|
| 13 |
+
# Notebook shell magics (`!pip install ...`) and a bare `pip install ...`
# statement are not valid Python, so the file fails with a SyntaxError when
# it is run as a script. Install the dependencies from a shell instead:
#
#     pip install streamlit pygal squarify
import streamlit
|
| 19 |
+
|
| 20 |
+
# Commented out IPython magic to ensure Python compatibility.
|
| 21 |
+
import numpy as np
|
| 22 |
+
import pandas as pd
|
| 23 |
+
import seaborn as sns
|
| 24 |
+
import matplotlib.pyplot as plt
|
| 25 |
+
import plotly.express as px
|
| 26 |
+
import plotly.graph_objects as go
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
import pygal as py
|
| 30 |
+
import squarify as sq
|
| 31 |
+
import matplotlib
|
| 32 |
+
# Global matplotlib defaults for every chart below: large canvas, small ticks.
plt.rcParams["figure.figsize"] = (20, 15)
matplotlib.rc('xtick', labelsize=7)
matplotlib.rc('ytick', labelsize=7)

# 'normal' is not a font family name, so matplotlib logs a "findfont: Font
# family ['normal'] not found" warning for every text element and falls back
# to its default. Name the generic sans-serif family explicitly instead.
font = {
    'family': 'sans-serif',
    'weight': 'bold',
    'size': 5,
}

matplotlib.rc('font', **font)
|
| 41 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 42 |
+
import warnings
|
| 43 |
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 44 |
+
# %matplotlib inline
|
| 45 |
+
|
| 46 |
+
# Load the labelled text. pandas consumes the first CSV record as column
# names; the code below puts that record back as a data row.
# NOTE(review): this presumes the CSV has no header row -- if so,
# read_csv(..., header=None, names=['sentiment', 'news']) would be simpler,
# but it would move that record from the last row to the first.
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")
df

col1 = df.keys()[0]
col2 = df.keys()[1]
col2

# The explicit index is discarded by ignore_index=True below.
df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2], index=[4845])

# DataFrame.append and set_axis(..., inplace=...) were removed in pandas 2.0;
# pd.concat plus a plain set_axis gives the same result on old and new pandas.
df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)

df

# (The former `df.replace("neutral", "neutral")` replaced a value with itself
# and has been dropped.)

sns.countplot(y="sentiment", data=df)
# Outside a notebook, later Series.plot calls would draw onto this same axes;
# show (and thereby finish) the figure first.
plt.show()

df.isnull().sum()
|
| 64 |
+
|
| 65 |
+
from textblob import TextBlob
|
| 66 |
+
|
| 67 |
+
def preprocess(ReviewText):
    """Strip leftover HTML markup from a pandas Series of strings.

    Args:
        ReviewText: pandas Series of text values.

    Returns:
        A new Series with ``<br/>`` tags, ``<a ...>...</a>`` anchors and the
        characters ``&``, ``>``, ``<`` removed, and non-breaking spaces
        replaced by ordinary spaces.
    """
    # NOTE(review): the '&', '>' and '<' patterns may originally have been
    # HTML entities that an HTML export decoded -- confirm against the
    # notebook.
    substitutions = (
        ("(<br/>)", ""),
        ('(<a).*(>).*(</a>)', ''),
        ('(&)', ''),
        ('(>)', ''),
        ('(<)', ''),
        ('(\xa0)', ' '),
    )
    for pattern, replacement in substitutions:
        # The patterns are regular expressions. Series.str.replace defaults to
        # regex=False since pandas 2.0, which would silently match the
        # parentheses literally and clean nothing, so request regex explicitly.
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText
|
| 75 |
+
# Derived columns used by the exploratory charts below.
df['Review Text'] = preprocess(df['news'])

# TextBlob polarity of the raw (uncleaned) text, one value per row.
df['polarity'] = [TextBlob(text).sentiment.polarity for text in df['news']]
# Length in characters and in whitespace-separated tokens.
df['news_len'] = df['news'].astype(str).str.len()
df['word_count'] = df['news'].astype(str).str.split().str.len()

df
|
| 82 |
+
|
| 83 |
+
def _sample_reviews(frame, mask, k):
    """Return up to ``k`` randomly chosen 'Review Text' rows of ``frame``.

    ``mask`` is a boolean selector for ``frame``. DataFrame.sample raises a
    ValueError when asked for more rows than exist, so the sample size is
    capped at the number of matching rows.
    """
    matches = frame.loc[mask, ['Review Text']]
    return matches.sample(min(k, len(matches))).values


print('top 4 random reviews with the highest positive sentiment polarity: \n')

# De-duplicate so the same text cannot be printed twice.
df1 = df.drop_duplicates(subset=['Review Text'])

cl = _sample_reviews(df1, df1.polarity == 1, 4)
for c in cl:
    print(c[0])

print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
cl1 = _sample_reviews(df, df.polarity == 0, 5)
for c in cl1:
    print(c[0])

print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
cl3 = _sample_reviews(df, df.polarity <= -0.80, 5)
for c in cl3:
    print(c[0])
|
| 100 |
+
|
| 101 |
+
# Pass the column by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so boxplot(df["polarity"], ..., data=df) fails with
# "multiple values for argument 'data'".
sns.boxplot(x="polarity", data=df, palette="rainbow")
# Finish this figure so the histogram below does not draw onto the same axes.
plt.show()

df['polarity'].plot(
    kind='hist',
    bins=50,
    color="peru",
    title='Sentiment Polarity Distribution')
plt.show()
|
| 108 |
+
|
| 109 |
+
# Rows with a non-null sentiment label, split by the sign of the polarity.
p_s = df.loc[df["polarity"] > 0, "sentiment"].count()
neu_s = df.loc[df["polarity"] == 0, "sentiment"].count()
neg_s = df.loc[df["polarity"] < 0, "sentiment"].count()

# Wedge labels, sizes, colours and offsets, all in the same order.
sentiment = ['positive_sentiment', "neutral_sentiment", "negative_sentiment"]
values = [p_s, neu_s, neg_s]
colors = ['#FF0000', 'olive', '#FFFF00']
explode = (0.05, 0.05, 0.05)

plt.pie(
    values,
    labels=sentiment,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    pctdistance=0.85,
)

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('count of polarity as per sentiment')

# Display the chart.
plt.show()
|
| 142 |
+
|
| 143 |
+
# Word-count distribution: box plot, then histogram.
df.plot.box(y=["word_count"], color="hotpink")
# Finish the box plot so the histogram below gets its own figure.
plt.show()

df['word_count'].plot(
    kind='hist',
    bins=100,
    color="orange",
    title='Review Text Word Count Distribution')
plt.show()

# Character-length distribution: boxen plot, then histogram.
sns.boxenplot(x="news_len", data=df)
plt.show()

# news_len is the number of characters, not words; the title had been
# copy-pasted from the word-count histogram above.
df['news_len'].plot(
    kind='hist',
    bins=50,
    color="lightblue",
    title='Review Text Length Distribution')
plt.show()

fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
                 marginal_x="box", marginal_y="violin",
                 title="Click on the legend items!")
fig.show()
|
| 164 |
+
|
| 165 |
+
def _top_n_ngrams(corpus, n=None, ngram_range=(1, 1), stop_words=None):
    """Return the ``n`` most frequent n-grams of ``corpus``.

    Args:
        corpus: iterable of text documents.
        n: number of entries to return; ``None`` returns all of them.
        ngram_range: ``(min_n, max_n)`` passed to CountVectorizer.
        stop_words: stop-word setting passed to CountVectorizer.

    Returns:
        List of ``(ngram, count)`` tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the corpus-wide count of every vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


# The three public counters used to be defined twice each (with and without
# stop words), the second definition shadowing the first. They are now thin
# wrappers that take the stop-word setting as an optional argument.
# NOTE(review): after the old redefinitions a plain call filtered English stop
# words; a plain call now keeps them. No such call is visible in this file.
def get_top_n_words(corpus, n=None, stop_words=None):
    """Return the ``n`` most frequent single words of ``corpus``."""
    return _top_n_ngrams(corpus, n, ngram_range=(1, 1), stop_words=stop_words)


def get_top_n_bigram(corpus, n=None, stop_words=None):
    """Return the ``n`` most frequent two-word sequences of ``corpus``."""
    return _top_n_ngrams(corpus, n, ngram_range=(2, 2), stop_words=stop_words)


def get_top_n_trigram(corpus, n=None, stop_words=None):
    """Return the ``n`` most frequent three-word sequences of ``corpus``."""
    return _top_n_ngrams(corpus, n, ngram_range=(3, 3), stop_words=stop_words)


def _report_top_ngrams(common_words, title):
    """Print ``(ngram, count)`` pairs, chart them as bars, return the frame."""
    for word, freq in common_words:
        print(word, freq)
    frame = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
    frame.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
        kind='bar', title=title)
    # One figure per chart; otherwise successive Series.plot calls pile up on
    # the same axes when run outside a notebook.
    plt.show()
    return frame


common_words = get_top_n_words(df['Review Text'], 20)
df1 = _report_top_ngrams(common_words, 'Top 20 words in review before removing stop words')
df1

common_words = get_top_n_words(df['Review Text'], 20, stop_words='english')
df2 = _report_top_ngrams(common_words, 'Top 20 words in review after removing stop words')

common_words = get_top_n_bigram(df['Review Text'], 20)
df3 = _report_top_ngrams(common_words, 'Top 20 bigrams in review before removing stop words')

common_words = get_top_n_bigram(df['Review Text'], 20, stop_words='english')
df4 = _report_top_ngrams(common_words, 'Top 20 bigrams in review after removing stop words')

common_words = get_top_n_trigram(df['Review Text'], 20)
df5 = _report_top_ngrams(common_words, 'Top 20 trigrams in review before removing stop words')

common_words = get_top_n_trigram(df['Review Text'], 20, stop_words='english')
df6 = _report_top_ngrams(common_words, 'Top 20 trigrams in review after removing stop words')
|
| 248 |
+
|
| 249 |
+
import nltk

# NLTK resources fetched before the TextBlob tagging below.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# str(Series) is only the printable summary of the column (index labels, an
# elided middle section, the dtype footer), so tagging it analysed a handful
# of truncated rows. Join the actual texts to tag the whole corpus.
blob = TextBlob(" ".join(df['Review Text'].astype(str)))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
# Keep the 20 most frequent part-of-speech tags.
pos_df = pos_df.pos.value_counts()[:20]
pos_df.plot(
    kind='bar',
    title='Top 20 Part-of-speech tagging for review corpus')
plt.show()
|
| 262 |
+
|
| 263 |
+
def _sentiment_box_figure(column, title):
    """Build a plotly box plot of ``df[column]``, one box per sentiment label.

    Args:
        column: name of the numeric ``df`` column to plot.
        title: figure title.

    Returns:
        A ``go.Figure`` with the positive, negative and neutral boxes, in
        that order.
    """
    box_colors = {
        'positive': 'rgb(214, 12, 140)',
        'negative': 'rgb(0, 128, 128)',
        'neutral': 'rgb(10, 140, 208)',
    }
    traces = [
        go.Box(
            y=df.loc[df['sentiment'] == label][column],
            name=label,
            marker=dict(color=color),
        )
        for label, color in box_colors.items()
    ]
    return go.Figure(data=traces, layout=go.Layout(title=title))


# A bare `go.Figure(...)` expression only renders as the last line of a
# notebook cell; call .show() so the figures also appear when run as a script.
_sentiment_box_figure('polarity', "Polarity Boxplot according to sentiment").show()
_sentiment_box_figure('news_len', "news length Boxplot by sentiment").show()
|
| 326 |
+
|
| 327 |
+
# Overlaid polarity histograms, one semi-transparent trace per sentiment.
data = [
    go.Histogram(
        x=df.loc[df['sentiment'] == label, 'polarity'],
        name=label,
        opacity=0.75,
    )
    for label in ('positive', 'neutral', 'negative')
]
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
# .show() is needed outside a notebook; a bare Figure expression is discarded.
go.Figure(data=data, layout=layout).show()
|
| 346 |
+
|
| 347 |
+
def _polarity_joint_figure(y_column, y_density_name, ncontours, hovermode, y_hist_color):
    """Build a joint plot of polarity (x) against ``df[y_column]`` (y).

    The figure combines a scatter of the points, a 2-D density contour, and a
    marginal histogram for each axis.

    Args:
        y_column: name of the ``df`` column plotted on the y axis.
        y_density_name: trace name of the marginal y histogram.
        ncontours: number of contour levels of the density trace.
        hovermode: plotly layout ``hovermode`` value.
        y_hist_color: bar colour of the marginal y histogram.

    Returns:
        The assembled ``go.Figure``.
    """
    x_values = df['polarity']
    y_values = df[y_column]

    traces = [
        go.Scatter(
            x=x_values, y=y_values, mode='markers', name='points',
            marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4),
        ),
        go.Histogram2dContour(
            x=x_values, y=y_values, name='density', ncontours=ncontours,
            colorscale='Hot', reversescale=True, showscale=False,
        ),
        # Marginal histograms are placed on the secondary axes (x2 / y2).
        go.Histogram(
            x=x_values, name='Sentiment polarity density',
            marker=dict(color='rgb(102,0,0)'),
            yaxis='y2',
        ),
        go.Histogram(
            y=y_values, name=y_density_name, marker=dict(color=y_hist_color),
            xaxis='x2',
        ),
    ]

    # Main panel takes 85% of each direction, the margins the remaining 15%.
    main_axis = dict(domain=[0, 0.85], showgrid=False, zeroline=False)
    margin_axis = dict(domain=[0.85, 1], showgrid=False, zeroline=False)
    layout = go.Layout(
        showlegend=False,
        autosize=False,
        width=600,
        height=550,
        xaxis=dict(main_axis),
        yaxis=dict(main_axis),
        margin=dict(t=50),
        hovermode=hovermode,
        bargap=0,
        xaxis2=dict(margin_axis),
        yaxis2=dict(margin_axis),
    )
    return go.Figure(data=traces, layout=layout)


# A bare `go.Figure(...)` expression is discarded when run as a script, so
# show the figures explicitly.
_polarity_joint_figure(
    'news_len', 'news length density',
    ncontours=50, hovermode='x unified', y_hist_color='rgb(102,0,0)',
).show()

_polarity_joint_figure(
    'word_count', 'word count density',
    ncontours=20, hovermode='closest', y_hist_color='rgb(112,0,0)',
).show()
|
| 452 |
+
|
| 453 |
+
# `pip install ...` is a shell command, not Python, and is a SyntaxError in a
# script. Install from a shell instead:
#
#     pip install scattertext spacy
import scattertext as st
import spacy

# Blank English pipeline with only sentence splitting.
# (spaCy v2 spelling: nlp.add_pipe(nlp.create_pipe('sentencizer')).)
nlp = spacy.blank("en")
nlp.add_pipe('sentencizer')

corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))

# Top 20 terms per sentiment by scaled F-score. These lists used to be bare
# expressions, whose values are discarded outside a notebook; print them like
# the background list above.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['positive_sentiment'] = corpus.get_scaled_f_scores('positive')
print(list(term_freq_df.sort_values(by='positive_sentiment', ascending=False).index[:20]))

term_freq_df['neutral_sentiment'] = corpus.get_scaled_f_scores('neutral')
print(list(term_freq_df.sort_values(by='neutral_sentiment', ascending=False).index[:20]))

term_freq_df['negative_sentiment'] = corpus.get_scaled_f_scores('negative')
print(list(term_freq_df.sort_values(by='negative_sentiment', ascending=False).index[:20]))
|
| 474 |
+
|
| 475 |
+
from collections import Counter

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Number of latent topics extracted by the truncated SVD below.
n_topics = 10

# TF-IDF document-term matrix of the cleaned text, English stop words removed.
reindexed_data = df['Review Text'].values
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    use_idf=True,
    smooth_idf=True,
)
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)

# LSA: project every document onto n_topics components.
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
|
| 485 |
+
|
| 486 |
+
def get_keys(topic_matrix):
    '''
    For every row of topic_matrix, pick the column holding the
    largest value and return those column indices as a list of ints
    '''
    dominant_topic = topic_matrix.argmax(axis=1)
    return dominant_topic.tolist()
|
| 493 |
+
|
| 494 |
+
def keys_to_counts(keys):
    '''
    Tally how often each topic key occurs; returns a
    (categories, counts) tuple of two parallel lists, ordered by
    first appearance of each key
    '''
    tally = Counter(keys)
    return (list(tally.keys()), list(tally.values()))
|
| 503 |
+
|
| 504 |
+
# Dominant topic index per document (the same computation get_keys does) ...
lsa_keys = lsa_topic_matrix.argmax(axis=1).tolist()
# ... and how many documents fall under each topic.
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
|
| 506 |
+
|
| 507 |
+
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
|
| 508 |
+
'''
|
| 509 |
+
returns a list of n_topic strings, where each string contains the n most common
|
| 510 |
+
words in a predicted category, in order
|
| 511 |
+
'''
|
| 512 |
+
top_word_indices = []
|
| 513 |
+
for topic in range(n_topics):
|
| 514 |
+
temp_vector_sum = 0
|
| 515 |
+
for i in range(len(keys)):
|
| 516 |
+
if keys[i] == topic:
|
| 517 |
+
temp_vector_sum += document_term_matrix[i]
|
| 518 |
+
temp_vector_sum = temp_vector_sum.toarray()
|
| 519 |
+
top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0)
|
| 520 |
+
top_word_indices.append(top_n_word_indices)
|
| 521 |
+
top_words = []
|
| 522 |
+
for topic in top_word_indices:
|
| 523 |
+
topic_words = []
|
| 524 |
+
for index in topic:
|
| 525 |
+
temp_word_vector = np.zeros((1,document_term_matrix.shape[1]))
|
| 526 |
+
temp_word_vector[:,index] = 1
|
| 527 |
+
the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
|
| 528 |
+
topic_words.append(the_word.encode('ascii').decode('utf-8'))
|
| 529 |
+
top_words.append(" ".join(topic_words))
|
| 530 |
+
return top_words
|
| 531 |
+
|
| 532 |
+
top_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)

for i, topic_words in enumerate(top_lsa):
    print("Topic {}: ".format(i+1), topic_words)

# Same call with the same arguments as above: reuse the result instead of
# recomputing it.
top_3_words = top_lsa
labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories]

# Number of documents per dominant LSA topic.
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(lsa_categories, lsa_counts, color="skyblue")
ax.set_xticks(lsa_categories)
ax.set_xticklabels(labels, rotation=45, rotation_mode='default', color="olive")
ax.set_ylabel('Number of review text on topics')
ax.set_title('Count of LSA topics')
plt.show()
|
| 546 |
+
|
| 547 |
+
"""#---2----"""

df['sentiment'].value_counts()

from sklearn.model_selection import train_test_split

# 80/20 random split into training and evaluation rows.
train, eva = train_test_split(df, test_size=0.2)

# `!pip install ...` is notebook-only syntax and a SyntaxError in a script.
# Install from a shell instead:
#
#     pip install simpletransformers
from simpletransformers.classification import ClassificationModel

# Create a Transformer Model BERT (three output labels, CPU only)
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=3,
    args={'reprocess_input_data': True, 'overwrite_output_dir': True},
    use_cuda=False,
)
|
| 560 |
+
|
| 561 |
+
# Label ids: 0 = positive, 1 = negative (and anything unrecognised), 2 = neutral
def making_label(st):
    """Translate a sentiment string into its integer class id."""
    if st == 'positive':
        return 0
    return 2 if st == 'neutral' else 1
|
| 569 |
+
|
| 570 |
+
# assign() returns new frames instead of writing a column into the frames
# returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning.
train = train.assign(label=train['sentiment'].apply(making_label))
eva = eva.assign(label=eva['sentiment'].apply(making_label))
print(train.shape)

# Two-column frames (text, label) built for the classifier: the first 1500
# training rows and the last 400 evaluation rows, with newlines in the text
# replaced by spaces.
train_df = pd.DataFrame({
    'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
    'label': train['label'][:1500]
})

eval_df = pd.DataFrame({
    'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
    'label': eva['label'][-400:]
})

model.train_model(train_df)
|
| 585 |
+
|
| 586 |
+
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

result

model_outputs

len(wrong_predictions)

# Predicted class id = index of the largest raw output per evaluated row.
lst = [np.argmax(arr) for arr in model_outputs]

true = eval_df['label'].tolist()
predicted = lst

import sklearn.metrics

# Class ids and their names, in the encoding used by making_label:
# 0 = positive, 1 = negative, 2 = neutral.
label_ids = [0, 1, 2]
label_names = ['positive', 'negative', 'neutral']

# Pinning `labels` keeps the matrix 3x3 even when a class is absent from both
# the true and the predicted labels; otherwise the DataFrame below would fail.
mat = sklearn.metrics.confusion_matrix(true, predicted, labels=label_ids)
mat

df_cm = pd.DataFrame(mat, range(3), range(3))

sns.heatmap(df_cm, annot=True)
plt.show()

# target_names must follow the label-id order. The previous order
# ('positive', 'neutral', 'negative') swapped the names of classes 1 and 2.
print(sklearn.metrics.classification_report(
    true, predicted, labels=label_ids, target_names=label_names))

# Printed because a bare expression's value is discarded outside a notebook.
print(sklearn.metrics.accuracy_score(true, predicted))
|
| 613 |
+
|
| 614 |
+
# Classify a single statement and print its sentiment
def get_result(statement, classifier=None):
    """Print the predicted sentiment ('positive', 'negative' or 'neutral').

    Args:
        statement: the text to classify.
        classifier: object whose ``predict([text])`` returns a
            ``(predictions, raw_outputs)`` pair; defaults to the module-level
            ``model``.

    Returns:
        None. The label is written to standard output.
    """
    if classifier is None:
        classifier = model
    result = classifier.predict([statement])
    # argmax returns a single index even when two classes tie. The previous
    # int(np.where(...)[0]) raised on ties and relied on converting an array
    # to a scalar, which NumPy has deprecated.
    pos = int(np.argmax(result[1][0]))
    sentiment_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    print(sentiment_dict[pos])
    return
|
| 622 |
+
|
| 623 |
+
# Example statements run through the classifier, in order. The first three
# are annotated with their expected label.
_demo_statements = [
    # neutral statement
    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
    # positive statement
    "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
    # negative statement
    'Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .',
    "This company is growing like anything with 23% profit every year",
    "This company is not able to make any profit but make very less profit in last quarter",
    "The doctor treated well and the patient was very healthy",
    "the act of politicians is to serve and help needy and not to create ruck suck",
    "American burger is too good. Can't resisit to go and have one",
    "GDP per capita increased to double in India from 2013",
    "Indian economy is doing very good and will become super power one day.",
    "Indian economy is doing very good and will create millions of jobs in coming years",
    "Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years",
    "Indian economy is doing very good.Indian economy is not doing very good ",
    "Indian economy is not doing very good. Indian economy will bounce back to become leading economy",
    "Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export",
    "The stock market of Indian economy is dangling too much",
]

for _statement in _demo_statements:
    get_result(_statement)
|
| 657 |
+
|
| 658 |
+
"""#VADER"""
|
| 659 |
+
|
| 660 |
+
# The notebook shell magic is not valid Python in a plain .py module (it is a
# SyntaxError at import time), so the install step is kept as a comment.
# Install the dependency before running this file:
#   pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single analyzer instance shared by the VADER examples that follow.
obj = SentimentIntensityAnalyzer()
|
| 665 |
+
|
| 666 |
+
# Score three plain sentences. `sentence` and `sentiment_dict` are rebound on
# every pass, so afterwards they hold the last example and its scores.
for sentence in (
    "Ram is really good ",
    # check this
    "Ram is better ",
    "Rahul is really bad",
):
    sentiment_dict = obj.polarity_scores(sentence)
    print(sentiment_dict)

# Contrast groups: each group varies one surface feature of the text so the
# printed scores can be compared line by line.
_vader_examples = (
    # punctuation
    'Ram is good boy',
    'Ram is good boy!',
    'Ram is good boy!!',
    # capitalization
    'Ram is good',
    'Ram is GOOD',
    # degree (positive)
    'Ram is good',
    'Ram is better',
    'Ram is best',
    # degree (negative)
    'Ram is bad',
    'Ram is worse',
    'Ram is worst',
    # conjunction
    'Ram is good',
    'Ram is good, but he is also naughty sometimes',
    # slang
    "That Hotel",
    "That Hotel SUX",
    "That Hotel SUCKS",
    # emoticons (positive)
    "Your :) is the most beautiful thing I have ever seen",
    "Your smile is the most beautiful thing I have ever seen",
    # emoticons (negative)
    "Your :( is the worst thing I have ever seen",
    "Your smile is the worst thing I have ever seen",
)

for _vader_example in _vader_examples:
    print(obj.polarity_scores(_vader_example))
|
| 712 |
+
|
| 713 |
+
#https://360digitmg.com/blog/bert-variants-and-their-differences
|
| 714 |
+
#https://simpletransformers.ai/docs/classification-specifics/#supported-model-types Official reference
|
| 715 |
+
|
| 716 |
+
"""#3.a Using FINBERT Model"""
|
| 717 |
+
|
| 718 |
+
#PPT
|
| 719 |
+
#https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6
|
| 720 |
+
|
| 721 |
+
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# tested in transformers==4.18.0
import transformers

# A bare expression is only echoed inside a notebook; print it so the
# installed version is visible when this file runs as a script.
print(transformers.__version__)

# FinBERT "tone" checkpoint loaded with a three-label classification head.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(['growth is strong and we have plenty of liquidity.',
               'there is a shortage of capital, and we need extra financing.',
               'formulation patents might protect Vasotec to a limited extent.'])

# Printed explicitly: a bare `results` statement produces no output in a script.
print(results)
|
| 736 |
+
|
| 737 |
+
"""#FINBERT ESG"""
|
| 738 |
+
|
| 739 |
+
# FinBERT "esg" checkpoint loaded with a four-label classification head.
# Rebinds the module-level `finbert`, `tokenizer`, `nlp` and `results` names.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')

nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(['Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
               'Rhonda has been volunteering for several years for a variety of charitable community programs.',
               'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
               'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.'])

# Printed explicitly: a bare `results` statement produces no output in a script.
print(results)
|
| 749 |
+
|
| 750 |
+
"""#FINBERT Classification"""
|
| 751 |
+
|
| 752 |
+
# FinBERT "fls" checkpoint loaded with a three-label classification head.
# Rebinds the module-level `finbert`, `tokenizer`, `nlp` and `results` names.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')

nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(['we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
               'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
               'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in'])

# Printed explicitly: a bare `results` statement produces no output in a script.
print(results)
|
| 761 |
+
|
| 762 |
+
# Texts to classify and their reference labels, taken from the dataframe.
X = df['Review Text'].to_list()
y = df['sentiment'].to_list()

from transformers import BertTokenizer, BertForSequenceClassification

finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Maps the index of the largest logit to a label name.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

# Predicted label for every text in X, in the same order.
sent_val = list()
for x in X:
    # Truncate to the model's position-embedding limit; without this a long
    # text makes the forward pass fail instead of being classified.
    inputs = tokenizer_whole(
        x,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=finbert_whole.config.max_position_embeddings,
    )
    # First element of the model output holds the logits.
    outputs = finbert_whole(**inputs)[0]

    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '---->', val)
    print('#######################################################')
    sent_val.append(val)

from sklearn.metrics import accuracy_score
print(accuracy_score(y, sent_val))
|
| 784 |
+
|
| 785 |
+
"""#Using DISTILBERT"""
|
| 786 |
+
|
| 787 |
+
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Maps the index of the largest logit to a label name.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# The head size must match `labels`; the library default is two outputs, which
# would make index 2 ('negative') impossible to predict.
# NOTE: "distilbert-base-uncased" is a base checkpoint, so its classification
# head is newly initialised on load; the accuracy below is an untrained baseline.
model_distilbert = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(labels)
)

# Predicted label for every text in X, in the same order.
sent_val_bert = list()
for x in X:
    # Truncate to the model's position-embedding limit so long texts do not
    # make the forward pass fail.
    inputs = tokenizer_distilbert(
        x,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=model_distilbert.config.max_position_embeddings,
    )
    # First element of the model output holds the logits.
    outputs = model_distilbert(**inputs)[0]

    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert.append(val)

from sklearn.metrics import accuracy_score
# Score this model's own predictions (previously the FinBERT list `sent_val`
# was passed here, so the printed number did not describe DistilBERT at all).
print(accuracy_score(y, sent_val_bert))
|
| 806 |
+
|
| 807 |
+
"""#Bert"""
|
| 808 |
+
|
| 809 |
+
# Imported here so this section does not depend on an earlier cell having run.
from transformers import BertTokenizer, BertForSequenceClassification

# Maps the index of the largest logit to a label name.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

# "bert-base-uncased" is a BERT checkpoint, so it is loaded with the BERT
# classes; loading it through the DistilBERT classes mismatches the
# architecture and the stored weights.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
# The head size must match `labels`; the library default is two outputs.
# NOTE: this is a base checkpoint, so the classification head is newly
# initialised on load; the accuracy below is an untrained baseline.
model_bert = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(labels)
)

# Predicted label for every text in X, in the same order.
sent_val_bert1 = list()
for x in X:
    # Truncate to the model's position-embedding limit so long texts do not
    # make the forward pass fail.
    inputs = tokenizer_bert(
        x,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=model_bert.config.max_position_embeddings,
    )
    # First element of the model output holds the logits.
    outputs = model_bert(**inputs)[0]

    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert1.append(val)

from sklearn.metrics import accuracy_score
# Score this model's own predictions (previously the FinBERT list `sent_val`
# was passed here, so the printed number did not describe BERT at all).
print(accuracy_score(y, sent_val_bert1))
|