Pietro Lesci committed
Commit: 02c2d7e · 1 Parent(s): a66b528

divide into individual files
Browse files:
- src/plotting.py +71 -0
- src/preprocessing.py +200 -0
- src/utils.py +9 -260
- src/wordifier.py +87 -0
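This commit carves the plotting, preprocessing, and model code out of the monolithic src/utils.py into dedicated modules. A minimal sketch of how a caller's imports look after the split; the app entry point is hypothetical, only the module paths and names come from this commit:

# Hypothetical caller (e.g. the Streamlit app script) after the refactor.
from src.plotting import plot_labels_prop, plot_nchars, plot_score
from src.preprocessing import TextPreprocessor, encode
from src.utils import download_button, get_logo, read_file
from src.wordifier import wordifier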
src/plotting.py
ADDED
@@ -0,0 +1,71 @@
+import altair as alt
+import pandas as pd
+import streamlit as st
+from stqdm import stqdm
+
+stqdm.pandas()
+
+
+def plot_labels_prop(data: pd.DataFrame, label_column: str):
+
+    unique_value_limit = 100
+
+    if data[label_column].nunique() > unique_value_limit:
+
+        st.warning(
+            f"""
+            The column you selected has more than {unique_value_limit} unique values.
+            Are you sure it's the right column? If it is, please note that
+            this will impact __Wordify__ performance.
+            """
+        )
+
+        return
+
+    source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
+    source["Props"] = source["Counts"] / source["Counts"].sum()
+    source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
+
+    bars = (
+        alt.Chart(source)
+        .mark_bar()
+        .encode(
+            x=alt.X("Labels:O", sort="-y"),
+            y="Counts:Q",
+        )
+    )
+
+    text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
+
+    return (bars + text).properties(height=300)
+
+
+def plot_nchars(data: pd.DataFrame, text_column: str):
+    source = data[text_column].str.len().to_frame()
+
+    plot = (
+        alt.Chart(source)
+        .mark_bar()
+        .encode(
+            alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
+            alt.Y("count()", axis=alt.Axis(title="")),
+        )
+    )
+
+    return plot.properties(height=300)
+
+
+def plot_score(data: pd.DataFrame, label_col: str, label: str):
+
+    source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
+
+    plot = (
+        alt.Chart(source)
+        .mark_bar()
+        .encode(
+            y=alt.Y("word:O", sort="-x"),
+            x="score:Q",
+        )
+    )
+
+    return plot.properties(height=max(30 * source.shape[0], 50))
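For orientation, a minimal sketch of how these helpers might be wired into a Streamlit script; the sample DataFrame and the st.altair_chart wiring are assumptions, not part of this commit:

# Hypothetical usage of the new plotting module.
import pandas as pd
import streamlit as st
from src.plotting import plot_labels_prop, plot_nchars

df = pd.DataFrame(
    {"text": ["good movie", "bad plot", "great acting"], "label": ["pos", "neg", "pos"]}
)

chart = plot_labels_prop(df, "label")  # returns None past the 100-unique-values guard
if chart is not None:
    st.altair_chart(chart, use_container_width=True)
st.altair_chart(plot_nchars(df, "text"), use_container_width=True)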
src/preprocessing.py
ADDED
@@ -0,0 +1,200 @@
+import re
+import string
+from collections import OrderedDict
+from typing import Callable, Dict, List
+
+import numpy as np
+import pandas as pd
+import spacy
+import streamlit as st
+from pandas.core.series import Series
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import LabelEncoder
+from stqdm import stqdm
+from textacy.preprocessing import make_pipeline, normalize, remove, replace
+
+from .configs import Languages
+
+stqdm.pandas()
+
+
+def encode(text: pd.Series, labels: pd.Series):
+    """
+    Encodes text into a mathematical object amenable to a training algorithm
+    """
+    tfidf_vectorizer = TfidfVectorizer(
+        input="content",  # default: file already in memory
+        encoding="utf-8",  # default
+        decode_error="strict",  # default
+        strip_accents=None,  # do nothing
+        lowercase=False,  # do nothing
+        preprocessor=None,  # do nothing - default
+        tokenizer=None,  # default
+        stop_words=None,  # do nothing
+        analyzer="word",
+        ngram_range=(1, 3),  # maximum 3-grams
+        min_df=0.001,
+        max_df=0.75,
+        sublinear_tf=True,
+    )
+    label_encoder = LabelEncoder()
+
+    with st.spinner("Encoding text using TF-IDF and Encoding labels"):
+        X = tfidf_vectorizer.fit_transform(text.values)
+        y = label_encoder.fit_transform(labels.values)
+
+    return {
+        "X": X,
+        "y": y,
+        "X_names": np.array(tfidf_vectorizer.get_feature_names()),
+        "y_names": label_encoder.classes_,
+    }
+
+
+# more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
+# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
+# fmt: off
+_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
+def normalize_acronyms(t):
+    return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
+
+
+_re_non_word = re.compile(r"\W")
+def remove_non_word(t):
+    return _re_non_word.sub(" ", t)
+
+
+_re_space = re.compile(r" {2,}")
+def normalize_useless_spaces(t):
+    return _re_space.sub(" ", t)
+
+
+_re_rep = re.compile(r"(\S)(\1{2,})")
+def normalize_repeating_chars(t):
+    def _replace_rep(m):
+        c, cc = m.groups()
+        return c
+
+    return _re_rep.sub(_replace_rep, t)
+
+
+_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
+def normalize_repeating_words(t):
+    def _replace_wrep(m):
+        c, cc, e = m.groups()
+        return c
+
+    return _re_wrep.sub(_replace_wrep, t)
+
+# fmt: on
+class TextPreprocessor:
+    def __init__(
+        self,
+        language: str,
+        cleaning_steps: List[str],
+        lemmatizer_when: str = "last",
+        remove_stop: bool = True,
+    ) -> None:
+
+        # prepare lemmatizer
+        self.language = language
+        self.nlp = spacy.load(
+            Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
+        )
+        self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
+        self.remove_stop = remove_stop
+        self._lemmatize = self._get_lemmatizer()
+
+        # prepare cleaning
+        self.cleaning_steps = [
+            self._cleaning_options()[step]
+            for step in cleaning_steps
+            if step in self._cleaning_options()
+        ]
+        self.cleaning_pipeline = (
+            make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
+        )
+
+    def _get_lemmatizer(self) -> Callable:
+        """Return the correct spacy Doc-level lemmatizer"""
+        if self.remove_stop:
+
+            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
+                """Lemmatizes spacy Doc and removes stopwords"""
+                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
+
+        else:
+
+            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
+                """Lemmatizes spacy Doc"""
+                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
+
+        return lemmatizer
+
+    @staticmethod
+    def _lemmatization_options() -> Dict[str, str]:
+        return {
+            "Before preprocessing": "first",
+            "After preprocessing": "last",
+            "Never! Let's do it quick and dirty": None,
+        }
+
+    def lemmatizer(self, series: pd.Series) -> pd.Series:
+        """
+        Apply spacy pipeline to transform string to spacy Doc and apply lemmatization
+        """
+        res = []
+        pbar = stqdm(total=len(series))
+        for doc in self.nlp.pipe(series, batch_size=500):
+            res.append(self._lemmatize(doc))
+            pbar.update(1)
+        pbar.close()
+        return pd.Series(res)
+
+    @staticmethod
+    def _cleaning_options():
+        """Returns available cleaning steps in order"""
+        return OrderedDict(
+            [
+                ("lower", lambda x: x.lower()),
+                ("normalize_unicode", normalize.unicode),
+                ("normalize_bullet_points", normalize.bullet_points),
+                ("normalize_hyphenated_words", normalize.hyphenated_words),
+                ("normalize_quotation_marks", normalize.quotation_marks),
+                ("normalize_whitespace", normalize.whitespace),
+                ("replace_urls", replace.urls),
+                ("replace_currency_symbols", replace.currency_symbols),
+                ("replace_emails", replace.emails),
+                ("replace_emojis", replace.emojis),
+                ("replace_hashtags", replace.hashtags),
+                ("replace_numbers", replace.numbers),
+                ("replace_phone_numbers", replace.phone_numbers),
+                ("replace_user_handles", replace.user_handles),
+                ("normalize_acronyms", normalize_acronyms),
+                ("remove_accents", remove.accents),
+                ("remove_brackets", remove.brackets),
+                ("remove_html_tags", remove.html_tags),
+                ("remove_punctuation", remove.punctuation),
+                ("remove_non_words", remove_non_word),
+                ("normalize_useless_spaces", normalize_useless_spaces),
+                ("normalize_repeating_chars", normalize_repeating_chars),
+                ("normalize_repeating_words", normalize_repeating_words),
+                ("strip", lambda x: x.strip()),
+            ]
+        )
+
+    def fit_transform(self, series: pd.Series) -> Series:
+        """Applies text preprocessing"""
+
+        if self.lemmatizer_when == "first":
+            with st.spinner("Lemmatizing"):
+                series = self.lemmatizer(series)
+
+        with st.spinner("Cleaning"):
+            series = series.progress_map(self.cleaning_pipeline)
+
+        if self.lemmatizer_when == "last":
+            with st.spinner("Lemmatizing"):
+                series = self.lemmatizer(series)
+
+        return series
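A minimal usage sketch for the new TextPreprocessor; the "English" language key is an assumption about the Languages enum in src/configs.py, while the cleaning-step names are real keys of _cleaning_options:

# Hypothetical usage of TextPreprocessor.
import pandas as pd
from src.preprocessing import TextPreprocessor

prep = TextPreprocessor(
    language="English",  # assumed to be a member name of the Languages enum
    cleaning_steps=["lower", "normalize_whitespace", "remove_punctuation"],
    lemmatizer_when="After preprocessing",  # maps to "last" via _lemmatization_options
    remove_stop=True,
)
clean = prep.fit_transform(pd.Series(["The cats ARE running!!!", "Dogs bark."]))

Note that _lemmatization_options is keyed by the UI-facing strings, so the default lemmatizer_when="last" falls through .get to None and lemmatization is silently skipped; callers must pass "Before preprocessing" or "After preprocessing" to lemmatize.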
src/utils.py
CHANGED
@@ -1,24 +1,12 @@
 import base64
-import re
-from collections import OrderedDict
-from typing import Callable, Dict, List
-
 import altair as alt
-import numpy as np
 import pandas as pd
-import spacy
 import streamlit as st
-from pandas.core.series import Series
 from PIL import Image
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import LabelEncoder
-from sklearn.utils import resample
 from stqdm import stqdm
-from textacy.preprocessing import make_pipeline, normalize, remove, replace
 
-from .configs import
-
+from .configs import SupportedFiles
+
 stqdm.pandas()
 
 
@@ -27,7 +15,7 @@ def get_logo(path):
     return Image.open(path)
 
 
-# @st.cache(suppress_st_warning=True)
+# @st.cache(suppress_st_warning=True)
 @st.cache(allow_output_mutation=True)
 def read_file(uploaded_file) -> pd.DataFrame:
 
@@ -51,258 +39,19 @@ def download_button(dataframe: pd.DataFrame, name: str):
     st.write(href, unsafe_allow_html=True)
 
 
-def encode(text: pd.Series, labels: pd.Series):
-    tfidf_vectorizer = TfidfVectorizer(
-        input="content",  # default: file already in memory
-        encoding="utf-8",  # default
-        decode_error="strict",  # default
-        strip_accents=None,  # do nothing
-        lowercase=False,  # do nothing
-        preprocessor=None,  # do nothing - default
-        tokenizer=None,  # default
-        stop_words=None,  # do nothing
-        analyzer="word",
-        ngram_range=(1, 3),  # maximum 3-ngrams
-        min_df=0.001,
-        max_df=0.75,
-        sublinear_tf=True,
-    )
-    label_encoder = LabelEncoder()
-
-    with st.spinner("Encoding text using TF-IDF and Encoding labels"):
-        X = tfidf_vectorizer.fit_transform(text.values)
-        y = label_encoder.fit_transform(labels.values)
-
-    return {
-        "X": X,
-        "y": y,
-        "X_names": np.array(tfidf_vectorizer.get_feature_names()),
-        "y_names": label_encoder.classes_,
-    }
-
-
-def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
-
-    n_instances, n_features = X.shape
-    n_classes = len(y_names)
-
-    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
-    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
-
-    sample_size = min(
-        # this is the maximum supported
-        configs.MAX_SELECTION.value,
-        # at minimum you want MIN_SELECTION but in general you want
-        # n_instances * sample_fraction
-        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
-        # however if previous one is bigger the the available instances take
-        # the number of available instances
-        n_instances,
-    )
-
-    # TODO: might want to try out something to subsample features at each iteration
-
-    # initialize coefficient matrices
-    pos_scores = np.zeros((n_classes, n_features), dtype=int)
-    neg_scores = np.zeros((n_classes, n_features), dtype=int)
-
-    with st.spinner("Wordifying!"):
-
-        for _ in stqdm(range(configs.NUM_ITERS.value)):
-
-            # run randomized regression
-            clf = LogisticRegression(
-                penalty="l1",
-                C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
-                solver="liblinear",
-                multi_class="auto",
-                max_iter=500,
-                class_weight="balanced",
-            )
-
-            # sample indices to subsample matrix
-            selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
-
-            # fit
-            try:
-                clf.fit(X[selection], y[selection])
-            except ValueError:
-                continue
-
-            # record coefficients
-            if n_classes == 2:
-                pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
-                neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
-                pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
-                neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
-            else:
-                pos_scores += clf.coef_ > 0
-                neg_scores += clf.coef_ < 0
-
-    # normalize
-    pos_scores = pos_scores / configs.NUM_ITERS.value
-    neg_scores = neg_scores / configs.NUM_ITERS.value
-
-    # get only active features
-    pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
-    neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
-
-    # prepare DataFrame
-    pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
-    neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
-
-    posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
-    negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
-
-    return posdf, negdf
-
-
-# more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
-# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
-_re_normalize_acronyms = re.compile("(?:[a-zA-Z]\.){2,}")
-def normalize_acronyms(t):
-    return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
-
-_re_non_word = re.compile("\W")
-def remove_non_word(t):
-    return _re_non_word.sub(" ", t)
-
-_re_space = re.compile(" {2,}")
-def normalize_useless_spaces(t):
-    return _re_space.sub(" ", t)
-
-
-_re_rep = re.compile(r"(\S)(\1{2,})")
-def normalize_repeating_chars(t):
-    def _replace_rep(m):
-        c, cc = m.groups()
-        return c
-
-    return _re_rep.sub(_replace_rep, t)
-
-
-_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
-def normalize_repeating_words(t):
-    def _replace_wrep(m):
-        c, cc, e = m.groups()
-        return c
-
-    return _re_wrep.sub(_replace_wrep, t)
-
-
-class TextPreprocessor:
-    def __init__(
-        self, language: str, cleaning_steps: List[str], lemmatizer_when: str = "last", remove_stop: bool = True
-    ) -> None:
-        # prepare lemmatizer
-        self.language = language
-        self.nlp = spacy.load(Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"])
-        self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
-        self.remove_stop = remove_stop
-        self._lemmatize = self._get_lemmatizer()
-
-        # prepare cleaning
-        self.cleaning_steps = [
-            self._cleaning_options()[step] for step in cleaning_steps if step in self._cleaning_options()
-        ]
-        self.cleaning_pipeline = make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
-
-    def _get_lemmatizer(self) -> Callable:
-        """Return the correct spacy Doc-level lemmatizer"""
-        if self.remove_stop:
-
-            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
-                """Lemmatizes spacy Doc and removes stopwords"""
-                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
-
-        else:
-
-            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
-                """Lemmatizes spacy Doc"""
-                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
-
-        return lemmatizer
-
-    @staticmethod
-    def _lemmatization_options() -> Dict[str, str]:
-        return {
-            "Before preprocessing": "first",
-            "After preprocessing": "last",
-            "Never! Let's do it quick and dirty": None,
-        }
-
-    def lemmatizer(self, series: pd.Series) -> pd.Series:
-        """
-        Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
-        """
-        res = []
-        pbar = stqdm(total=len(series))
-        for doc in self.nlp.pipe(series, batch_size=500):
-            res.append(self._lemmatize(doc))
-            pbar.update(1)
-        pbar.close()
-        return pd.Series(res)
-
-    @staticmethod
-    def _cleaning_options():
-        """Returns available cleaning steps in order"""
-        return OrderedDict(
-            [
-                ("lower", lambda x: x.lower()),
-                ("normalize_unicode", normalize.unicode),
-                ("normalize_bullet_points", normalize.bullet_points),
-                ("normalize_hyphenated_words", normalize.hyphenated_words),
-                ("normalize_quotation_marks", normalize.quotation_marks),
-                ("normalize_whitespace", normalize.whitespace),
-                ("replace_urls", replace.urls),
-                ("replace_currency_symbols", replace.currency_symbols),
-                ("replace_emails", replace.emails),
-                ("replace_emojis", replace.emojis),
-                ("replace_hashtags", replace.hashtags),
-                ("replace_numbers", replace.numbers),
-                ("replace_phone_numbers", replace.phone_numbers),
-                ("replace_user_handles", replace.user_handles),
-                ("normalize_acronyms", normalize_acronyms),
-                ("remove_accents", remove.accents),
-                ("remove_brackets", remove.brackets),
-                ("remove_html_tags", remove.html_tags),
-                ("remove_punctuation", remove.punctuation),
-                ("remove_non_words", remove_non_word),
-                ("normalize_useless_spaces", normalize_useless_spaces),
-                ("normalize_repeating_chars", normalize_repeating_chars),
-                ("normalize_repeating_words", normalize_repeating_words),
-                ("strip", lambda x: x.strip()),
-            ]
-        )
-
-    def fit_transform(self, series: pd.Series) -> Series:
-        """Applies text preprocessing"""
-
-        if self.lemmatizer_when == "first":
-            with st.spinner("Lemmatizing"):
-                series = self.lemmatizer(series)
-
-        with st.spinner("Cleaning"):
-            series = series.progress_map(self.cleaning_pipeline)
-
-        if self.lemmatizer_when == "last":
-            with st.spinner("Lemmatizing"):
-                series = self.lemmatizer(series)
-
-        return series
-
-
 def plot_labels_prop(data: pd.DataFrame, label_column: str):
 
     unique_value_limit = 100
-
+
     if data[label_column].nunique() > unique_value_limit:
 
-        st.warning(
-            f"""The column you selected has more than {unique_value_limit}.
+        st.warning(
+            f"""
+            The column you selected has more than {unique_value_limit} unique values.
             Are you sure it's the right column? If it is, please note that
             this will impact __Wordify__ performance.
-            """
+            """
+        )
 
         return
 
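What remains in src/utils.py is the app plumbing (logo, file reading, download link) plus the plotting helpers, which this commit duplicates into src/plotting.py rather than removing here. A minimal sketch of the cached loader in use; the upload-widget wiring is an assumption:

# Hypothetical caller of the cached read_file.
import streamlit as st
from src.utils import read_file

uploaded_file = st.file_uploader("Upload your data")
if uploaded_file is not None:
    # @st.cache(allow_output_mutation=True) reuses the parsed DataFrame across reruns
    df = read_file(uploaded_file)
    st.write(df.head())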
src/wordifier.py
ADDED
@@ -0,0 +1,87 @@
+from typing import List
+import numpy as np
+import pandas as pd
+import streamlit as st
+from sklearn.linear_model import LogisticRegression
+from sklearn.utils import resample
+from stqdm import stqdm
+
+from .configs import ModelConfigs
+
+stqdm.pandas()
+
+
+def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
+
+    n_instances, n_features = X.shape
+    n_classes = len(y_names)
+
+    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
+    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
+
+    sample_size = min(
+        # this is the maximum supported
+        configs.MAX_SELECTION.value,
+        # at minimum you want MIN_SELECTION but in general you want
+        # n_instances * sample_fraction
+        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
+        # however if the previous one is bigger than the available instances, take
+        # the number of available instances
+        n_instances,
+    )
+
+    # TODO: might want to try out something to subsample features at each iteration
+
+    # initialize coefficient matrices
+    pos_scores = np.zeros((n_classes, n_features), dtype=int)
+    neg_scores = np.zeros((n_classes, n_features), dtype=int)
+
+    with st.spinner("Wordifying!"):
+
+        for _ in stqdm(range(configs.NUM_ITERS.value)):
+
+            # run randomized regression
+            clf = LogisticRegression(
+                penalty="l1",
+                C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
+                solver="liblinear",
+                multi_class="auto",
+                max_iter=500,
+                class_weight="balanced",
+            )
+
+            # sample indices to subsample matrix
+            selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
+
+            # fit
+            try:
+                clf.fit(X[selection], y[selection])
+            except ValueError:
+                continue
+
+            # record coefficients
+            if n_classes == 2:
+                pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
+                neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
+                pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
+                neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
+            else:
+                pos_scores += clf.coef_ > 0
+                neg_scores += clf.coef_ < 0
+
+    # normalize
+    pos_scores = pos_scores / configs.NUM_ITERS.value
+    neg_scores = neg_scores / configs.NUM_ITERS.value
+
+    # get only active features
+    pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
+    neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
+
+    # prepare DataFrame
+    pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
+    neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
+
+    posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
+    negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
+
+    return posdf, negdf
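End to end, the pieces added in this commit compose: encode builds the TF-IDF matrix and label array, and wordifier bootstraps L1-regularized logistic regressions over them, counting how often each n-gram earns a positive or negative coefficient per class. A minimal sketch; the toy DataFrame is an assumption and far smaller than a realistic input:

# Hypothetical end-to-end run of the refactored pipeline.
import pandas as pd
from src.preprocessing import encode
from src.wordifier import wordifier

df = pd.DataFrame(
    {"text": ["great plot", "terrible acting", "great acting"], "label": ["pos", "neg", "pos"]}
)
enc = encode(df["text"], df["label"])
posdf, negdf = wordifier(enc["X"], enc["y"], enc["X_names"], enc["y_names"])
# posdf/negdf list (word, score, label) rows for n-grams whose selection
# frequency across the bootstrap iterations clears SELECTION_THRESHOLD.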