Spaces:
Runtime error
Runtime error
Commit
·
14574d7
1
Parent(s):
6303415
visu with discarded documents by filter
Browse files
app.py
CHANGED
|
@@ -66,7 +66,7 @@ class Visualization:
|
|
| 66 |
def set_sliders(docs):
|
| 67 |
columns = list(docs)
|
| 68 |
keys = []
|
| 69 |
-
conds =
|
| 70 |
|
| 71 |
def get_cond(key, cutoff, max_cutoff):
|
| 72 |
if max_cutoff:
|
|
@@ -87,9 +87,8 @@ class Visualization:
|
|
| 87 |
)
|
| 88 |
new_key = ("number_words", cutoff_min_number_words, False)
|
| 89 |
keys.append(new_key)
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
print_discared_by_cond(cond)
|
| 93 |
|
| 94 |
cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
|
| 95 |
cutoff_max_number_words = st.sidebar.slider(
|
|
@@ -97,9 +96,10 @@ class Visualization:
|
|
| 97 |
)
|
| 98 |
new_key = ("number_words", cutoff_max_number_words, True)
|
| 99 |
keys.append(new_key)
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
| 103 |
|
| 104 |
if "special_characters_ratio" in columns:
|
| 105 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
|
@@ -113,8 +113,8 @@ class Visualization:
|
|
| 113 |
)
|
| 114 |
keys.append(new_key)
|
| 115 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
| 116 |
-
conds.append(cond)
|
| 117 |
print_discared_by_cond(cond)
|
|
|
|
| 118 |
|
| 119 |
if "stopwords_ratio" in columns:
|
| 120 |
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
|
|
@@ -124,8 +124,8 @@ class Visualization:
|
|
| 124 |
new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
|
| 125 |
keys.append(new_key)
|
| 126 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
| 127 |
-
conds.append(cond)
|
| 128 |
print_discared_by_cond(cond)
|
|
|
|
| 129 |
|
| 130 |
if "badwords_ratio" in columns:
|
| 131 |
cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
|
|
@@ -135,8 +135,8 @@ class Visualization:
|
|
| 135 |
new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
|
| 136 |
keys.append(new_key)
|
| 137 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
| 138 |
-
conds.append(cond)
|
| 139 |
print_discared_by_cond(cond)
|
|
|
|
| 140 |
|
| 141 |
if "lang_id_score" in columns:
|
| 142 |
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
|
|
@@ -146,8 +146,8 @@ class Visualization:
|
|
| 146 |
new_key = ("lang_id_score", cutoff_lang_id_score, False)
|
| 147 |
keys.append(new_key)
|
| 148 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
| 149 |
-
conds.append(cond)
|
| 150 |
print_discared_by_cond(cond)
|
|
|
|
| 151 |
|
| 152 |
if "perplexity_score" in columns:
|
| 153 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
|
@@ -158,34 +158,61 @@ class Visualization:
|
|
| 158 |
new_key = ("perplexity_score", cutoff_perplexity_score, True)
|
| 159 |
keys.append(new_key)
|
| 160 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
| 161 |
-
conds.append(cond)
|
| 162 |
print_discared_by_cond(cond)
|
|
|
|
| 163 |
|
| 164 |
return keys, conds
|
| 165 |
|
| 166 |
self.keys, conds = set_sliders(self.docs)
|
| 167 |
|
| 168 |
-
|
|
|
|
| 169 |
|
| 170 |
st.header("Filtering on documents")
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
|
| 182 |
-
st.
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
def filtering_of_words(self):
|
| 191 |
st.sidebar.subheader("Parameter of the filtering on words")
|
|
|
|
| 66 |
def set_sliders(docs):
|
| 67 |
columns = list(docs)
|
| 68 |
keys = []
|
| 69 |
+
conds = {}
|
| 70 |
|
| 71 |
def get_cond(key, cutoff, max_cutoff):
|
| 72 |
if max_cutoff:
|
|
|
|
| 87 |
)
|
| 88 |
new_key = ("number_words", cutoff_min_number_words, False)
|
| 89 |
keys.append(new_key)
|
| 90 |
+
cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
|
| 91 |
+
print_discared_by_cond(cond_1)
|
|
|
|
| 92 |
|
| 93 |
cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
|
| 94 |
cutoff_max_number_words = st.sidebar.slider(
|
|
|
|
| 96 |
)
|
| 97 |
new_key = ("number_words", cutoff_max_number_words, True)
|
| 98 |
keys.append(new_key)
|
| 99 |
+
cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
|
| 100 |
+
print_discared_by_cond(cond_2)
|
| 101 |
+
|
| 102 |
+
conds["number_words"] = [cond_1, cond_2]
|
| 103 |
|
| 104 |
if "special_characters_ratio" in columns:
|
| 105 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
|
|
|
| 113 |
)
|
| 114 |
keys.append(new_key)
|
| 115 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
|
| 116 |
print_discared_by_cond(cond)
|
| 117 |
+
conds["special_characters_ratio"] = [cond]
|
| 118 |
|
| 119 |
if "stopwords_ratio" in columns:
|
| 120 |
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
|
|
|
|
| 124 |
new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
|
| 125 |
keys.append(new_key)
|
| 126 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
|
| 127 |
print_discared_by_cond(cond)
|
| 128 |
+
conds["stopwords_ratio"] = [cond]
|
| 129 |
|
| 130 |
if "badwords_ratio" in columns:
|
| 131 |
cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
|
|
|
|
| 135 |
new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
|
| 136 |
keys.append(new_key)
|
| 137 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
|
| 138 |
print_discared_by_cond(cond)
|
| 139 |
+
conds["badwords_ratio"] = [cond]
|
| 140 |
|
| 141 |
if "lang_id_score" in columns:
|
| 142 |
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
|
|
|
|
| 146 |
new_key = ("lang_id_score", cutoff_lang_id_score, False)
|
| 147 |
keys.append(new_key)
|
| 148 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
|
| 149 |
print_discared_by_cond(cond)
|
| 150 |
+
conds["lang_id_score"] = [cond]
|
| 151 |
|
| 152 |
if "perplexity_score" in columns:
|
| 153 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
|
|
|
| 158 |
new_key = ("perplexity_score", cutoff_perplexity_score, True)
|
| 159 |
keys.append(new_key)
|
| 160 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
|
| 161 |
print_discared_by_cond(cond)
|
| 162 |
+
conds["perplexity_score"] = [cond]
|
| 163 |
|
| 164 |
return keys, conds
|
| 165 |
|
| 166 |
self.keys, conds = set_sliders(self.docs)
|
| 167 |
|
| 168 |
+
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
|
| 169 |
+
all_conds = np.all(all_conds, axis=0)
|
| 170 |
|
| 171 |
st.header("Filtering on documents")
|
| 172 |
|
| 173 |
+
def display_dataset(cond, description):
|
| 174 |
+
displayed_docs = self.docs.loc[cond]
|
| 175 |
+
st.subheader(
|
| 176 |
+
f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
|
| 177 |
+
)
|
| 178 |
+
st.markdown(
|
| 179 |
+
"Click on a column to sort by it, place the cursor on the text to display it."
|
| 180 |
+
)
|
| 181 |
+
st.dataframe(displayed_docs)
|
| 182 |
+
|
| 183 |
+
display_dataset(np.invert(all_conds), "Discarded documents")
|
| 184 |
|
| 185 |
+
#st.subheader("Display discarded documents by filter")
|
| 186 |
+
display_discarded_documents_by_filter = st.checkbox("Display discarded documents by filter")
|
| 187 |
+
|
| 188 |
+
if display_discarded_documents_by_filter:
|
| 189 |
+
columns = list(self.docs)
|
| 190 |
+
|
| 191 |
+
if "number_words" in columns:
|
| 192 |
+
cond_filter = np.invert(np.all(conds["number_words"], axis=0))
|
| 193 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the number of words")
|
| 194 |
+
|
| 195 |
+
if "special_characters_ratio" in columns:
|
| 196 |
+
cond_filter = np.invert(np.all(conds["special_characters_ratio"], axis=0))
|
| 197 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the special characters ratio")
|
| 198 |
+
|
| 199 |
+
if "stopwords_ratio" in columns:
|
| 200 |
+
cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
|
| 201 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the stop words ratio")
|
| 202 |
+
|
| 203 |
+
if "badwords_ratio" in columns:
|
| 204 |
+
cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
|
| 205 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the bad words ratio")
|
| 206 |
+
|
| 207 |
+
if "lang_id_score" in columns:
|
| 208 |
+
cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
|
| 209 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the language identification confidence score")
|
| 210 |
+
|
| 211 |
+
if "perplexity_score" in columns:
|
| 212 |
+
cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
|
| 213 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the perplexity score")
|
| 214 |
+
|
| 215 |
+
display_dataset(all_conds, "Retained documents")
|
| 216 |
|
| 217 |
def filtering_of_words(self):
|
| 218 |
st.sidebar.subheader("Parameter of the filtering on words")
|