Spaces:
Runtime error
Runtime error
better description, flagged words
Browse files
- app.py +20 -20
- en_examples_with_stats_ldnoob.json +2 -2
app.py
CHANGED
|
@@ -30,51 +30,51 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
| 30 |
|
| 31 |
if "special_%" in columns:
|
| 32 |
special_ratio = st.sidebar.slider(
|
| 33 |
-
"% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1
|
| 34 |
)
|
| 35 |
cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
|
| 36 |
special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
|
| 37 |
-
st.sidebar.text(f"
|
| 38 |
keys.append(("special_%", special_cutoff, True))
|
| 39 |
|
| 40 |
if "stop_%" in columns:
|
| 41 |
stop_ratio = st.sidebar.slider(
|
| 42 |
-
"% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1
|
| 43 |
)
|
| 44 |
cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
|
| 45 |
stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
|
| 46 |
-
st.sidebar.text(f"
|
| 47 |
keys.append(("stop_%", stop_cutoff, False))
|
| 48 |
|
| 49 |
@st.cache(suppress_st_warning=True)
|
| 50 |
-
def
|
| 51 |
|
| 52 |
-
def
|
| 53 |
-
return len([word for word in text.split() if word.lower().strip() in
|
| 54 |
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
-
data["
|
| 59 |
|
| 60 |
-
|
| 61 |
|
| 62 |
-
if "
|
| 63 |
-
|
| 64 |
-
"% filtered by
|
| 65 |
)
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
st.sidebar.text(f"
|
| 69 |
-
keys.append(("
|
| 70 |
|
| 71 |
if "perplexity" in columns:
|
| 72 |
ppl_ratio = st.sidebar.slider(
|
| 73 |
-
"% filtered by perplexity", 0.0, 50.0, 0.0, step=1
|
| 74 |
)
|
| 75 |
ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
|
| 76 |
ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
|
| 77 |
-
st.sidebar.text(f"
|
| 78 |
keys.append(("perplexity", ppl_cutoff, True))
|
| 79 |
|
| 80 |
cond = [
|
|
|
|
| 30 |
|
| 31 |
if "special_%" in columns:
|
| 32 |
special_ratio = st.sidebar.slider(
|
| 33 |
+
"% filtered by special characters ratio", 0.0, 50.0, 0.0, step=0.1
|
| 34 |
)
|
| 35 |
cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
|
| 36 |
special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
|
| 37 |
+
st.sidebar.text(f"No docs with <{special_cutoff:.1f}% special chars")
|
| 38 |
keys.append(("special_%", special_cutoff, True))
|
| 39 |
|
| 40 |
if "stop_%" in columns:
|
| 41 |
stop_ratio = st.sidebar.slider(
|
| 42 |
+
"% filtered by stop word ratio", 0.0, 50.0, 0.0, step=0.1
|
| 43 |
)
|
| 44 |
cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
|
| 45 |
stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
|
| 46 |
+
st.sidebar.text(f"No docs with >{stop_cutoff:.2f}% stop words")
|
| 47 |
keys.append(("stop_%", stop_cutoff, False))
|
| 48 |
|
| 49 |
@st.cache(suppress_st_warning=True)
|
| 50 |
+
def recalculate_flagged_words(file):
|
| 51 |
|
| 52 |
+
def flagged_word_ratio(text: str, flagged_word_list):
    """Return the fraction of words in ``text`` that appear in ``flagged_word_list``.

    ``text`` is split on whitespace; each word is lower-cased and stripped
    before the membership test, so only lower-case entries of
    ``flagged_word_list`` can match.

    Args:
        text: Document text to inspect.
        flagged_word_list: Collection of flagged words (anything supporting
            ``in``, e.g. a list or a set).

    Returns:
        Number of flagged words divided by the total number of words.
        Returns 0.0 when ``text`` contains no words (empty or whitespace-only),
        rather than raising ``ZeroDivisionError``.
    """
    # Split once and reuse: the word list is needed for both the count and
    # the denominator.
    words = text.split()
    if not words:
        # An empty document has no flagged words; avoid dividing by zero.
        return 0.0
    flagged_count = sum(
        1 for word in words if word.lower().strip() in flagged_word_list
    )
    return flagged_count / len(words)
|
| 54 |
|
| 55 |
+
flagged_word_list = [word.decode().strip() for word in file.readlines()]
|
| 56 |
|
| 57 |
+
flagged_word_ratios = [flagged_word_ratio(text, flagged_word_list) * 100 for text in data["text"]]
|
| 58 |
+
data["flagged_%"] = flagged_word_ratios
|
| 59 |
|
| 60 |
+
flagged_word_file = st.sidebar.file_uploader("Upload your own list of flagged words (1 word per line)")
|
| 61 |
|
| 62 |
+
if "flagged_%" in columns:
|
| 63 |
+
flagged_ratio = st.sidebar.slider(
|
| 64 |
+
"% filtered by flaggedwords ratio", 0.0, 50.0, 0.0, step=0.1
|
| 65 |
)
|
| 66 |
+
flagged_index = max(0, math.floor((100 - flagged_ratio) * len(data.index) / 100) - 1)
|
| 67 |
+
flagged_cutoff = np.partition(data["flagged_%"], flagged_index)[flagged_index]
|
| 68 |
+
st.sidebar.text(f"No docs with >{flagged_cutoff:.2f}% flagged words")
|
| 69 |
+
keys.append(("flagged_%", flagged_cutoff, True))
|
| 70 |
|
| 71 |
if "perplexity" in columns:
|
| 72 |
ppl_ratio = st.sidebar.slider(
|
| 73 |
+
"% filtered by perplexity", 0.0, 50.0, 0.0, step=0.1
|
| 74 |
)
|
| 75 |
ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
|
| 76 |
ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
|
| 77 |
+
st.sidebar.text(f"No docs with >{ppl_cutoff:.0f} perplexity")
|
| 78 |
keys.append(("perplexity", ppl_cutoff, True))
|
| 79 |
|
| 80 |
cond = [
|
en_examples_with_stats_ldnoob.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e4e2a111df4e1a3243d53c9516baf8a3f495f8faec5b86fe8787bc6dc2a03bc
|
| 3 |
+
size 21206447
|