Spaces:
Running
on
Zero
Running
on
Zero
Sean-Case
committed on
Commit
·
0a543a0
1
Parent(s):
381f959
Should now parse custom regex correctly. Will now wipe previously created embeddings if the 'low resource mode' option is switched.
Browse files
- app.py +7 -5
- funcs/clean_funcs.py +4 -11
- funcs/helper_functions.py +2 -2
- funcs/topic_core_funcs.py +18 -7
app.py
CHANGED
|
@@ -18,6 +18,7 @@ with block:
|
|
| 18 |
|
| 19 |
data_state = gr.State(pd.DataFrame())
|
| 20 |
embeddings_state = gr.State(np.array([]))
|
|
|
|
| 21 |
topic_model_state = gr.State()
|
| 22 |
custom_regex_state = gr.State(pd.DataFrame())
|
| 23 |
docs_state = gr.State()
|
|
@@ -43,12 +44,13 @@ with block:
|
|
| 43 |
|
| 44 |
with gr.Accordion("Clean data", open = False):
|
| 45 |
with gr.Row():
|
| 46 |
-
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK).")
|
| 47 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
|
| 48 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
|
| 49 |
with gr.Row():
|
| 50 |
-
gr.
|
| 51 |
-
|
|
|
|
| 52 |
clean_btn = gr.Button("Clean data")
|
| 53 |
|
| 54 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
|
@@ -105,11 +107,11 @@ with block:
|
|
| 105 |
in_colnames.change(dummy_function, in_colnames, None)
|
| 106 |
|
| 107 |
# Clean data
|
| 108 |
-
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_state])
|
| 109 |
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
|
| 110 |
|
| 111 |
# Extract topics
|
| 112 |
-
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
|
| 113 |
|
| 114 |
# Reduce outliers
|
| 115 |
reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
|
|
|
|
| 18 |
|
| 19 |
data_state = gr.State(pd.DataFrame())
|
| 20 |
embeddings_state = gr.State(np.array([]))
|
| 21 |
+
embeddings_type_state = gr.State("")
|
| 22 |
topic_model_state = gr.State()
|
| 23 |
custom_regex_state = gr.State(pd.DataFrame())
|
| 24 |
docs_state = gr.State()
|
|
|
|
| 44 |
|
| 45 |
with gr.Accordion("Clean data", open = False):
|
| 46 |
with gr.Row():
|
| 47 |
+
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK), custom regex.")
|
| 48 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
|
| 49 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
|
| 50 |
with gr.Row():
|
| 51 |
+
custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
|
| 52 |
+
gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
|
| 53 |
+
custom_regex_text = gr.Textbox(label="Custom regex load status")
|
| 54 |
clean_btn = gr.Button("Clean data")
|
| 55 |
|
| 56 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
|
|
|
| 107 |
in_colnames.change(dummy_function, in_colnames, None)
|
| 108 |
|
| 109 |
# Clean data
|
| 110 |
+
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
|
| 111 |
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
|
| 112 |
|
| 113 |
# Extract topics
|
| 114 |
+
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
|
| 115 |
|
| 116 |
# Reduce outliers
|
| 117 |
reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
|
funcs/clean_funcs.py
CHANGED
|
@@ -42,17 +42,10 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
|
| 42 |
# Allow for custom regex patterns to be removed
|
| 43 |
if len(custom_regex) > 0:
|
| 44 |
for pattern in custom_regex:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
#text = text.str.replace_all(r'(?i)2nd floor civic centre', '')
|
| 50 |
-
#text = text.str.replace_all(r'(?i)6 brixton hill', '')
|
| 51 |
-
#text = text.str.replace_all(r'(?i)\bsocial care\b', '')
|
| 52 |
-
#text = text.str.replace_all(r'(?i)\basc\b', '')
|
| 53 |
-
#text = text.str.replace_all(r'(?i)\bcsc\b', '')
|
| 54 |
-
#text = text.str.replace_all(r'(?i)\blambeth\b', '')
|
| 55 |
-
|
| 56 |
text = text.to_list()
|
| 57 |
|
| 58 |
return text
|
|
|
|
| 42 |
# Allow for custom regex patterns to be removed
|
| 43 |
if len(custom_regex) > 0:
|
| 44 |
for pattern in custom_regex:
|
| 45 |
+
raw_string_pattern = r'{}'.format(pattern)
|
| 46 |
+
print("Removing regex pattern: ", raw_string_pattern)
|
| 47 |
+
text = text.str.replace_all(raw_string_pattern, '')
|
| 48 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
text = text.to_list()
|
| 50 |
|
| 51 |
return text
|
funcs/helper_functions.py
CHANGED
|
@@ -153,9 +153,9 @@ def custom_regex_load(in_file):
|
|
| 153 |
error = "No regex file provided."
|
| 154 |
print(error)
|
| 155 |
output_text = error
|
| 156 |
-
return custom_regex
|
| 157 |
|
| 158 |
-
return custom_regex
|
| 159 |
|
| 160 |
|
| 161 |
|
|
|
|
| 153 |
error = "No regex file provided."
|
| 154 |
print(error)
|
| 155 |
output_text = error
|
| 156 |
+
return error, custom_regex
|
| 157 |
|
| 158 |
+
return output_text, custom_regex
|
| 159 |
|
| 160 |
|
| 161 |
|
funcs/topic_core_funcs.py
CHANGED
|
@@ -126,7 +126,7 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text
|
|
| 126 |
|
| 127 |
return output_text, output_list, data, data_file_name_no_ext
|
| 128 |
|
| 129 |
-
def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
|
| 130 |
|
| 131 |
all_tic = time.perf_counter()
|
| 132 |
|
|
@@ -161,7 +161,13 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
| 161 |
if low_resource_mode == "No":
|
| 162 |
print("Using high resource BGE transformer model")
|
| 163 |
|
| 164 |
-
embedding_model = SentenceTransformer(embeddings_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
# UMAP model uses Bertopic defaults
|
| 167 |
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
|
|
@@ -169,11 +175,16 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
| 169 |
else:
|
| 170 |
print("Choosing low resource TF-IDF model.")
|
| 171 |
|
| 172 |
-
|
| 173 |
TfidfVectorizer(),
|
| 174 |
TruncatedSVD(100, random_state=random_seed)
|
| 175 |
)
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
| 179 |
|
|
@@ -246,7 +257,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
| 246 |
except:
|
| 247 |
print(fail_error_message)
|
| 248 |
|
| 249 |
-
return fail_error_message, output_list, embeddings_out, data_file_name_no_ext, None, docs, vectoriser_model
|
| 250 |
|
| 251 |
# For some reason, zero topic modelling exports assigned topics as a np.array instead of a list. Converting it back here.
|
| 252 |
if isinstance(assigned_topics, np.ndarray):
|
|
@@ -268,7 +279,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
| 268 |
|
| 269 |
if not assigned_topics:
|
| 270 |
# Handle the empty array case
|
| 271 |
-
return "No topics found.", output_list, embeddings_out, data_file_name_no_ext, topic_model, docs
|
| 272 |
|
| 273 |
else:
|
| 274 |
print("Topic model created.")
|
|
@@ -304,7 +315,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
| 304 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
|
| 305 |
print(time_out)
|
| 306 |
|
| 307 |
-
return output_text, output_list, embeddings_out, data_file_name_no_ext, topic_model, docs, vectoriser_model
|
| 308 |
|
| 309 |
def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, save_topic_model, progress=gr.Progress(track_tqdm=True)):
|
| 310 |
|
|
|
|
| 126 |
|
| 127 |
return output_text, output_list, data, data_file_name_no_ext
|
| 128 |
|
| 129 |
+
def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, embeddings_type_state, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
|
| 130 |
|
| 131 |
all_tic = time.perf_counter()
|
| 132 |
|
|
|
|
| 161 |
if low_resource_mode == "No":
|
| 162 |
print("Using high resource BGE transformer model")
|
| 163 |
|
| 164 |
+
embedding_model = SentenceTransformer(embeddings_name)
|
| 165 |
+
|
| 166 |
+
# If tfidf embeddings currently exist, wipe these empty
|
| 167 |
+
if embeddings_type_state == "tfidf":
|
| 168 |
+
embeddings_out = np.array([])
|
| 169 |
+
|
| 170 |
+
embeddings_type_state = "bge"
|
| 171 |
|
| 172 |
# UMAP model uses Bertopic defaults
|
| 173 |
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
|
|
|
|
| 175 |
else:
|
| 176 |
print("Choosing low resource TF-IDF model.")
|
| 177 |
|
| 178 |
+
embedding_model = make_pipeline(
|
| 179 |
TfidfVectorizer(),
|
| 180 |
TruncatedSVD(100, random_state=random_seed)
|
| 181 |
)
|
| 182 |
+
|
| 183 |
+
# If bge embeddings currently exist, wipe these empty, then rename embeddings type
|
| 184 |
+
if embeddings_type_state == "bge":
|
| 185 |
+
embeddings_out = np.array([])
|
| 186 |
+
|
| 187 |
+
embeddings_type_state = "tfidf"
|
| 188 |
|
| 189 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
| 190 |
|
|
|
|
| 257 |
except:
|
| 258 |
print(fail_error_message)
|
| 259 |
|
| 260 |
+
return fail_error_message, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, None, docs, vectoriser_model
|
| 261 |
|
| 262 |
# For some reason, zero topic modelling exports assigned topics as a np.array instead of a list. Converting it back here.
|
| 263 |
if isinstance(assigned_topics, np.ndarray):
|
|
|
|
| 279 |
|
| 280 |
if not assigned_topics:
|
| 281 |
# Handle the empty array case
|
| 282 |
+
return "No topics found.", output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs
|
| 283 |
|
| 284 |
else:
|
| 285 |
print("Topic model created.")
|
|
|
|
| 315 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
|
| 316 |
print(time_out)
|
| 317 |
|
| 318 |
+
return output_text, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs, vectoriser_model
|
| 319 |
|
| 320 |
def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, save_topic_model, progress=gr.Progress(track_tqdm=True)):
|
| 321 |
|