Added option to reduce outliers based on closest topic
- app.py +59 -23
- funcs/anonymiser.py +17 -10
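The new reduce_outliers option reassigns documents that BERTopic placed in the outlier topic (-1) to their closest existing topic. A minimal standalone sketch of the pattern the commit applies (illustration only, not the app's code; example_docs is a placeholder corpus):

# Minimal sketch of BERTopic outlier reduction; assumes bertopic is installed.
from bertopic import BERTopic

example_docs = ["shipping was slow", "great product quality",
                "refund took weeks", "box arrived damaged"] * 10

topic_model = BERTopic(min_topic_size=5, verbose=True)
topics, probs = topic_model.fit_transform(example_docs)

# Reassign each outlier (-1) document to the closest topic. The commit uses
# strategy="embeddings" (cosine similarity of document embeddings), although
# its inline comment describes the c-TF-IDF variant.
new_topics = topic_model.reduce_outliers(example_docs, topics, strategy="embeddings")

# Refresh topic representations so they account for the reassigned documents.
topic_model.update_topics(example_docs, topics=new_topics)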
app.py
CHANGED
@@ -80,7 +80,7 @@ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
 hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
 
 
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics):
+def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers):
 
     all_tic = time.perf_counter()
 
@@ -99,12 +99,17 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     in_label_list_first = in_colnames_list_first
 
     if anonymise_drop == "Yes":
+        anon_tic = time.perf_counter()
+        time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
         in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
         in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
         anonymise_data_name = "anonymised_data.csv"
         in_files.to_csv(anonymise_data_name)
         output_list.append(anonymise_data_name)
 
+        anon_toc = time.perf_counter()
+        time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
+
     docs = list(in_files[in_colnames_list_first].str.lower())
     label_col = in_files[in_label_list_first]
 
@@ -115,7 +120,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     print("Low resource mode: ", low_resource_mode)
 
    if low_resource_mode == "No":
-        print("
+        print("Using high resource Jina transformer model")
        try:
            embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
        except:
@@ -125,7 +130,8 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
        embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
-
+        # UMAP model uses Bertopic defaults
+        umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
 
    elif low_resource_mode == "Yes":
        print("Choosing low resource TF-IDF model.")
@@ -140,8 +146,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
 
-
-    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.05, max_df=0.9)
+    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
    from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
@@ -152,13 +157,22 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
    if not candidate_topics:
 
-
-
-
-
-
-
-
+        # Generate representation model here if topics won't be changed later
+        if reduce_outliers == "No":
+            topic_model = BERTopic( embedding_model=embedding_model_pipe,
+                                    vectorizer_model=vectoriser_model,
+                                    umap_model=umap_model,
+                                    min_topic_size = min_docs_slider,
+                                    nr_topics = max_topics_slider,
+                                    representation_model=representation_model,
+                                    verbose = True)
+        else:
+            topic_model = BERTopic( embedding_model=embedding_model_pipe,
+                                    vectorizer_model=vectoriser_model,
+                                    umap_model=umap_model,
+                                    min_topic_size = min_docs_slider,
+                                    nr_topics = max_topics_slider,
+                                    verbose = True)
 
        topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
@@ -174,15 +188,26 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
        zero_shot_topics = read_file(candidate_topics.name)
        zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
 
-
-
-
-
-
-
-
-
-
+        # Generate representation model here if topics won't be changed later
+        if reduce_outliers == "No":
+            topic_model = BERTopic( embedding_model=embedding_model_pipe,
+                                    vectorizer_model=vectoriser_model,
+                                    umap_model=umap_model,
+                                    min_topic_size = min_docs_slider,
+                                    nr_topics = max_topics_slider,
+                                    zeroshot_topic_list = zero_shot_topics_lower,
+                                    zeroshot_min_similarity = 0.5,#0.7,
+                                    representation_model=representation_model,
+                                    verbose = True)
+        else:
+            topic_model = BERTopic( embedding_model=embedding_model_pipe,
+                                    vectorizer_model=vectoriser_model,
+                                    umap_model=umap_model,
+                                    min_topic_size = min_docs_slider,
+                                    nr_topics = max_topics_slider,
+                                    zeroshot_topic_list = zero_shot_topics_lower,
+                                    zeroshot_min_similarity = 0.5,#0.7,
+                                    verbose = True)
 
        topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
@@ -192,6 +217,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
    else:
        print("Preparing topic model outputs.")
 
+    # Reduce outliers if required
+    if reduce_outliers == "Yes":
+        print("Reducing outliers.")
+        # Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.
+        topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
+        # Then, update the topics to the ones that considered the new data
+        topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
+        print("Finished reducing outliers.")
+
    topic_dets = topic_model.get_topic_info()
    #print(topic_dets.columns)
 
@@ -299,7 +333,7 @@ with block:
 
    with gr.Row():
        min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
-        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value =
+        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 10, step = 1, label = "Maximum number of topics")
 
    with gr.Row():
        topics_btn = gr.Button("Extract topics")
@@ -319,6 +353,8 @@ with block:
    with gr.Row():
        low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
        create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
+        reduce_outliers = gr.Dropdown(label = "Reduce outliers by selecting closest topic.", value="No", choices=["Yes", "No"])
+    with gr.Row():
        save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
        visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
 
@@ -326,7 +362,7 @@ with block:
    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
    in_colnames.change(dummy_function, in_colnames, None)
 
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics], outputs=[output_single_text, output_file, plot], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers], outputs=[output_single_text, output_file, plot], api_name="topics")
 
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
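For the zero-shot path above, zeroshot_topic_list seeds BERTopic with candidate topic names: documents whose embedding similarity to a candidate exceeds zeroshot_min_similarity are assigned to that topic, and the remainder are clustered as usual (the commit sets the threshold to 0.5, leaving the earlier 0.7 in a comment). A minimal sketch of that mechanism, with illustrative candidate names and a placeholder corpus:

# Minimal zero-shot BERTopic sketch; names and documents are placeholders.
from bertopic import BERTopic

example_docs = ["parcel never arrived", "the fabric feels cheap",
                "still waiting on my refund"] * 15
candidate_topics = ["delivery problems", "product quality", "refunds"]

topic_model = BERTopic(
    zeroshot_topic_list=candidate_topics,
    zeroshot_min_similarity=0.5,  # matches the commit's chosen threshold
    min_topic_size=5,
    verbose=True,
)
topics, probs = topic_model.fit_transform(example_docs)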
funcs/anonymiser.py
CHANGED
@@ -1,26 +1,33 @@
+from spacy.cli import download
 import spacy
+spacy.prefer_gpu()
 import os
 
-def is_model_installed(model_name):
+def spacy_model_installed(model_name):
    try:
-
+        import en_core_web_sm
+        en_core_web_sm.load()
+        print("Successfully imported spaCy model")
+        #nlp = spacy.load("en_core_web_sm")
+        #print(nlp._path)
+    except:
+        download(model_name)
        spacy.load(model_name)
-
-
-        return False
+        print("Successfully imported spaCy model")
+        #print(nlp._path)
 
 
-model_name = "en_core_web_sm"
-if not is_model_installed(model_name):
-    os.system(f"python -m spacy download {model_name}")
+#if not is_model_installed(model_name):
+#    os.system(f"python -m spacy download {model_name}")
+model_name = "en_core_web_sm"
+spacy_model_installed(model_name)
 
-spacy.load(model_name)
-
+spacy.load(model_name)
 # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
 #os.system("pip uninstall -y gradio")
 #os.system("pip install gradio==3.50.0")
 #os.system("python -m spacy download en_core_web_lg")
 
 import re
 import secrets
 import base64
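The anonymiser change replaces the shell-based model download with an in-process check: try importing the model package, and fall back to spacy.cli.download if that fails. An equivalent pattern using spacy.util.is_package instead of the try/except import (an alternative sketch, not what the commit uses):

# Alternative sketch of the ensure-model-installed pattern; the commit
# itself wraps "import en_core_web_sm" in a try/except instead.
import spacy
from spacy.cli import download
from spacy.util import is_package

model_name = "en_core_web_sm"
if not is_package(model_name):   # True only if the model package is installed
    download(model_name)         # downloads and installs the model package
nlp = spacy.load(model_name)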