Spaces:
Running
on
Zero
Running
on
Zero
The app now checks whether embeddings are already loaded before topic modelling, and will save them only once.
Browse files
- app.py +13 -10
- funcs/embeddings.py +41 -41
- funcs/helper_functions.py +4 -2
app.py
CHANGED
|
@@ -80,8 +80,8 @@ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
|
|
| 80 |
hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
|
| 81 |
|
| 82 |
|
| 83 |
-
def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers):
|
| 84 |
-
|
| 85 |
all_tic = time.perf_counter()
|
| 86 |
|
| 87 |
output_list = []
|
|
@@ -144,7 +144,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
| 144 |
|
| 145 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
| 146 |
|
| 147 |
-
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
| 148 |
|
| 149 |
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
|
| 150 |
|
|
@@ -272,12 +272,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
| 272 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
| 273 |
output_list.append(topic_model_save_name_zip)
|
| 274 |
|
|
|
|
| 275 |
if return_intermediate_files == "Yes":
|
| 276 |
print("Saving embeddings to file")
|
| 277 |
if low_resource_mode == "Yes":
|
| 278 |
embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
| 279 |
else:
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
| 283 |
|
|
@@ -297,15 +301,13 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
| 297 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
| 298 |
print(time_out)
|
| 299 |
|
| 300 |
-
return output_text, output_list, topics_vis
|
| 301 |
|
| 302 |
all_toc = time.perf_counter()
|
| 303 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
| 304 |
print(time_out)
|
| 305 |
|
| 306 |
-
return output_text, output_list, None
|
| 307 |
-
|
| 308 |
-
# , topic_model_save_name
|
| 309 |
|
| 310 |
# ## Gradio app - extract topics
|
| 311 |
|
|
@@ -314,6 +316,7 @@ block = gr.Blocks(theme = gr.themes.Base())
|
|
| 314 |
with block:
|
| 315 |
|
| 316 |
data_state = gr.State(pd.DataFrame())
|
|
|
|
| 317 |
|
| 318 |
gr.Markdown(
|
| 319 |
"""
|
|
@@ -359,10 +362,10 @@ with block:
|
|
| 359 |
visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
|
| 360 |
|
| 361 |
# Update column names dropdown when file uploaded
|
| 362 |
-
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
|
| 363 |
in_colnames.change(dummy_function, in_colnames, None)
|
| 364 |
|
| 365 |
-
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers], outputs=[output_single_text, output_file, plot], api_name="topics")
|
| 366 |
|
| 367 |
block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
|
| 368 |
|
|
|
|
| 80 |
hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
|
| 81 |
|
| 82 |
|
| 83 |
+
def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out):
|
| 84 |
+
|
| 85 |
all_tic = time.perf_counter()
|
| 86 |
|
| 87 |
output_list = []
|
|
|
|
| 144 |
|
| 145 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
| 146 |
|
| 147 |
+
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
| 148 |
|
| 149 |
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
|
| 150 |
|
|
|
|
| 272 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
| 273 |
output_list.append(topic_model_save_name_zip)
|
| 274 |
|
| 275 |
+
# If you want to save your embedding files
|
| 276 |
if return_intermediate_files == "Yes":
|
| 277 |
print("Saving embeddings to file")
|
| 278 |
if low_resource_mode == "Yes":
|
| 279 |
embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
| 280 |
else:
|
| 281 |
+
if embeddings_super_compress == "No":
|
| 282 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
|
| 283 |
+
else:
|
| 284 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embedding_compress.npz'
|
| 285 |
|
| 286 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
| 287 |
|
|
|
|
| 301 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
| 302 |
print(time_out)
|
| 303 |
|
| 304 |
+
return output_text, output_list, topics_vis, embeddings_out
|
| 305 |
|
| 306 |
all_toc = time.perf_counter()
|
| 307 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
| 308 |
print(time_out)
|
| 309 |
|
| 310 |
+
return output_text, output_list, None, embeddings_out
|
|
|
|
|
|
|
| 311 |
|
| 312 |
# ## Gradio app - extract topics
|
| 313 |
|
|
|
|
| 316 |
with block:
|
| 317 |
|
| 318 |
data_state = gr.State(pd.DataFrame())
|
| 319 |
+
embeddings_state = gr.State(np.array([]))
|
| 320 |
|
| 321 |
gr.Markdown(
|
| 322 |
"""
|
|
|
|
| 362 |
visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
|
| 363 |
|
| 364 |
# Update column names dropdown when file uploaded
|
| 365 |
+
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state])
|
| 366 |
in_colnames.change(dummy_function, in_colnames, None)
|
| 367 |
|
| 368 |
+
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state], api_name="topics")
|
| 369 |
|
| 370 |
block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
|
| 371 |
|
funcs/embeddings.py
CHANGED
|
@@ -13,60 +13,60 @@ if cuda.is_available():
|
|
| 13 |
else:
|
| 14 |
torch_device = "cpu"
|
| 15 |
|
| 16 |
-
def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
print("Loading embeddings from file.")
|
| 22 |
-
embeddings_out = np.load(embeddings_file_names[0])['arr_0']
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
embeddings_out
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
|
| 48 |
-
|
|
|
|
| 49 |
|
| 50 |
-
|
| 51 |
-
print("Creating dense embeddings based on transformers model")
|
| 52 |
|
| 53 |
-
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
| 57 |
-
print(time_out)
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
print(
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
|
| 71 |
# Pre-reduce embeddings for visualisation purposes
|
| 72 |
if reduce_embeddings == "Yes":
|
|
|
|
| 13 |
else:
|
| 14 |
torch_device = "cpu"
|
| 15 |
|
| 16 |
+
def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
|
| 17 |
|
| 18 |
+
# If no embeddings found, make or load in
|
| 19 |
+
if embeddings_out.size == 0:
|
| 20 |
+
print("Embeddings not found. Loading or generating new ones.")
|
| 21 |
|
| 22 |
+
embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
if embeddings_file_names:
|
| 25 |
+
print("Loading embeddings from file.")
|
| 26 |
+
embeddings_out = np.load(embeddings_file_names[0])['arr_0']
|
| 27 |
|
| 28 |
+
# If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
|
| 29 |
+
if "compress" in embeddings_file_names[0]:
|
| 30 |
+
embeddings_out /= 100
|
| 31 |
|
| 32 |
+
if not embeddings_file_names:
|
| 33 |
+
tic = time.perf_counter()
|
| 34 |
+
print("Starting to embed documents.")
|
|
|
|
| 35 |
|
| 36 |
+
# Custom model
|
| 37 |
+
# If on CPU, don't resort to embedding models
|
| 38 |
+
if low_resource_mode_opt == "Yes":
|
| 39 |
+
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
| 40 |
|
| 41 |
+
embedding_model = make_pipeline(
|
| 42 |
+
TfidfVectorizer(),
|
| 43 |
+
TruncatedSVD(100, random_state=random_seed)
|
| 44 |
+
)
|
| 45 |
|
| 46 |
+
# Fit the pipeline to the text data
|
| 47 |
+
embedding_model.fit(docs)
|
| 48 |
|
| 49 |
+
# Transform text data to embeddings
|
| 50 |
+
embeddings_out = embedding_model.transform(docs)
|
| 51 |
|
| 52 |
+
#embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
|
|
|
|
| 53 |
|
| 54 |
+
elif low_resource_mode_opt == "No":
|
| 55 |
+
print("Creating dense embeddings based on transformers model")
|
| 56 |
|
| 57 |
+
embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
toc = time.perf_counter()
|
| 60 |
+
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
| 61 |
+
print(time_out)
|
| 62 |
+
|
| 63 |
+
# If the user has chosen to go with super compressed embedding files to save disk space
|
| 64 |
+
if embeddings_super_compress == "Yes":
|
| 65 |
+
embeddings_out = np.round(embeddings_out, 3)
|
| 66 |
+
embeddings_out *= 100
|
| 67 |
+
|
| 68 |
+
else:
|
| 69 |
+
print("Found pre-loaded embeddings.")
|
| 70 |
|
| 71 |
# Pre-reduce embeddings for visualisation purposes
|
| 72 |
if reduce_embeddings == "Yes":
|
funcs/helper_functions.py
CHANGED
|
@@ -5,6 +5,7 @@ import pandas as pd
|
|
| 5 |
import gradio as gr
|
| 6 |
import gzip
|
| 7 |
import pickle
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def detect_file_type(filename):
|
|
@@ -62,8 +63,9 @@ def put_columns_in_df(in_file, in_bm25_column):
|
|
| 62 |
|
| 63 |
|
| 64 |
concat_choices.extend(new_choices)
|
| 65 |
-
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
def get_file_path_end(file_path):
|
| 69 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
import gzip
|
| 7 |
import pickle
|
| 8 |
+
import numpy as np
|
| 9 |
|
| 10 |
|
| 11 |
def detect_file_type(filename):
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
concat_choices.extend(new_choices)
|
| 66 |
+
|
| 67 |
+
#The np.array([]) at the end is for clearing the embedding state when a new file is loaded
|
| 68 |
+
return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([])
|
| 69 |
|
| 70 |
def get_file_path_end(file_path):
|
| 71 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|