Commit cc495e1 · Parent(s): 49e0db8
Rearranged embeddings-creation functions to be compatible with a ZeroGPU space. Updated packages.
Files changed:
- README.md +1 -1
- app.py +1 -1
- funcs/clean_funcs.py +10 -4
- funcs/embeddings.py +36 -7
- funcs/topic_core_funcs.py +68 -90
- requirements.txt +8 -6
- requirements_aws.txt +1 -1
- requirements_gpu.txt +3 -4
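The thrust of the change: on a ZeroGPU space a GPU is only attached while a function decorated with `@spaces.GPU` runs, so the embedding model can no longer be built ahead of time in `topic_core_funcs.py`; the commit moves model construction inside `make_or_load_embeddings` and decorates it. A minimal sketch of the pattern, assuming the real `spaces` package; the `embed` helper is illustrative, not part of the repo:

```python
# Sketch of the ZeroGPU pattern this commit adopts: model construction
# and inference happen inside the @spaces.GPU-decorated function,
# because the GPU is attached only for the duration of that call.
import spaces
from sentence_transformers import SentenceTransformer

@spaces.GPU  # a GPU is allocated only while this function executes
def embed(docs: list):
    # Build (or download) the model inside the decorated function, not at
    # import time; the model name is the default from funcs/embeddings.py.
    model = SentenceTransformer("mixedbread-ai/mxbai-embed-xsmall-v1")
    return model.encode(docs)
```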
    	
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🚀
 colorFrom: red
 colorTo: yellow
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.8.0
 app_file: app.py
 pinned: true
 license: apache-2.0
    	
app.py CHANGED

@@ -76,7 +76,7 @@ with app:
 
         with gr.Accordion("Clean data", open = False):
             with gr.Row():
-                clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII,
+                clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, large numbers, emails, postcodes (UK).")
                 drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
                 anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
                 #with gr.Row():
    	
funcs/clean_funcs.py CHANGED

@@ -2,6 +2,7 @@ import re
 import string
 import unicodedata
 import polars as pl
+import pandas as pd
 import gradio as gr
 
 # Adding custom words to the stopwords
@@ -15,15 +16,18 @@ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
 non_ascii_pattern = r'[^\x00-\x7F]+'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
-
+and_sign_regex = r'&'
+forward_slash_regex = r'/'
+nums_five_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{5,}\b|\b[0-9]+\s[0-9]+\b' # Should match five digit numbers or more, and also if there are full stops or commas in between
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
 multiple_new_lines_regex = r'(\r\n|\n)+'
+multiple_punctuation_regex = r"(\p{P})\p{P}+"
 
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
 
     for text in texts:
-        if not text:
+        if not text or pd.isnull(text):
             text = ""
 
         # Normalize unicode characters to decompose any special forms
@@ -53,10 +57,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
         (html_start_pattern_end_dots_regex, ' '),
         (non_ascii_pattern, ' '),
         (email_pattern_regex, ' '),
-        (
+        (nums_five_more_regex, ' '),
         (postcode_pattern_regex, ' '),
         (multiple_spaces_regex, ' '),
-        (
+        (multiple_punctuation_regex, "${1}"),
+        (and_sign_regex, 'and')#,
+        #(forward_slash_regex, 'or')
     ]
 
     # Apply each regex replacement
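For reference, the new patterns behave roughly as follows. A standalone sketch, not from the repo, assuming polars: its Rust-style regex engine is what the `\p{P}` class and the `${1}` group reference in the diff imply.

```python
# Standalone illustration of the new clean_funcs.py patterns (assumes
# polars, whose Rust regex engine supports \p{P} and ${1} references).
import polars as pl

nums_five_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{5,}\b|\b[0-9]+\s[0-9]+\b'
multiple_punctuation_regex = r"(\p{P})\p{P}+"
and_sign_regex = r'&'

s = pl.Series(["Ref 123456: fish & chips!!", "Totals 12,345 and 1.5"])
s = s.str.replace_all(nums_five_more_regex, ' ')           # 5+ digit or separated numbers -> ' '
s = s.str.replace_all(multiple_punctuation_regex, "${1}")  # runs like '!!' collapse to '!'
s = s.str.replace_all(and_sign_regex, 'and')               # '&' -> 'and'
print(s.to_list())
```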
    	
funcs/embeddings.py CHANGED

@@ -1,7 +1,12 @@
 import time
 import numpy as np
 import os
+import spaces
 from torch import cuda, backends, version
+from sentence_transformers import SentenceTransformer
+from sklearn.pipeline import make_pipeline
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
 
 # Check for torch cuda
 # If you want to disable cuda for testing purposes
@@ -18,11 +23,9 @@ else:
     torch_device =  "cpu"
     high_quality_mode = "No"
 
-print("Device used is: ", torch_device)
 
-
-
-def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embedding_model, embeddings_super_compress: str, high_quality_mode_opt: str) -> np.ndarray:
+@spaces.GPU
+def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1") -> np.ndarray:
     """
     Create or load embeddings for the given documents.
 
@@ -30,7 +33,6 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
         docs (list): List of documents to embed.
         file_list (list): List of file names to check for existing embeddings.
        embeddings_out (np.ndarray): Array to store the embeddings.
-        embedding_model: Model used to generate embeddings.
         embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
         high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
 
@@ -38,6 +40,33 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
         np.ndarray: The generated or loaded embeddings.
     """
 
+    if high_quality_mode_opt == "Yes":
+    # Define a list of possible local locations to search for the model
+        local_embeddings_locations = [
+            "model/embed/", # Potential local location
+            "/model/embed/", # Potential location in Docker container
+            "/home/user/app/model/embed/" # This is inside a Docker container
+        ]
+
+        # Attempt to load the model from each local location
+        for location in local_embeddings_locations:
+            try:
+                embedding_model = SentenceTransformer(location)#, truncate_dim=512)
+                print(f"Found local model installation at: {location}")
+                break  # Exit the loop if the model is found
+            except Exception as e:
+                print(f"Failed to load model from {location}: {e}")
+                continue
+        else:
+            # If the loop completes without finding the model in any local location
+            embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
+            print("Could not find local model installation. Downloading from Huggingface")
+    else:
+        embedding_model = make_pipeline(
+                TfidfVectorizer(),
+                TruncatedSVD(100, random_state=random_seed)
+                )
+
     # If no embeddings found, make or load in
     if embeddings_out.size == 0:
         print("Embeddings not found. Loading or generating new ones.")
@@ -84,9 +113,9 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
             embeddings_out = np.round(embeddings_out, 3)
             embeddings_out *= 100
 
-        return embeddings_out
+        return embeddings_out, embedding_model
 
     else:
         print("Found pre-loaded embeddings.")
 
-        return embeddings_out
+        return embeddings_out, embedding_model
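Callers now receive the model back instead of passing one in. A hedged usage sketch of the new signature; the values are placeholders, and the real call site is `extract_topics` in funcs/topic_core_funcs.py:

```python
# Placeholder call showing the new in/out contract of
# make_or_load_embeddings (see extract_topics for the real call site).
import numpy as np
from funcs.embeddings import make_or_load_embeddings

docs = ["first document", "second document"]
file_list = []                  # no existing embeddings file to reuse
embeddings_out = np.array([])   # empty array forces fresh generation

embeddings_out, embedding_model = make_or_load_embeddings(
    docs, file_list, embeddings_out,
    embeddings_super_compress="No",  # "Yes" rounds/scales to shrink saved files
    high_quality_mode_opt="Yes",     # "Yes": SentenceTransformer; "No": TF-IDF + SVD
)   # embeddings_name defaults to "mixedbread-ai/mxbai-embed-xsmall-v1"
```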
    	
funcs/topic_core_funcs.py CHANGED

@@ -7,6 +7,7 @@ import pandas as pd
 import numpy as np
 import time
 from bertopic import BERTopic
+import spaces
 
 from typing import List, Type, Union
 PandasDataFrame = Type[pd.DataFrame]
@@ -17,13 +18,7 @@ from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder
 from funcs.embeddings import make_or_load_embeddings, torch_device
 from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
 from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
-
 from sklearn.feature_extraction.text import CountVectorizer
-
-from sentence_transformers import SentenceTransformer
-from sklearn.pipeline import make_pipeline
-from sklearn.decomposition import TruncatedSVD
-from sklearn.feature_extraction.text import TfidfVectorizer
 import funcs.anonymiser as anon
 from umap import UMAP
 
@@ -96,84 +91,88 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
     output_list = []
     #file_list = [string.name for string in in_files]
 
-    in_colnames_list_first
-    if not "original_index" in data.columns:
-        data = data.reset_index(names="original_index")
-        clean_tic = time.perf_counter()
-        print("Starting data clean.")
-        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
-        print(clean_time_out)
-        data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
-        data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
-        data = data[data[in_colnames_list_first].str.len() >= 50]
-        data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
-        #print("Data shape after duplicate/null removal: ", data.shape)
-        progress(0.4, desc= "Anonymising data")
-        data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
-        time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
-            data_file_name_no_ext = data_file_name_no_ext + "_split"
+    for in_colnames_list_first in in_colnames:
+
+        print("Cleaning column:", in_colnames_list_first)
+
+        #in_colnames_list_first = in_colnames[0]
+
+        # Reset original index to a new column so you can link it to data outputted from cleaning
+        if not "original_index" in data.columns:
+            data = data.reset_index(names="original_index")
+
+        if clean_text == "Yes":
+            clean_tic = time.perf_counter()
+            print("Starting data clean.")
+
+            data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
+
+            if '_clean' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_clean"
+
+            clean_toc = time.perf_counter()
+            clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
+            print(clean_time_out)
+
+        # Clean custom regex if exists
+        if not custom_regex.empty:
+            data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
+
+            if '_clean' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_clean"
+
+        if drop_duplicate_text == "Yes":
+            progress(0.3, desc= "Drop duplicates - remove short texts")
+
+            data_file_name_no_ext = data_file_name_no_ext + "_dedup"
+
+            #print("Removing duplicates and short entries from data")
+            #print("Data shape before: ", data.shape)
+            data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
+            data = data[data[in_colnames_list_first].str.len() >= 50]
+            data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
+
+            #print("Data shape after duplicate/null removal: ", data.shape)
+
+        if anonymise_drop == "Yes":
+            progress(0.4, desc= "Anonymising data")
+
+            if '_anon' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_anon"
+
+            anon_tic = time.perf_counter()
+
+            data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
+
+            data[in_colnames_list_first] = data_anon_col
+
+            print(anonymisation_success)
+
+            anon_toc = time.perf_counter()
+            time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
+
+            print(time_out)
+
+        if sentence_split_drop == "Yes":
+            progress(0.6, desc= "Splitting text into sentences")
+
+            if '_split' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_split"
+
+            anon_tic = time.perf_counter()
+
+            data = expand_sentences_spacy(data, in_colnames_list_first)
+            data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows with at more than 5 characters
+            data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
+            data.reset_index(inplace=True, drop=True)
+
+            anon_toc = time.perf_counter()
+            time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
+
+            print(time_out)
+
+            data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
 
     out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev +  ".csv"
     data.to_csv(out_data_name)
@@ -299,27 +298,6 @@ def extract_topics(
     if high_quality_mode == "Yes":
         print("Using high quality embedding model")
 
-        # Define a list of possible local locations to search for the model
-        local_embeddings_locations = [
-            "model/embed/", # Potential local location
-            "/model/embed/", # Potential location in Docker container
-            "/home/user/app/model/embed/" # This is inside a Docker container
-        ]
-
-        # Attempt to load the model from each local location
-        for location in local_embeddings_locations:
-            try:
-                embedding_model = SentenceTransformer(location)#, truncate_dim=512)
-                print(f"Found local model installation at: {location}")
-                break  # Exit the loop if the model is found
-            except Exception as e:
-                print(f"Failed to load model from {location}: {e}")
-                continue
-        else:
-            # If the loop completes without finding the model in any local location
-            embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
-            print("Could not find local model installation. Downloading from Huggingface")
-
         #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
 
         # If tfidf embeddings currently exist, wipe these empty
@@ -329,15 +307,15 @@ def extract_topics(
         embeddings_type_state = "large"
 
         # UMAP model uses Bertopic defaults
-        umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
+        #umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
 
     else:
        print("Choosing low resource TF-IDF model.")
 
-        embedding_model = make_pipeline(
-                TfidfVectorizer(),
-                TruncatedSVD(100, random_state=random_seed)
-                )
+        # embedding_model = make_pipeline(
+        #         TfidfVectorizer(),
+        #         TruncatedSVD(100, random_state=random_seed)
+        #         )
 
         # If large embeddings currently exist, wipe these empty, then rename embeddings type
         if embeddings_type_state == "large":
@@ -346,10 +324,10 @@ def extract_topics(
         embeddings_type_state = "tfidf"
 
         #umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
-
-
+    # UMAP model uses Bertopic defaults
+    umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=True, random_state=random_seed)
 
-    embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out,
+    embeddings_out, embedding_model = make_or_load_embeddings(docs, file_list, embeddings_out, embeddings_super_compress, high_quality_mode, embeddings_name)
 
      # If you want to save your embedding files
     if return_intermediate_files == "Yes":
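Behaviourally, the biggest change in `pre_clean` is that the clean/regex/dedup/anonymise/split steps now run once per selected column instead of only on the first. A toy sketch of the dedup step as it now applies per column; the column name and rows are invented for the example:

```python
# Toy illustration of the per-column dedup/short-text step in pre_clean
# ("text" and the rows are made up; pre_clean loops over in_colnames).
import pandas as pd

data = pd.DataFrame({"text": ["too short", "x" * 60, "x" * 60]})

for col in ["text"]:
    data[col] = data[col].str.strip()
    data = data[data[col].str.len() >= 50]  # drop strings under 50 chars
    data = data.drop_duplicates(subset=col).dropna(subset=col).reset_index()

print(len(data))  # 1: duplicates and short rows are gone
```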
    	
requirements.txt CHANGED

@@ -3,11 +3,10 @@ pandas==2.2.3
 plotly==5.24.1
 scikit-learn==1.5.2
 umap-learn==0.5.7
-gradio==5.
-boto3==1.35.
+gradio==5.8.0
+boto3==1.35.71
 transformers==4.46.3
 accelerate==1.1.1
-torch==2.5.1
 bertopic==0.16.4
 spacy==3.8.0
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
@@ -18,6 +17,9 @@ presidio_analyzer==2.2.355
 presidio_anonymizer==2.2.355
 scipy
 polars
-sentence-transformers==3.
-
-#
+sentence-transformers==3.3.1
+torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
+#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# Specify exact llama_cpp wheel for huggingface compatibility
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
+numpy==1.26.4
         | 
    	
        requirements_aws.txt
    CHANGED
    
    | @@ -6,7 +6,7 @@ umap-learn==0.5.7 | |
| 6 | 
             
            boto3==1.35.64
         | 
| 7 | 
             
            spacy==3.8.0
         | 
| 8 | 
             
            en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
         | 
| 9 | 
            -
            gradio==5. | 
| 10 | 
             
            pyarrow
         | 
| 11 | 
             
            openpyxl
         | 
| 12 | 
             
            Faker
         | 
|  | |
| 6 | 
             
            boto3==1.35.64
         | 
| 7 | 
             
            spacy==3.8.0
         | 
| 8 | 
             
            en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
         | 
| 9 | 
            +
            gradio==5.8.0
         | 
| 10 | 
             
            pyarrow
         | 
| 11 | 
             
            openpyxl
         | 
| 12 | 
             
            Faker
         | 
    	
requirements_gpu.txt CHANGED

@@ -18,8 +18,7 @@ presidio_analyzer==2.2.355
 presidio_anonymizer==2.2.355
 scipy
 polars
-llama-cpp-python==0.
-
-
-#numpy==1.26.4
+llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+sentence-transformers==3.3.1
+numpy==1.26.4
 