Advanced_Embeddings_Comparator

Build error

App Files Files Community

Chris4K committed on Oct 18, 2024

Commit

c77f8ac

verified ·

1 Parent(s): ce988dc

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -28

app.py CHANGED Viewed

@@ -171,18 +171,26 @@ class CustomEmbeddings(HuggingFaceEmbeddings):
 # Custom Tokenizer
-def create_custom_tokenizer(file_path):
     with open(file_path, 'r', encoding='utf-8') as f:
         text = f.read()
-    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
     tokenizer.pre_tokenizer = Whitespace()
-    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
     tokenizer.train_from_iterator([text], trainer)
     return tokenizer
 def custom_tokenize(text, tokenizer):
     return tokenizer.encode(text).tokens
@@ -243,7 +251,7 @@ def get_retriever(vector_store, search_type, search_kwargs):
         raise ValueError(f"Unsupported search type: {search_type}")
 # Main Processing Functions
-def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None):
     if file_path:
         text = FileHandler.extract_text(file_path)
     else:
@@ -253,7 +261,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
             text += FileHandler.extract_text(file_path)
     if custom_tokenizer_file:
-        tokenizer = create_custom_tokenizer(custom_tokenizer_file)
         text = ' '.join(custom_tokenize(text, tokenizer))
     else:
         text = preprocess_text(text, lang)
@@ -387,7 +395,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     return tokenizer, optimized_texts
 # Main Comparison Function
-def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None):
     all_results = []
     all_stats = []
     settings = {
@@ -399,12 +407,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
         "search_type": search_type,
         "top_k": top_k,
         "lang": lang,
-        "use_custom_embedding": use_custom_embedding,
         "optimize_vocab": optimize_vocab,
         "phonetic_weight": phonetic_weight
     }
-    for model_type, model_name in zip(model_types, model_names):
         # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
@@ -415,13 +427,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             overlap_size,
             custom_separators.split(',') if custom_separators else None,
             lang,
-            custom_tokenizer_file
         )
         # Custom embedding handling
-        if use_custom_embedding:
-            custom_model = create_custom_embedding(chunks) #add custom model by name, must com from gradio FE
-            embedding_model = CustomEmbeddings(custom_model)
         # Optimizing vocabulary if required
         if optimize_vocab:
@@ -490,8 +505,8 @@ def launch_interface(share=True):
         inputs=[
             gr.File(label="Upload File (Optional)"),
             gr.Textbox(label="Search Query"),
-            gr.CheckboxGroup(choices=list(model_manager.list_models().keys()) + ["Custom"], label="Embedding Model Types"),
-            gr.CheckboxGroup(choices=[model for models in model_manager.list_models().values() for model in models] + ["custom_model"], label="Embedding Models"),
             gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
             gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
             gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
@@ -500,10 +515,12 @@ def launch_interface(share=True):
             gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
             gr.Slider(1, 10, step=1, value=5, label="Top K"),
             gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
-            gr.Checkbox(label="Use Custom Embedding", value=False),
             gr.Checkbox(label="Optimize Vocabulary", value=False),
             gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
-            gr.File(label="Custom Tokenizer File (Optional)")
         ],
         outputs=[
             gr.Dataframe(label="Results", interactive=False),
@@ -523,13 +540,14 @@ def launch_interface(share=True):
     1. Upload a file (optional) or use the default files in the system.
     2. Enter a search query.
-    3. Select one or more embedding model types and specific models.
-    4. Choose a text splitting strategy and set chunk size and overlap.
-    5. Select a vector store type and search type.
-    6. Set the number of top results to retrieve.
-    7. Choose the language of your documents.
-    8. Optionally, use custom embeddings, optimize vocabulary, or adjust phonetic matching weight.
-    9. If you have a custom tokenizer, upload the file.
     The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
     """
@@ -539,7 +557,4 @@ def launch_interface(share=True):
         ["Embedding Comparison", "Tutorial"]
     )
-    iface.launch(share=share)
-if __name__ == "__main__":
-    launch_interface()

 # Custom Tokenizer
def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
    """Train a tokenizer of the requested model family on the text in a file.

    The file is read as UTF-8 and its whole content is handed to the trainer
    as a single training sample. Input is pre-tokenized on whitespace.

    Args:
        file_path: Path of the text file used as the training corpus.
        model_type: One of ``'WordLevel'``, ``'BPE'`` or ``'Unigram'``.
            ``None`` or an empty string falls back to ``'WordLevel'``,
            because callers forward an optional UI field unchanged.
        vocab_size: Target vocabulary size passed to the trainer.
        special_tokens: Special tokens to reserve. Defaults to
            ``["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]``.

    Returns:
        The trained ``tokenizers.Tokenizer``.

    Raises:
        ValueError: If ``model_type`` is not a supported model family.
    """
    # Imported locally so the function does not depend on which tokenizers
    # names the module header happens to import.
    from tokenizers import Tokenizer, models, trainers
    from tokenizers.pre_tokenizers import Whitespace

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    model_type = model_type or 'WordLevel'
    special_tokens = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    # Each model family must be trained with its own trainer class; a
    # WordLevelTrainer cannot train a BPE or Unigram model.
    if model_type == 'WordLevel':
        model = models.WordLevel(unk_token="[UNK]")
        trainer = trainers.WordLevelTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    elif model_type == 'BPE':
        model = models.BPE(unk_token="[UNK]")
        trainer = trainers.BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    elif model_type == 'Unigram':
        model = models.Unigram()
        # The Unigram model takes its unknown token from the trainer; without
        # it, encoding text containing unseen characters fails.
        trainer = trainers.UnigramTrainer(
            special_tokens=special_tokens,
            vocab_size=vocab_size,
            unk_token="[UNK]",
        )
    else:
        raise ValueError(f"Unsupported tokenizer model: {model_type}")

    tokenizer = Tokenizer(model)
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator([text], trainer)
    return tokenizer
 def custom_tokenize(text, tokenizer):
     """Encode ``text`` with ``tokenizer`` and return the encoding's ``tokens``."""
     encoding = tokenizer.encode(text)
     return encoding.tokens
         raise ValueError(f"Unsupported search type: {search_type}")
 # Main Processing Functions
+def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
     if file_path:
         text = FileHandler.extract_text(file_path)
     else:
             text += FileHandler.extract_text(file_path)
     if custom_tokenizer_file:
+        tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
         text = ' '.join(custom_tokenize(text, tokenizer))
     else:
         text = preprocess_text(text, lang)
     return tokenizer, optimized_texts
 # Main Comparison Function
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
     all_results = []
     all_stats = []
     settings = {
         "search_type": search_type,
         "top_k": top_k,
         "lang": lang,
         "optimize_vocab": optimize_vocab,
         "phonetic_weight": phonetic_weight
     }
+    # Parse embedding models
+    models = [model.strip().split(':') for model in embedding_models.split(',')]
+    if custom_embedding_model:
+        models.append(custom_embedding_model.strip().split(':'))
+    for model_type, model_name in models:
         # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             overlap_size,
             custom_separators.split(',') if custom_separators else None,
             lang,
+            custom_tokenizer_file,
+            custom_tokenizer_model,
+            int(custom_tokenizer_vocab_size),
+            custom_tokenizer_special_tokens.split(',') if custom_tokenizer_special_tokens else None
         )
         # Custom embedding handling
+        #if use_custom_embedding:
+        #    custom_model = create_custom_embedding(chunks) #add custom model by name, must come from gradio FE
+        #    embedding_model = CustomEmbeddings(custom_model)
         # Optimizing vocabulary if required
         if optimize_vocab:
         inputs=[
             gr.File(label="Upload File (Optional)"),
             gr.Textbox(label="Search Query"),
+            gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)"),
+            gr.Textbox(label="Custom Embedding Model (optional, format: type:name)"),
             gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
             gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
             gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
             gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
             gr.Slider(1, 10, step=1, value=5, label="Top K"),
             gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
             gr.Checkbox(label="Optimize Vocabulary", value=False),
             gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
+            gr.File(label="Custom Tokenizer File (Optional)"),
+            gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)"),
+            gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000"),
+            gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
         ],
         outputs=[
             gr.Dataframe(label="Results", interactive=False),
     1. Upload a file (optional) or use the default files in the system.
     2. Enter a search query.
+    3. Enter embedding models as a comma-separated list (e.g., HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002).
+    4. Optionally, specify a custom embedding model in the format type:name.
+    5. Choose a text splitting strategy and set chunk size and overlap.
+    6. Select a vector store type and search type.
+    7. Set the number of top results to retrieve.
+    8. Choose the language of your documents.
+    9. Optionally, optimize vocabulary or adjust phonetic matching weight.
+    10. If you have a custom tokenizer, upload the file and specify its attributes.
     The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
     """
         ["Embedding Comparison", "Tutorial"]
     )
+    iface.launch(share=share)