Update app.py
app.py CHANGED
@@ -301,7 +301,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
 
     return chunks, embedding_model, len(text.split())
 
-def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, lang='german', apply_phonetic=True, phonetic_weight=0.3):
+def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=True, phonetic_weight=0.3):
     preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
 
     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
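Note that `expected_result` is inserted before `lang` rather than appended, so any caller that passed `lang` positionally would now silently bind it to `expected_result`. The commit updates its own call site (see the hunk at line 489) and the template below uses keyword arguments; the following sketch, with stand-in functions rather than the app's real ones, illustrates the hazard:

```python
# Illustration only: inserting a parameter mid-signature shifts positional arguments.
def before(query, top_k, lang='german'):
    return lang

def after(query, top_k, expected_result=None, lang='german'):
    return expected_result, lang

print(before("q", 5, "english"))      # 'english'
print(after("q", 5, "english"))       # ('english', 'german') - lang silently defaulted
print(after("q", 5, lang="english"))  # (None, 'english') - keyword form stays correct
```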
@@ -330,7 +330,9 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 
     results_df = pd.DataFrame({
         'content': [doc.page_content for doc in results],
-        'embedding': embeddings
+        'embedding': embeddings,
+        'length': [len(doc.page_content) for doc in results],
+        'contains_expected': [expected_result in doc.page_content if expected_result else None for doc in results]
     })
 
     return results_df, end_time - start_time, vector_store, results
@@ -340,10 +342,12 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 # Evaluation Metrics
 # ... (previous code remains the same)
 
-def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
+def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result=None):
     stats = {
         "num_results": len(results),
         "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
+        "min_content_length": min([len(doc.page_content) for doc in results]) if results else 0,
+        "max_content_length": max([len(doc.page_content) for doc in results]) if results else 0,
         "search_time": search_time,
         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
         "num_documents": len(vector_store.docstore._dict),
@@ -353,6 +357,10 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
         "top_k": top_k,
     }
 
+    if expected_result:
+        stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
+        stats["expected_result_rank"] = next((i for i, doc in enumerate(results) if expected_result in doc.page_content), -1) + 1
+
     if len(results) > 1000:
         embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
         pairwise_similarities = np.inner(embeddings, embeddings)
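To make the two new metrics concrete, here is a self-contained sketch that mirrors the committed expressions; `Doc` is a stand-in for the LangChain `Document` objects `app.py` actually handles. Note that when the expected string is absent, `next()` falls back to -1 and the rank comes out as 0 rather than `None`:

```python
from dataclasses import dataclass

# Stand-in for langchain's Document; only page_content matters here.
@dataclass
class Doc:
    page_content: str

results = [Doc("irrelevant chunk"), Doc("the Eiffel Tower is in Paris"), Doc("more filler")]
expected_result = "Eiffel Tower"

# Same expressions as in calculate_statistics above.
contains_expected = any(expected_result in doc.page_content for doc in results)
expected_result_rank = next((i for i, doc in enumerate(results)
                             if expected_result in doc.page_content), -1) + 1

print(contains_expected)     # True
print(expected_result_rank)  # 2 (1-based; a miss would print 0)
```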
@@ -373,7 +381,6 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
     stats["rank_correlation"] = rank_correlation
 
     return stats
-
 # Visualization
 def visualize_results(results_df, stats_df):
     fig, axs = plt.subplots(2, 2, figsize=(20, 20))
@@ -432,7 +439,7 @@ def rerank_results(results, query, reranker):
     return reranked_results
 
 # Main Comparison Function
-def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, expected_result=None, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
     all_results = []
     all_stats = []
     settings = {
@@ -489,6 +496,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
                 search_type,
                 query,
                 top_k,
+                expected_result,
                 lang,
                 apply_phonetic,
                 phonetic_weight
@@ -500,13 +508,15 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
 
             result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
-            stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
+            stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
             stats["model"] = f"{model_type} - {model_name}"
             stats.update(settings)
 
             formatted_results = format_results(results_raw, stats)
             for i, result in enumerate(formatted_results):
                 result['embedding'] = result_embeddings[i]
+                result['length'] = len(result['Content'])
+                result['contains_expected'] = expected_result in result['Content'] if expected_result else None
 
             all_results.extend(formatted_results)
             all_stats.append(stats)
@@ -651,6 +661,7 @@ def launch_interface(share=True):
         with gr.Tab("Simple"):
             file_input = gr.File(label="Upload File (Optional)")
             query_input = gr.Textbox(label="Search Query")
+            expected_result_input = gr.Textbox(label="Expected Result (Optional)")
             embedding_models_input = gr.CheckboxGroup(
                 choices=[
                     "HuggingFace:paraphrase-miniLM",
@@ -661,7 +672,7 @@ def launch_interface(share=True):
                 label="Embedding Models"
             )
             top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
-
+
         with gr.Tab("Advanced"):
             custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
             split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -723,7 +734,7 @@ def launch_interface(share=True):
             inputs=[
                 file_input, query_input, embedding_models_input, custom_embedding_model_input,
                 split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
-                vector_store_type_input, search_type_input, top_k_input, lang_input,
+                vector_store_type_input, search_type_input, top_k_input, expected_result_input, lang_input,
                 apply_preprocessing_input, optimize_vocab_input, apply_phonetic_input,
                 phonetic_weight_input, custom_tokenizer_file_input, custom_tokenizer_model_input,
                 custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
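Gradio binds the `inputs` list to the function's parameters positionally, so `expected_result_input` has to sit at exactly the slot where `expected_result` appears in `compare_embeddings`. The pairing below, derived from the two hunks above and shown only for the slots visible in them, is an illustrative check of that alignment:

```python
# Illustrative pairing of UI widgets to compare_embeddings parameters (first 13 slots).
fn_params = ["file", "query", "embedding_models", "custom_embedding_model",
             "split_strategy", "chunk_size", "overlap_size", "custom_separators",
             "vector_store_type", "search_type", "top_k", "expected_result", "lang"]
ui_inputs = ["file_input", "query_input", "embedding_models_input", "custom_embedding_model_input",
             "split_strategy_input", "chunk_size_input", "overlap_size_input", "custom_separators_input",
             "vector_store_type_input", "search_type_input", "top_k_input", "expected_result_input", "lang_input"]
for widget, param in zip(ui_inputs, fn_params):
    print(f"{widget:40s} -> {param}")
```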
@@ -1066,7 +1077,80 @@ def rerank_results(results, query, reranker):
 
     This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
 
+    # Template
+
+    ```python
+    # Chat App Template
+    def create_chat_app(settings):
+        def chat(message, history):
+            # Process the document using the configured embedding model and vector store
+            chunks, embedding_model, _ = process_files(
+                settings['file_path'],
+                settings['model_type'],
+                settings['model_name'],
+                settings['split_strategy'],
+                settings['chunk_size'],
+                settings['overlap_size'],
+                settings['custom_separators'],
+                settings['lang'],
+                settings['apply_preprocessing']
+            )
+
+            results, _, _, _ = search_embeddings(
+                chunks,
+                embedding_model,
+                settings['vector_store_type'],
+                settings['search_type'],
+                message,
+                settings['top_k'],
+                lang=settings['lang'],
+                apply_phonetic=settings['apply_phonetic'],
+                phonetic_weight=settings['phonetic_weight']
+            )
+
+            # Generate a response based on the retrieved results
+            # (`results` is the DataFrame returned by search_embeddings, so iterate its rows)
+            response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
+            for i, (_, result) in enumerate(results.head(settings['top_k']).iterrows()):
+                response += f"{i+1}. {result['content'][:100]}...\n\n"
+
+            # Append the turn to the chat history and clear the input box
+            return "", history + [(message, response)]
+
+        with gr.Blocks() as chat_interface:
+            gr.Markdown(f"# Chat App using {settings['model_type']} - {settings['model_name']}")
+            chatbot = gr.Chatbot()
+            msg = gr.Textbox()
+            clear = gr.Button("Clear")
+
+            msg.submit(chat, [msg, chatbot], [msg, chatbot])
+            clear.click(lambda: None, None, chatbot, queue=False)
+
+        return chat_interface
+
+    # Sample usage of the chat app template
+    sample_settings = {
+        'file_path': 'path/to/your/document.pdf',
+        'model_type': 'HuggingFace',
+        'model_name': 'paraphrase-miniLM',
+        'split_strategy': 'recursive',
+        'chunk_size': 500,
+        'overlap_size': 50,
+        'custom_separators': None,
+        'vector_store_type': 'FAISS',
+        'search_type': 'similarity',
+        'top_k': 3,
+        'lang': 'english',
+        'apply_preprocessing': True,
+        'apply_phonetic': True,
+        'phonetic_weight': 0.3
+    }
+
+    sample_chat_app = create_chat_app(sample_settings)
 
+    if __name__ == "__main__":
+        launch_interface()
+        # Uncomment the following line to launch the sample chat app
+        # sample_chat_app.launch()
+    ```
 
     """
 
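Finally, a minimal end-to-end sketch of the new parameter in use. This is not part of the commit: it assumes `app.py`'s functions keep the signatures shown in the hunks above, can be imported without side effects, and have their dependencies installed; the file name, query, and expected string are placeholders.

```python
# Hypothetical smoke test for the expected_result plumbing.
from app import process_files, search_embeddings

chunks, embedding_model, num_tokens = process_files(
    "sample.txt", "HuggingFace", "paraphrase-miniLM",
    "recursive", 500, 50, None, "english", True,
)
results_df, search_time, vector_store, results = search_embeddings(
    chunks, embedding_model, "FAISS", "similarity",
    "what is retrieval-augmented generation?", 5,
    expected_result="retrieval-augmented",
    lang="english",
)
# The two new per-result columns added by this commit:
print(results_df[["length", "contains_expected"]])
```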