Update app.py
app.py CHANGED
@@ -301,7 +301,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
 
     return chunks, embedding_model, len(text.split())
 
-def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, lang='german', apply_phonetic=True, phonetic_weight=0.3):
+def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=True, phonetic_weight=0.3):
     preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
 
     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
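Note that `expected_result` is inserted before `lang` rather than appended, so any caller that passed `lang` positionally would now silently bind it to `expected_result`. The commit updates its own call site (see the hunk at line 489) and the template below uses keyword arguments; the following sketch, with stand-in functions rather than the app's real ones, illustrates the hazard:

```python
# Illustration only: inserting a parameter mid-signature shifts positional arguments.
def before(query, top_k, lang='german'):
    return lang

def after(query, top_k, expected_result=None, lang='german'):
    return expected_result, lang

print(before("q", 5, "english"))      # 'english'
print(after("q", 5, "english"))       # ('english', 'german') - lang silently defaulted
print(after("q", 5, lang="english"))  # (None, 'english') - keyword form stays correct
```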
@@ -330,7 +330,9 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 
     results_df = pd.DataFrame({
         'content': [doc.page_content for doc in results],
-        'embedding': embeddings
+        'embedding': embeddings,
+        'length': [len(doc.page_content) for doc in results],
+        'contains_expected': [expected_result in doc.page_content if expected_result else None for doc in results]
     })
 
     return results_df, end_time - start_time, vector_store, results
@@ -340,10 +342,12 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 # Evaluation Metrics
 # ... (previous code remains the same)
 
-def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
+def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result=None):
     stats = {
         "num_results": len(results),
         "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
+        "min_content_length": min([len(doc.page_content) for doc in results]) if results else 0,
+        "max_content_length": max([len(doc.page_content) for doc in results]) if results else 0,
         "search_time": search_time,
         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
         "num_documents": len(vector_store.docstore._dict),
@@ -353,6 +357,10 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
         "top_k": top_k,
     }
 
+    if expected_result:
+        stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
+        stats["expected_result_rank"] = next((i for i, doc in enumerate(results) if expected_result in doc.page_content), -1) + 1
+
     if len(results) > 1000:
         embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
         pairwise_similarities = np.inner(embeddings, embeddings)
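To make the two new metrics concrete, here is a self-contained sketch that mirrors the committed expressions; `Doc` is a stand-in for the LangChain `Document` objects `app.py` actually handles. Note that when the expected string is absent, `next()` falls back to -1 and the rank comes out as 0 rather than `None`:

```python
from dataclasses import dataclass

# Stand-in for langchain's Document; only page_content matters here.
@dataclass
class Doc:
    page_content: str

results = [Doc("irrelevant chunk"), Doc("the Eiffel Tower is in Paris"), Doc("more filler")]
expected_result = "Eiffel Tower"

# Same expressions as in calculate_statistics above.
contains_expected = any(expected_result in doc.page_content for doc in results)
expected_result_rank = next((i for i, doc in enumerate(results)
                             if expected_result in doc.page_content), -1) + 1

print(contains_expected)     # True
print(expected_result_rank)  # 2 (1-based; a miss would print 0)
```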
@@ -373,7 +381,6 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
     stats["rank_correlation"] = rank_correlation
 
     return stats
-
 # Visualization
 def visualize_results(results_df, stats_df):
     fig, axs = plt.subplots(2, 2, figsize=(20, 20))
@@ -432,7 +439,7 @@ def rerank_results(results, query, reranker):
     return reranked_results
 
 # Main Comparison Function
-def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, expected_result=None, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
     all_results = []
     all_stats = []
     settings = {
@@ -489,6 +496,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
                 search_type,
                 query,
                 top_k,
+                expected_result,
                 lang,
                 apply_phonetic,
                 phonetic_weight
@@ -500,13 +508,15 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
 
             result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
-            stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
+            stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
             stats["model"] = f"{model_type} - {model_name}"
             stats.update(settings)
 
             formatted_results = format_results(results_raw, stats)
             for i, result in enumerate(formatted_results):
                 result['embedding'] = result_embeddings[i]
+                result['length'] = len(result['Content'])
+                result['contains_expected'] = expected_result in result['Content'] if expected_result else None
 
             all_results.extend(formatted_results)
             all_stats.append(stats)
@@ -651,6 +661,7 @@ def launch_interface(share=True):
         with gr.Tab("Simple"):
             file_input = gr.File(label="Upload File (Optional)")
             query_input = gr.Textbox(label="Search Query")
+            expected_result_input = gr.Textbox(label="Expected Result (Optional)")
             embedding_models_input = gr.CheckboxGroup(
                 choices=[
                     "HuggingFace:paraphrase-miniLM",
@@ -661,7 +672,7 @@ def launch_interface(share=True):
                 label="Embedding Models"
             )
             top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
-
+
         with gr.Tab("Advanced"):
             custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
             split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -723,7 +734,7 @@ def launch_interface(share=True):
             inputs=[
                 file_input, query_input, embedding_models_input, custom_embedding_model_input,
                 split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
-                vector_store_type_input, search_type_input, top_k_input, lang_input,
+                vector_store_type_input, search_type_input, top_k_input, expected_result_input, lang_input,
                 apply_preprocessing_input, optimize_vocab_input, apply_phonetic_input,
                 phonetic_weight_input, custom_tokenizer_file_input, custom_tokenizer_model_input,
                 custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
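Gradio binds the `inputs` list to the function's parameters positionally, so `expected_result_input` has to sit at exactly the slot where `expected_result` appears in `compare_embeddings`. The pairing below, derived from the two hunks above and shown only for the slots visible in them, is an illustrative check of that alignment:

```python
# Illustrative pairing of UI widgets to compare_embeddings parameters (first 13 slots).
fn_params = ["file", "query", "embedding_models", "custom_embedding_model",
             "split_strategy", "chunk_size", "overlap_size", "custom_separators",
             "vector_store_type", "search_type", "top_k", "expected_result", "lang"]
ui_inputs = ["file_input", "query_input", "embedding_models_input", "custom_embedding_model_input",
             "split_strategy_input", "chunk_size_input", "overlap_size_input", "custom_separators_input",
             "vector_store_type_input", "search_type_input", "top_k_input", "expected_result_input", "lang_input"]
for widget, param in zip(ui_inputs, fn_params):
    print(f"{widget:40s} -> {param}")
```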
@@ -1066,7 +1077,80 @@ def rerank_results(results, query, reranker):
 
     This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
 
+    # Template
+
+    ```python
+    # Chat App Template
+    def create_chat_app(settings):
+        def chat(message, history):
+            # Process the document using the configured embedding model and vector store
+            chunks, embedding_model, _ = process_files(
+                settings['file_path'],
+                settings['model_type'],
+                settings['model_name'],
+                settings['split_strategy'],
+                settings['chunk_size'],
+                settings['overlap_size'],
+                settings['custom_separators'],
+                settings['lang'],
+                settings['apply_preprocessing']
+            )
+
+            results, _, _, _ = search_embeddings(
+                chunks,
+                embedding_model,
+                settings['vector_store_type'],
+                settings['search_type'],
+                message,
+                settings['top_k'],
+                lang=settings['lang'],
+                apply_phonetic=settings['apply_phonetic'],
+                phonetic_weight=settings['phonetic_weight']
+            )
+
+            # Generate a response based on the retrieved results
+            # (`results` is the DataFrame returned by search_embeddings, so iterate its rows)
+            response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
+            for i, (_, result) in enumerate(results.head(settings['top_k']).iterrows()):
+                response += f"{i+1}. {result['content'][:100]}...\n\n"
+
+            # Append the turn to the chat history and clear the input box
+            return "", history + [(message, response)]
+
+        with gr.Blocks() as chat_interface:
+            gr.Markdown(f"# Chat App using {settings['model_type']} - {settings['model_name']}")
+            chatbot = gr.Chatbot()
+            msg = gr.Textbox()
+            clear = gr.Button("Clear")
+
+            msg.submit(chat, [msg, chatbot], [msg, chatbot])
+            clear.click(lambda: None, None, chatbot, queue=False)
+
+        return chat_interface
+
+    # Sample usage of the chat app template
+    sample_settings = {
+        'file_path': 'path/to/your/document.pdf',
+        'model_type': 'HuggingFace',
+        'model_name': 'paraphrase-miniLM',
+        'split_strategy': 'recursive',
+        'chunk_size': 500,
+        'overlap_size': 50,
+        'custom_separators': None,
+        'vector_store_type': 'FAISS',
+        'search_type': 'similarity',
+        'top_k': 3,
+        'lang': 'english',
+        'apply_preprocessing': True,
+        'apply_phonetic': True,
+        'phonetic_weight': 0.3
+    }
+
+    sample_chat_app = create_chat_app(sample_settings)
 
+    if __name__ == "__main__":
+        launch_interface()
+        # Uncomment the following line to launch the sample chat app
+        # sample_chat_app.launch()
+    ```
 
     """
 
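Finally, a minimal end-to-end sketch of the new parameter in use. This is not part of the commit: it assumes `app.py`'s functions keep the signatures shown in the hunks above, can be imported without side effects, and have their dependencies installed; the file name, query, and expected string are placeholders.

```python
# Hypothetical smoke test for the expected_result plumbing.
from app import process_files, search_embeddings

chunks, embedding_model, num_tokens = process_files(
    "sample.txt", "HuggingFace", "paraphrase-miniLM",
    "recursive", 500, 50, None, "english", True,
)
results_df, search_time, vector_store, results = search_embeddings(
    chunks, embedding_model, "FAISS", "similarity",
    "what is retrieval-augmented generation?", 5,
    expected_result="retrieval-augmented",
    lang="english",
)
# The two new per-result columns added by this commit:
print(results_df[["length", "contains_expected"]])
```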