Update app.py

app.py CHANGED

@@ -74,6 +74,8 @@ FILES_DIR = './files'
 # Model Management
 class ModelManager:
     def __init__(self):
+        self.rankings: Dict[str, float] = {}
+        self.model_stats: Dict[str, Dict[str, Any]] = {}
         self.models = {
             'HuggingFace': {
                 'e5-base-de': "danielheinz/e5-base-sts-en-de",

@@ -90,6 +92,28 @@ class ModelManager:
             }
         }

+
+    def update_model_ranking(self, model_id: str, score: float, feedback: Optional[str] = None):
+        """Update model ranking based on performance and optional feedback"""
+        current_score = self.rankings.get(model_id, 0.0)
+        # Weighted average of current score and new score
+        self.rankings[model_id] = 0.7 * current_score + 0.3 * score
+
+        if feedback:
+            if model_id not in self.model_stats:
+                self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
+            self.model_stats[model_id]["feedback_count"] += 1
+            self.model_stats[model_id]["feedback"].append(feedback)
+
+    def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
+        """Get top n ranked models"""
+        return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
+
+    def get_model_stats(self, model_id: str) -> Dict[str, Any]:
+        """Get statistics for a specific model"""
+        return self.model_stats.get(model_id, {})
+
+
     def add_model(self, provider, name, model_path):
         if provider not in self.models:
             self.models[provider] = {}
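
Reviewer note: the new ranking state is an exponential moving average (0.7 of the old score, 0.3 of the new one), so repeated updates decay older measurements. A minimal usage sketch of the added API; it assumes Dict, Any, List, Optional, and Tuple are imported from typing, which this diff does not show:

    manager = ModelManager()
    manager.update_model_ranking("HuggingFace:e5-base-de", 0.8, feedback="good recall")
    # First update: 0.7 * 0.0 + 0.3 * 0.8 = 0.24
    manager.update_model_ranking("HuggingFace:e5-base-de", 0.4)
    # Second update: 0.7 * 0.24 + 0.3 * 0.4 = 0.288
    print(manager.get_top_models(n=3))
    # -> [('HuggingFace:e5-base-de', 0.288)]
    print(manager.get_model_stats("HuggingFace:e5-base-de"))
    # -> {'feedback_count': 1, 'feedback': ['good recall']}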

@@ -197,7 +221,7 @@ class FileHandler:
 def simple_tokenize(text):
     return text.split()

-def preprocess_text(text, lang='german', apply_preprocessing=True):
+def preprocess_text(text, lang='german', apply_preprocessing=False):
     if not apply_preprocessing:
         return text


@@ -225,7 +249,7 @@ def preprocess_text(text, lang='german', apply_preprocessing=True):

     return ' '.join(tokens)

-def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=True):
+def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
     if not apply_phonetic:
         return 0
     if method == 'levenshtein_distance':

@@ -390,7 +414,7 @@ def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):


 # Main Processing Functions
-def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', apply_preprocessing=True):
+def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', apply_preprocessing=False, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
     if file_path:
         text = FileHandler.extract_text(file_path)
     else:

@@ -412,7 +436,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,

     return chunks, embedding_model, len(text.split())

-def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=True, phonetic_weight=0.3):
+def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
     preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query

     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)

@@ -421,6 +445,7 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     start_time = time.time()
     results = retriever.invoke(preprocessed_query)

+    #this should be optional
     def score_result(doc):
         base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]

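
Reviewer note: the comment added above flags a real cost. score_result issues a second similarity_search_with_score call per retrieved document just to recover a base score, so making this step optional (or reusing the scores returned by the initial retrieval) would save one vector-store query per result.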

@@ -452,68 +477,83 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q

     return results_df, end_time - start_time, vector_store, results

-
-
-
-def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
-
-
-
-
-
-
-
-
-
-
-
-    # Safely get vector store size
-    try:
-        if hasattr(vector_store, '_index'):
-            stats["vector_store_size"] = vector_store._index.ntotal
-        elif hasattr(vector_store, '_collection'):
-            stats["vector_store_size"] = len(vector_store._collection.get())
-        else:
-            stats["vector_store_size"] = "N/A"
-    except:
-        stats["vector_store_size"] = "N/A"
-
-    # Safely get document count
-    try:
-        if hasattr(vector_store, 'docstore'):
-            stats["num_documents"] = len(vector_store.docstore._dict)
-        elif hasattr(vector_store, '_collection'):
-            stats["num_documents"] = len(vector_store._collection.get())
-        else:
-            stats["num_documents"] = len(results)
-    except:
-        stats["num_documents"] = len(results)
-
-
-    if expected_result:
-        stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
-        stats["expected_result_rank"] = next((i for i, doc in enumerate(results) if expected_result in doc.page_content), -1) + 1
-
-    if len(results) > 1000:
-        embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
-        pairwise_similarities = np.inner(embeddings, embeddings)
-        stats["result_diversity"] = 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
+# Enhanced Result Analysis
+class ResultAnalyzer:
+    @staticmethod
+    def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
+                             top_k, expected_result=None, model_feedback=None):
+        stats = {
+            "num_results": len(results),
+            "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
+            "min_content_length": min([len(doc.page_content) for doc in results]) if results else 0,
+            "max_content_length": max([len(doc.page_content) for doc in results]) if results else 0,
+            "search_time": search_time,
+            "num_tokens": num_tokens,
+            "embedding_dimension": len(embedding_model.embed_query(query)),
+            "top_k": top_k,
+        }

-
-
+        # Add vector store statistics
+        try:
+            if hasattr(vector_store, '_index'):
+                stats["vector_store_size"] = vector_store._index.ntotal
+            elif hasattr(vector_store, '_collection'):
+                stats["vector_store_size"] = len(vector_store._collection.get())
+        except:
+            stats["vector_store_size"] = "N/A"
+
+        # Add expected result statistics if provided
+        if expected_result:
+            stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
+            stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
+                                                  if expected_result in doc.page_content), -1) + 1
+
+        # Calculate diversity metrics for larger result sets
+        if len(results) > 3:  # Changed from 1000 to make it more practical
+            embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
+            stats["result_diversity"] = ResultAnalyzer._calculate_diversity(embeddings)
+            stats["silhouette_score"] = ResultAnalyzer._calculate_silhouette(embeddings)
         else:
+            stats["result_diversity"] = "N/A"
             stats["silhouette_score"] = "N/A"
-
-
-
+
+        # Add ranking correlation
+        query_embedding = embedding_model.embed_query(query)
+        result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
+        similarities = [np.inner(query_embedding, emb) for emb in result_embeddings]
+        if len(similarities) > 1:
+            rank_correlation, _ = spearmanr(similarities, range(len(similarities)))
+            stats["rank_correlation"] = rank_correlation
+        else:
+            stats["rank_correlation"] = "N/A"
+
+        # Add model feedback if provided
+        if model_feedback:
+            stats["model_feedback"] = model_feedback
+
+        return stats

-
-
-
-
-
+    @staticmethod
+    def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
+        """Calculate diversity score for embeddings"""
+        embeddings_array = np.array(embeddings)
+        pairwise_similarities = np.inner(embeddings_array, embeddings_array)
+        return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])

-
+    @staticmethod
+    def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
+        """Calculate silhouette score for embeddings"""
+        if len(embeddings) < 3:
+            return 0.0
+        try:
+            return silhouette_score(embeddings, range(len(embeddings)))
+        except:
+            return 0.0
+
+
+
+
+
 # Visualization
 def visualize_results(results_df, stats_df):
     # Add model column if not present
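
Reviewer note: _calculate_diversity returns one minus the mean pairwise inner product over the strict upper triangle, i.e. the mean cosine distance when the embeddings are unit-normalized. A self-contained sketch of the same computation:

    import numpy as np

    def diversity(embeddings):
        # Mirrors ResultAnalyzer._calculate_diversity: 1 - mean pairwise
        # inner product over the strict upper triangle.
        emb = np.array(embeddings)
        sims = np.inner(emb, emb)
        return 1 - np.mean(sims[np.triu_indices(len(emb), k=1)])

    # Two identical unit vectors and one orthogonal one:
    print(diversity([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]))  # 1 - 1/3 = 0.666...

Also note that _calculate_silhouette labels each embedding as its own cluster (range(len(embeddings))); scikit-learn's silhouette_score requires the number of labels to be between 2 and n_samples - 1, so that call raises and the except branch returns 0.0 in practice.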

@@ -688,7 +728,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp

     result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]

-    stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
+    stats = ResultAnalyzer.calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
     stats["model"] = f"{model_type} - {model_name}"
     stats["model_type"] = model_type
     stats["model_name"] = model_name

@@ -783,7 +823,7 @@ def automated_testing(file, query, test_params, expected_result=None):
     reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
     results_raw = rerank_results(results_raw, query, reranker)

-    stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
+    stats = ResultAnalyzer.calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
     stats["model"] = f"{params['model_type']} - {params['model_name']}"
     stats["model_type"] = params['model_type']
     stats["model_name"] = params['model_name']

@@ -989,28 +1029,55 @@ def launch_interface(share=True):
         search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
         lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")

-        with gr.Tab("
-            apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=True)
+        with gr.Tab("Expert"):
+            apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
             optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
-            apply_phonetic_input = gr.Checkbox(label="Apply Phonetic Matching", value=True)
+            apply_phonetic_input = gr.Checkbox(label="Apply Phonetic Matching", value=False)
             phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
             custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
             custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
             custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
             custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
             use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
-            query_optimization_model_input = gr.Textbox(label="Query Optimization Model
+            query_optimization_model_input = gr.Textbox(label="Query Optimization Model (google/flan-t5-base) ", value="")
             use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)

         with gr.Tab("Automation"):
-
-
-
-
-
-
-            )
-
+
+
+            with gr.Row():
+                auto_file_input = gr.File(label="Upload File (Optional)")
+                auto_query_input = gr.Textbox(label="Search Query")
+
+            with gr.Row():
+                auto_expected_result_input = gr.Textbox(
+                    label="Expected Result (Optional)",
+                    placeholder="Enter expected text if you want to evaluate accuracy"
+                )
+                model_feedback_input = gr.Textbox(
+                    label="Model Feedback (Optional)",
+                    placeholder="Enter any feedback about model performance"
+                )
+
+            with gr.Row():
+                with gr.Column():
+                    # Default model selection
+                    default_models_input = gr.CheckboxGroup(
+                        choices=[f"{type}:{name}"
+                                 for type, names in DEFAULT_MODELS.items()
+                                 for name in names],
+                        label="Default Models",
+                        value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
+                    )
+
+                with gr.Column():
+                    # Custom model input
+                    custom_models_input = gr.TextArea(
+                        label="Custom Models (Optional)",
+                        placeholder="Enter one model per line in format: type:name",
+                        lines=3
+                    )
+
             auto_split_strategies = gr.CheckboxGroup(
                 choices=["token", "recursive"],
                 label="Split Strategies to Test"

@@ -1030,6 +1097,36 @@ def launch_interface(share=True):
             auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
             auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)

+
+            auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
+            auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
+            recommendations_output = gr.JSON(label="Recommendations")
+
+            auto_submit_button = gr.Button("Run Automated Tests")
+            auto_submit_button.click(
+                fn=lambda *args: run_automated_tests(*args),
+                inputs=[
+                    auto_file_input, auto_query_input, auto_expected_result_input, auto_model_types, auto_model_names,
+                    auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
+                    auto_vector_store_types, auto_search_types, auto_top_k,
+                    auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking
+                ],
+                outputs=[auto_results_output, auto_stats_output, recommendations_output]
+            )
+            ###
+
+        with gr.Tab("Results"):
+            with gr.Row():
+                results_output = gr.DataFrame(label="Results")
+                stats_output = gr.DataFrame(label="Statistics")
+
+            with gr.Row():
+                plot_output = gr.Plot(label="Visualizations")
+                model_rankings_output = gr.JSON(label="Model Rankings")
+
+            with gr.Row():
+                recommendations_output = gr.JSON(label="Recommendations")
+
         with gr.Tab("LLM Suggestions"):
             llm_file_input = gr.File(label="Upload File for LLM Suggestions")
             llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
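
Reviewer note: recommendations_output is now assigned twice, once inside the Automation tab (where it is wired into auto_submit_button.click) and again inside the new Results tab; the second assignment shadows the first, so the two JSON panels cannot be updated independently. The click handler also still passes the old input list (including auto_model_types and auto_model_names), while the new run_automated_tests defined at the bottom of this diff expects (file_path, query, model_configs, test_params, expected_result, model_feedback), so the wiring and the signature appear to need reconciling.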

@@ -1072,22 +1169,6 @@ def launch_interface(share=True):
             outputs=[results_output, stats_output, plot_output, best_settings_output]
         )

-        auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
-        auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
-        recommendations_output = gr.JSON(label="Recommendations")
-
-        auto_submit_button = gr.Button("Run Automated Tests")
-        auto_submit_button.click(
-            fn=lambda *args: run_automated_tests_and_analyze(*args),
-            inputs=[
-                auto_file_input, auto_query_input, auto_expected_result_input, auto_model_types, auto_model_names,
-                auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
-                auto_vector_store_types, auto_search_types, auto_top_k,
-                auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking
-            ],
-            outputs=[auto_results_output, auto_stats_output, recommendations_output]
-        )
-        ###


     use_case_md = """

@@ -1491,33 +1572,132 @@ if __name__ == "__main__":

     iface.launch(share=share)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-'
-'
-
-
-
-
-
-
+# Enhanced Automated Testing
+def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
+                        test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
+                        model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Enhanced automated testing function with support for custom models and feedback
+    """
+    all_results = []
+    all_stats = []
+    model_manager = ModelManager()
+
+    # Create parameter grid excluding model configurations
+    base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
+    param_grid = ParameterGrid(base_params)
+
+    # Test each model configuration with all parameter combinations
+    for model_config in tqdm(model_configs, desc="Testing models"):
+        model_type = model_config['type']
+        model_name = model_config['name']
+
+        for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
+            try:
+                # Process files and get chunks
+                chunks, embedding_model, num_tokens = process_files(
+                    file_path,
+                    model_type,
+                    model_name,
+                    params['split_strategy'],
+                    params['chunk_size'],
+                    params['overlap_size'],
+                    params.get('custom_separators'),
+                    params['lang'],
+                    params['apply_preprocessing']
+                )
+
+                # Apply vocabulary optimization if specified
+                if params['optimize_vocab']:
+                    tokenizer, chunks = optimize_vocabulary(chunks)
+
+                # Apply query optimization if specified
+                current_query = query
+                if params['use_query_optimization']:
+                    optimized_queries = optimize_query(
+                        query,
+                        params['query_optimization_model'],
+                        chunks,
+                        embedding_model,
+                        params['vector_store_type'],
+                        params['search_type'],
+                        params['top_k']
+                    )
+                    current_query = " ".join(optimized_queries)
+
+                # Perform search
+                results, search_time, vector_store, raw_results = search_embeddings(
+                    chunks,
+                    embedding_model,
+                    params['vector_store_type'],
+                    params['search_type'],
+                    current_query,
+                    params['top_k'],
+                    expected_result,
+                    params['lang'],
+                    params['apply_phonetic'],
+                    params['phonetic_weight']
+                )
+
+                # Apply reranking if specified
+                if params['use_reranking']:
+                    reranker = pipeline("text-classification",
+                                        model="cross-encoder/ms-marco-MiniLM-L-12-v2")
+                    raw_results = rerank_results(raw_results, current_query, reranker)
+
+                # Calculate statistics
+                stats = ResultAnalyzer.calculate_statistics(
+                    raw_results, search_time, vector_store, num_tokens,
+                    embedding_model, current_query, params['top_k'],
+                    expected_result, model_feedback
+                )
+
+                # Update model rankings
+                model_id = f"{model_type}:{model_name}"
+                ranking_score = calculate_model_ranking_score(stats)
+                model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
+
+                # Add model information to stats
+                stats.update({
+                    "model_type": model_type,
+                    "model_name": model_name,
+                    "model": f"{model_type} - {model_name}",
+                    **params
+                })
+
+                # Format and store results
+                all_results.extend(format_results(raw_results, stats))
+                all_stats.append(stats)
+
+            except Exception as e:
+                print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
+                continue
+
+    return pd.DataFrame(all_results), pd.DataFrame(all_stats)

-
+# Helper function to calculate model ranking score
+def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
+    """Calculate a composite score for model ranking"""
+    weights = {
+        'search_time': -0.2,  # Negative weight because lower is better
+        'result_diversity': 0.2,
+        'rank_correlation': 0.3,
+        'contains_expected': 0.3,
+        'expected_result_rank': -0.2  # Negative weight because lower rank is better
+    }
+
+    score = 0.0
+    for metric, weight in weights.items():
+        if metric in stats and not isinstance(stats[metric], str):
+            if metric == 'contains_expected':
+                value = float(stats[metric])
+            elif metric == 'expected_result_rank':
+                value = 1.0 / max(stats[metric], 1)  # Convert rank to score (higher is better)
+            else:
+                value = float(stats[metric])
+            score += weight * value
+
+    return score

 if __name__ == "__main__":
     launch_interface()
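
Reviewer note: calculate_model_ranking_score is a weighted sum over whichever metrics are present and numeric. A worked example with a hypothetical stats dict shaped like the output of ResultAnalyzer.calculate_statistics:

    stats = {
        "search_time": 0.5,         # -0.2 * 0.5 = -0.10
        "result_diversity": 0.6,    #  0.2 * 0.6 =  0.12
        "rank_correlation": 0.8,    #  0.3 * 0.8 =  0.24
        "contains_expected": True,  #  0.3 * 1.0 =  0.30
        "expected_result_rank": 2,  # -0.2 * (1 / 2) = -0.10
    }
    print(calculate_model_ranking_score(stats))
    # -0.10 + 0.12 + 0.24 + 0.30 - 0.10 = 0.46 (up to float rounding)

One caveat: expected_result_rank is first inverted to 1/rank (higher is better) and then multiplied by a negative weight, so finding the expected result at rank 1 currently lowers the score more than finding it at rank 10; either the inversion or the negative weight looks unintended.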