Build error

Update app.py

app.py CHANGED
@@ -30,6 +30,9 @@ from sklearn.manifold import TSNE
 from sklearn.metrics import silhouette_score
 from scipy.stats import spearmanr
 from functools import lru_cache
+from langchain.retrievers import MultiQueryRetriever
+from langchain.llms import HuggingFacePipeline
+from transformers import pipeline
 
 # NLTK Resource Download
 def download_nltk_resources():
@@ -141,13 +144,11 @@ def preprocess_text(text, lang='german'):
 def phonetic_match(text, query, method='levenshtein_distance'):
     if method == 'levenshtein_distance':
         text_phonetic = jellyfish.soundex(text)
-        #query_phonetic = jellyfish.cologne_phonetic(query)
         query_phonetic = jellyfish.soundex(query)
         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
 
 def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
-    # Tokenize the texts
     tokenized_texts = [text.split() for text in texts]
 
     if model_type == 'word2vec':
@@ -169,7 +170,6 @@ class CustomEmbeddings(HuggingFaceEmbeddings):
     def embed_query(self, text):
         return self.model.wv[text.split()]
 
-
 # Custom Tokenizer
 def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
     with open(file_path, 'r', encoding='utf-8') as f:
@@ -191,6 +191,7 @@ def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000,
     tokenizer.train_from_iterator([text], trainer)
 
     return tokenizer
+
 def custom_tokenize(text, tokenizer):
     return tokenizer.encode(text).tokens
 
@@ -220,15 +221,16 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
         raise ValueError(f"Unsupported split strategy: {split_strategy}")
 
 def get_vector_store(vector_store_type, chunks, embedding_model):
-    # Convert chunks to a tuple to make it hashable
     chunks_tuple = tuple(chunks)
-
-    # Use a helper function for the actual vector store creation
     return _create_vector_store(vector_store_type, chunks_tuple, embedding_model)
 
-
+def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_weight=0.3):
+    embedding_sim = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
+    phonetic_sim = phonetic_match(doc_text, query)
+    combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
+    return combined_sim
+
 def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
-    # Convert the tuple back to a list for use with the vector store
     chunks = list(chunks_tuple)
 
     if vector_store_type == 'FAISS':
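
Reviewer note: phonetic_match (above) returns a Levenshtein distance between Soundex codes, where 0 means identical, while embedding_sim is a cosine similarity, where 1 means identical. The weighted sum in the new custom_similarity therefore mixes opposite scales and rewards phonetically dissimilar text. A minimal sketch of one way to make the two terms point the same way; the normalization by Soundex code length is an illustrative choice, not part of this commit:

import numpy as np
import jellyfish

def phonetic_similarity(text, query):
    # Map the Soundex edit distance into a similarity in [0, 1],
    # normalizing by the longer of the two Soundex codes.
    t, q = jellyfish.soundex(text), jellyfish.soundex(query)
    dist = jellyfish.levenshtein_distance(t, q)
    return 1.0 - dist / max(len(t), len(q), 1)

def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_weight=0.3):
    # Cosine similarity between the two embedding vectors.
    embedding_sim = np.dot(query_embedding, doc_embedding) / (
        np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding)
    )
    # Both terms now increase with similarity, so the blend is consistent.
    return (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_similarity(doc_text, query)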
@@ -238,15 +240,13 @@ def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
     else:
         raise ValueError(f"Unsupported vector store type: {vector_store_type}")
 
-
 def get_retriever(vector_store, search_type, search_kwargs):
     if search_type == 'similarity':
         return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
     elif search_type == 'mmr':
         return vector_store.as_retriever(search_type="mmr", search_kwargs=search_kwargs)
     elif search_type == 'custom':
-
-        pass
+        return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
     else:
         raise ValueError(f"Unsupported search type: {search_type}")
 
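
Reviewer note: with this change the 'custom' search type silently behaves like plain similarity search, so custom_similarity is defined but never called. A hypothetical helper (not in this commit) showing how it could be wired in as a re-scoring pass over an over-fetched candidate pool; np and custom_similarity are assumed from the file above:

def custom_search(vector_store, embedding_model, query, top_k=5, phonetic_weight=0.3):
    # Over-fetch candidates with plain similarity search, then re-score
    # them with the combined embedding + phonetic score defined above.
    candidates = vector_store.similarity_search(query, k=top_k * 3)
    query_emb = np.array(embedding_model.embed_query(query))
    doc_embs = embedding_model.embed_documents([d.page_content for d in candidates])
    scored = sorted(
        zip(candidates, doc_embs),
        key=lambda pair: custom_similarity(
            query_emb, np.array(pair[1]), query, pair[0].page_content, phonetic_weight
        ),
        reverse=True,
    )
    return [doc for doc, _ in scored[:top_k]]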
@@ -290,15 +290,13 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     results = sorted(results, key=score_result, reverse=True)
     end_time = time.time()
 
-    # Check if embeddings are available
     embeddings = []
     for doc in results:
         if hasattr(doc, 'embedding'):
-            embeddings.append(doc.embedding)
+            embeddings.append(doc.embedding)
         else:
-            embeddings.append(None)
+            embeddings.append(None)
 
-    # Create a DataFrame with the results and embeddings
     results_df = pd.DataFrame({
         'content': [doc.page_content for doc in results],
         'embedding': embeddings
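
Reviewer note: LangChain Document objects carry no embedding attribute by default, so the hasattr(doc, 'embedding') branch above almost always appends None, and the later doc.metadata.get('embedding', None) lookup also comes back empty unless embeddings were stored at indexing time. A sketch of one way to make both lookups succeed; a hypothetical helper, assuming the classic langchain.docstore.document.Document import matching the LangChain era of this codebase:

from langchain.docstore.document import Document

def docs_with_embeddings(chunks, embedding_model):
    # Precompute chunk embeddings and stash them in metadata so that
    # doc.metadata.get('embedding') finds them after retrieval.
    vectors = embedding_model.embed_documents(chunks)
    return [Document(page_content=chunk, metadata={"embedding": vec})
            for chunk, vec in zip(chunks, vectors)]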
@@ -307,13 +305,12 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     return results_df, end_time - start_time, vector_store, results
 
 # Evaluation Metrics
+# ... (previous code remains the same)
+
 def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
     stats = {
         "num_results": len(results),
-        # "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results) if results else 0,
         "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
-
-        #"avg_content_length": np.mean([len(doc.page_content) for doc in results]) if not results.empty else 0,
         "search_time": search_time,
         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
         "num_documents": len(vector_store.docstore._dict),
@@ -328,10 +325,7 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
     pairwise_similarities = np.inner(embeddings, embeddings)
     stats["result_diversity"] = 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
 
-    # Silhouette Score
     if len(embeddings) > 2:
-        print('-----')
-        #stats["silhouette_score"] = "N/A"
         stats["silhouette_score"] = silhouette_score(embeddings, range(len(embeddings)))
     else:
         stats["silhouette_score"] = "N/A"
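
Reviewer note: silhouette_score(embeddings, range(len(embeddings))) gives every result its own cluster label, and scikit-learn requires 2 <= n_labels <= n_samples - 1, so this line raises a ValueError at query time instead of producing a score. A sketch of a variant that derives labels from a clustering first; the choice of k-means with two clusters is illustrative:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def result_silhouette(embeddings, n_clusters=2):
    # silhouette_score requires 2 <= n_labels <= n_samples - 1, so the
    # labels must come from a clustering, not from range(len(embeddings)).
    if len(embeddings) <= n_clusters:
        return None
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(embeddings)
    return silhouette_score(embeddings, labels)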
@@ -378,24 +372,34 @@ def visualize_results(results_df, stats_df):
 def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
 
-    # Count word frequencies
     word_freq = Counter(word for text in texts for word in text.split())
 
-    # Remove rare words
     optimized_texts = [
         ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
         for text in texts
     ]
 
-    # Train BPE tokenizer
-    # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
     tokenizer.train_from_iterator(optimized_texts, trainer)
 
     return tokenizer, optimized_texts
 
+# New preprocessing function
+def optimize_query(query, llm):
+    multi_query_retriever = MultiQueryRetriever.from_llm(
+        retriever=get_retriever(vector_store, search_type, search_kwargs),
+        llm=llm
+    )
+    optimized_queries = multi_query_retriever.generate_queries(query)
+    return optimized_queries
+
+# New postprocessing function
+def rerank_results(results, query, reranker):
+    reranked_results = reranker.rerank(query, [doc.page_content for doc in results])
+    return reranked_results
+
 # Main Comparison Function
-def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, use_reranking=False):
     all_results = []
     all_stats = []
     settings = {
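
Reviewer note: optimize_query references vector_store, search_type, and search_kwargs, none of which exist in its scope (and no vector store exists yet at its call site below), so calling it raises a NameError; whether MultiQueryRetriever exposes generate_queries with this one-argument signature also depends on the installed LangChain version. Since only the generated query strings are used later, a hypothetical retriever-free sketch; the prompt wording is illustrative:

def generate_query_variants(query, llm, n=3):
    # Ask the LLM for paraphrases directly; classic LangChain LLM objects
    # can be called on a prompt string and return the completion text.
    prompt = (f"Write {n} alternative search queries with the same meaning as: "
              f"{query}\nReturn one query per line.")
    lines = [line.strip() for line in llm(prompt).split("\n")]
    return [query] + [line for line in lines if line][:n]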
@@ -408,16 +412,16 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
         "top_k": top_k,
         "lang": lang,
         "optimize_vocab": optimize_vocab,
-        "phonetic_weight": phonetic_weight
+        "phonetic_weight": phonetic_weight,
+        "use_query_optimization": use_query_optimization,
+        "use_reranking": use_reranking
     }
 
-    # Parse embedding models
     models = [model.strip().split(':') for model in embedding_models.split(',')]
     if custom_embedding_model:
         models.append(custom_embedding_model.strip().split(':'))
 
     for model_type, model_name in models:
-        # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             model_type,
@@ -433,17 +437,19 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             custom_tokenizer_special_tokens.split(',') if custom_tokenizer_special_tokens else None
         )
 
-        # Custom embedding handling
-        #if use_custom_embedding:
-        #    custom_model = create_custom_embedding(chunks) #add custom model by name, must com from gradio FE
-        #    embedding_model = CustomEmbeddings(custom_model)
-
-        # Optimizing vocabulary if required
         if optimize_vocab:
             tokenizer, optimized_chunks = optimize_vocabulary(chunks)
             chunks = optimized_chunks
 
-
+        if use_query_optimization:
+            llm = HuggingFacePipeline.from_model_id(
+                model_id="google/flan-t5-base",
+                task="text2text-generation",
+                model_kwargs={"temperature": 0, "max_length": 64},
+            )
+            optimized_queries = optimize_query(query, llm)
+            query = " ".join(optimized_queries)
+
         results, search_time, vector_store, results_raw = search_embeddings(
             chunks,
             embedding_model,
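
Reviewer note: query = " ".join(optimized_queries) concatenates all variants into a single string, which is then embedded as one long pseudo-query. A hypothetical alternative (not in this commit) is to search each variant separately and deduplicate the merged hits:

def search_variants(variants, run_search, top_k):
    # run_search is assumed to be a closure over search_embeddings that
    # takes a query string and returns a list of Documents.
    merged, seen = [], set()
    for variant in variants:
        for doc in run_search(variant):
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                merged.append(doc)
    return merged[:top_k]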
@@ -455,32 +461,26 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             phonetic_weight
         )
 
-
-
-
+        if use_reranking:
+            reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
+            results_raw = rerank_results(results_raw, query, reranker)
 
-
-
-        result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw] # Adjust this based on the actual attribute names
-        # result_embeddings = [doc['embedding'] for doc in results_raw] # Assuming each result has an embedding
+        result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
         stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
         stats["model"] = f"{model_type} - {model_name}"
         stats.update(settings)
 
-        # Formatting results and attaching embeddings
         formatted_results = format_results(results_raw, stats)
         for i, result in enumerate(formatted_results):
-            result['embedding'] = result_embeddings[i]
+            result['embedding'] = result_embeddings[i]
 
         all_results.extend(formatted_results)
         all_stats.append(stats)
 
-    # Create DataFrames with embeddings now included
     results_df = pd.DataFrame(all_results)
     stats_df = pd.DataFrame(all_stats)
 
-    # Visualization of the results
     fig = visualize_results(results_df, stats_df)
 
     return results_df, stats_df, fig
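
Reviewer note: a transformers pipeline("text-classification", ...) object is callable but has no .rerank method, so rerank_results raises an AttributeError here; and even if it returned ranked strings, the very next line expects Document objects with .metadata. A sketch of a cross-encoder reranker that keeps the Documents intact, assuming sentence-transformers is available:

from sentence_transformers import CrossEncoder

def rerank_documents(results, query, model_name="cross-encoder/ms-marco-MiniLM-L-12-v2"):
    # Score (query, passage) pairs with a cross-encoder and keep the
    # original Document objects so later metadata access still works.
    reranker = CrossEncoder(model_name)
    scores = reranker.predict([(query, doc.page_content) for doc in results])
    ranked = sorted(zip(scores, results), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked]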
@@ -500,36 +500,52 @@ def format_results(results, stats):
 
 # Gradio Interface
 def launch_interface(share=True):
-
-
-
-
-    gr.
-    gr.Textbox(label="
-    gr.Textbox(label="
-    gr.
-
-
-    gr.Textbox(label="Custom
-    gr.Radio(choices=["
-    gr.
-    gr.Slider(
-    gr.
-    gr.
-    gr.
-    gr.
-
-
-    gr.
-
-
-    gr.
-    gr.
-    gr.
-
-
-
-
+    with gr.Blocks() as iface:
+        gr.Markdown("# Advanced Embedding Comparison Tool")
+
+        with gr.Tab("Simple"):
+            file_input = gr.File(label="Upload File (Optional)")
+            query_input = gr.Textbox(label="Search Query")
+            embedding_models_input = gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)")
+            top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
+
+        with gr.Tab("Advanced"):
+            custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
+            split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
+            chunk_size_input = gr.Slider(100, 1000, step=100, value=500, label="Chunk Size")
+            overlap_size_input = gr.Slider(0, 100, step=10, value=50, label="Overlap Size")
+            custom_separators_input = gr.Textbox(label="Custom Split Separators (comma-separated, optional)")
+            vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
+            search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
+            lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
+
+        with gr.Tab("Optional"):
+            optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
+            phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
+            custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
+            custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
+            custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
+            custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
+            use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
+            use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
+
+        results_output = gr.Dataframe(label="Results", interactive=False)
+        stats_output = gr.Dataframe(label="Statistics", interactive=False)
+        plot_output = gr.Plot(label="Visualizations")
+
+        submit_button = gr.Button("Compare Embeddings")
+        submit_button.click(
+            fn=compare_embeddings,
+            inputs=[
+                file_input, query_input, embedding_models_input, custom_embedding_model_input,
+                split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
+                vector_store_type_input, search_type_input, top_k_input, lang_input,
+                optimize_vocab_input, phonetic_weight_input, custom_tokenizer_file_input,
+                custom_tokenizer_model_input, custom_tokenizer_vocab_size_input,
+                custom_tokenizer_special_tokens_input, use_query_optimization_input, use_reranking_input
+            ],
+            outputs=[results_output, stats_output, plot_output]
+        )
 
     tutorial_md = """
 # Advanced Embedding Comparison Tool Tutorial
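
Reviewer note: the removed lines above are truncated in the page rendering, so only their component types survive. Separately, custom_tokenizer_vocab_size_input is a gr.Textbox with a string default, so compare_embeddings receives "10000" where an integer vocab size appears to be expected downstream. A numeric component would sidestep the string-to-int conversion; a hypothetical tweak, not in this commit:

custom_tokenizer_vocab_size_input = gr.Number(
    label="Custom Tokenizer Vocab Size", value=10000, precision=0
)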
@@ -541,13 +557,10 @@ def launch_interface(share=True):
 1. Upload a file (optional) or use the default files in the system.
 2. Enter a search query.
 3. Enter embedding models as a comma-separated list (e.g., HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002).
-4.
-5.
-6.
-7.
-8. Choose the language of your documents.
-9. Optionally, optimize vocabulary or adjust phonetic matching weight.
-10. If you have a custom tokenizer, upload the file and specify its attributes.
+4. Set the number of top results to retrieve.
+5. Optionally, specify advanced settings such as custom embedding models, text splitting strategies, and vector store types.
+6. Choose whether to use optional features like vocabulary optimization, query optimization, or result reranking.
+7. If you have a custom tokenizer, upload the file and specify its attributes.
 
 The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
 """
@@ -559,4 +572,5 @@
 
     iface.launch(share=share)
 
-
+if __name__ == "__main__":
+    launch_interface()