Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -171,18 +171,26 @@ class CustomEmbeddings(HuggingFaceEmbeddings):
|
|
| 171 |
|
| 172 |
|
| 173 |
# Custom Tokenizer
|
| 174 |
-
def create_custom_tokenizer(file_path):
|
| 175 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 176 |
text = f.read()
|
| 177 |
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
tokenizer.pre_tokenizer = Whitespace()
|
| 180 |
|
| 181 |
-
|
|
|
|
| 182 |
tokenizer.train_from_iterator([text], trainer)
|
| 183 |
|
| 184 |
return tokenizer
|
| 185 |
-
|
| 186 |
def custom_tokenize(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the resulting tokens.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text)`` whose result has a
            ``tokens`` attribute (e.g. the value returned by
            ``create_custom_tokenizer``).

    Returns:
        The ``tokens`` attribute of the encoding produced for ``text``.
    """
    encoding = tokenizer.encode(text)
    return encoding.tokens
|
| 188 |
|
|
@@ -243,7 +251,7 @@ def get_retriever(vector_store, search_type, search_kwargs):
|
|
| 243 |
raise ValueError(f"Unsupported search type: {search_type}")
|
| 244 |
|
| 245 |
# Main Processing Functions
|
| 246 |
-
def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None):
|
| 247 |
if file_path:
|
| 248 |
text = FileHandler.extract_text(file_path)
|
| 249 |
else:
|
|
@@ -253,7 +261,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
|
|
| 253 |
text += FileHandler.extract_text(file_path)
|
| 254 |
|
| 255 |
if custom_tokenizer_file:
|
| 256 |
-
tokenizer = create_custom_tokenizer(custom_tokenizer_file)
|
| 257 |
text = ' '.join(custom_tokenize(text, tokenizer))
|
| 258 |
else:
|
| 259 |
text = preprocess_text(text, lang)
|
|
@@ -387,7 +395,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
|
|
| 387 |
return tokenizer, optimized_texts
|
| 388 |
|
| 389 |
# Main Comparison Function
|
| 390 |
-
def compare_embeddings(file, query,
|
| 391 |
all_results = []
|
| 392 |
all_stats = []
|
| 393 |
settings = {
|
|
@@ -399,12 +407,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
| 399 |
"search_type": search_type,
|
| 400 |
"top_k": top_k,
|
| 401 |
"lang": lang,
|
| 402 |
-
"use_custom_embedding": use_custom_embedding,
|
| 403 |
"optimize_vocab": optimize_vocab,
|
| 404 |
"phonetic_weight": phonetic_weight
|
| 405 |
}
|
| 406 |
|
| 407 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
# Process the file and generate chunks & embeddings
|
| 409 |
chunks, embedding_model, num_tokens = process_files(
|
| 410 |
file.name if file else None,
|
|
@@ -415,13 +427,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
| 415 |
overlap_size,
|
| 416 |
custom_separators.split(',') if custom_separators else None,
|
| 417 |
lang,
|
| 418 |
-
custom_tokenizer_file
|
|
|
|
|
|
|
|
|
|
| 419 |
)
|
| 420 |
|
| 421 |
# Custom embedding handling
|
| 422 |
-
if use_custom_embedding:
|
| 423 |
-
|
| 424 |
-
|
| 425 |
|
| 426 |
# Optimizing vocabulary if required
|
| 427 |
if optimize_vocab:
|
|
@@ -490,8 +505,8 @@ def launch_interface(share=True):
|
|
| 490 |
inputs=[
|
| 491 |
gr.File(label="Upload File (Optional)"),
|
| 492 |
gr.Textbox(label="Search Query"),
|
| 493 |
-
gr.
|
| 494 |
-
gr.
|
| 495 |
gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
|
| 496 |
gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
|
| 497 |
gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
|
|
@@ -500,10 +515,12 @@ def launch_interface(share=True):
|
|
| 500 |
gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
|
| 501 |
gr.Slider(1, 10, step=1, value=5, label="Top K"),
|
| 502 |
gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
|
| 503 |
-
gr.Checkbox(label="Use Custom Embedding", value=False),
|
| 504 |
gr.Checkbox(label="Optimize Vocabulary", value=False),
|
| 505 |
gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
|
| 506 |
-
gr.File(label="Custom Tokenizer File (Optional)")
|
|
|
|
|
|
|
|
|
|
| 507 |
],
|
| 508 |
outputs=[
|
| 509 |
gr.Dataframe(label="Results", interactive=False),
|
|
@@ -523,13 +540,14 @@ def launch_interface(share=True):
|
|
| 523 |
|
| 524 |
1. Upload a file (optional) or use the default files in the system.
|
| 525 |
2. Enter a search query.
|
| 526 |
-
3.
|
| 527 |
-
4.
|
| 528 |
-
5.
|
| 529 |
-
6.
|
| 530 |
-
7.
|
| 531 |
-
8.
|
| 532 |
-
9.
|
|
|
|
| 533 |
|
| 534 |
The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
|
| 535 |
"""
|
|
@@ -539,7 +557,4 @@ def launch_interface(share=True):
|
|
| 539 |
["Embedding Comparison", "Tutorial"]
|
| 540 |
)
|
| 541 |
|
| 542 |
-
iface.launch(share=share)
|
| 543 |
-
|
| 544 |
-
if __name__ == "__main__":
|
| 545 |
-
launch_interface()
|
|
|
|
| 171 |
|
| 172 |
|
| 173 |
# Custom Tokenizer
|
| 174 |
+
def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
    """Train a tokenizer on the contents of a text file.

    The file is read as UTF-8, split on whitespace by the ``Whitespace``
    pre-tokenizer, and used as the single training document.

    Args:
        file_path: Path of the UTF-8 text file to train on.
        model_type: ``'WordLevel'``, ``'BPE'`` or ``'Unigram'``. ``None`` or
            an empty string falls back to ``'WordLevel'``, because callers
            forward the raw (possibly empty) UI value positionally.
        vocab_size: Target vocabulary size handed to the trainer.
        special_tokens: Special tokens to reserve; defaults to
            ``["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]`` when falsy.

    Returns:
        The trained ``Tokenizer``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # An unset model arrives as None (or "" from a blank textbox); treat both
    # as "use the default" instead of rejecting them as unsupported.
    model_type = model_type or 'WordLevel'
    special_tokens = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Each model has to be paired with its own trainer class; a
    # WordLevelTrainer cannot train a BPE or Unigram model.
    if model_type == 'WordLevel':
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        trainer = trainers.WordLevelTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    elif model_type == 'BPE':
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
        trainer = trainers.BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    elif model_type == 'Unigram':
        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    else:
        raise ValueError(f"Unsupported tokenizer model: {model_type}")

    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator([text], trainer)

    return tokenizer
|
|
|
|
| 194 |
def custom_tokenize(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the resulting tokens.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text)`` whose result has a
            ``tokens`` attribute (e.g. the value returned by
            ``create_custom_tokenizer``).

    Returns:
        The ``tokens`` attribute of the encoding produced for ``text``.
    """
    encoding = tokenizer.encode(text)
    return encoding.tokens
|
| 196 |
|
|
|
|
| 251 |
raise ValueError(f"Unsupported search type: {search_type}")
|
| 252 |
|
| 253 |
# Main Processing Functions
|
| 254 |
+
def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
|
| 255 |
if file_path:
|
| 256 |
text = FileHandler.extract_text(file_path)
|
| 257 |
else:
|
|
|
|
| 261 |
text += FileHandler.extract_text(file_path)
|
| 262 |
|
| 263 |
if custom_tokenizer_file:
|
| 264 |
+
tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
|
| 265 |
text = ' '.join(custom_tokenize(text, tokenizer))
|
| 266 |
else:
|
| 267 |
text = preprocess_text(text, lang)
|
|
|
|
| 395 |
return tokenizer, optimized_texts
|
| 396 |
|
| 397 |
# Main Comparison Function
|
| 398 |
+
def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
|
| 399 |
all_results = []
|
| 400 |
all_stats = []
|
| 401 |
settings = {
|
|
|
|
| 407 |
"search_type": search_type,
|
| 408 |
"top_k": top_k,
|
| 409 |
"lang": lang,
|
|
|
|
| 410 |
"optimize_vocab": optimize_vocab,
|
| 411 |
"phonetic_weight": phonetic_weight
|
| 412 |
}
|
| 413 |
|
| 414 |
+
# Parse embedding models
|
| 415 |
+
models = [model.strip().split(':') for model in embedding_models.split(',')]
|
| 416 |
+
if custom_embedding_model:
|
| 417 |
+
models.append(custom_embedding_model.strip().split(':'))
|
| 418 |
+
|
| 419 |
+
for model_type, model_name in models:
|
| 420 |
# Process the file and generate chunks & embeddings
|
| 421 |
chunks, embedding_model, num_tokens = process_files(
|
| 422 |
file.name if file else None,
|
|
|
|
| 427 |
overlap_size,
|
| 428 |
custom_separators.split(',') if custom_separators else None,
|
| 429 |
lang,
|
| 430 |
+
custom_tokenizer_file,
|
| 431 |
+
custom_tokenizer_model,
|
| 432 |
+
int(custom_tokenizer_vocab_size),
|
| 433 |
+
custom_tokenizer_special_tokens.split(',') if custom_tokenizer_special_tokens else None
|
| 434 |
)
|
| 435 |
|
| 436 |
# Custom embedding handling
|
| 437 |
+
#if use_custom_embedding:
|
| 438 |
+
#     custom_model = create_custom_embedding(chunks)  # TODO: select the custom model by name; the name must come from the Gradio front end
|
| 439 |
+
# embedding_model = CustomEmbeddings(custom_model)
|
| 440 |
|
| 441 |
# Optimizing vocabulary if required
|
| 442 |
if optimize_vocab:
|
|
|
|
| 505 |
inputs=[
|
| 506 |
gr.File(label="Upload File (Optional)"),
|
| 507 |
gr.Textbox(label="Search Query"),
|
| 508 |
+
gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)"),
|
| 509 |
+
gr.Textbox(label="Custom Embedding Model (optional, format: type:name)"),
|
| 510 |
gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
|
| 511 |
gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
|
| 512 |
gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
|
|
|
|
| 515 |
gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
|
| 516 |
gr.Slider(1, 10, step=1, value=5, label="Top K"),
|
| 517 |
gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
|
|
|
|
| 518 |
gr.Checkbox(label="Optimize Vocabulary", value=False),
|
| 519 |
gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
|
| 520 |
+
gr.File(label="Custom Tokenizer File (Optional)"),
|
| 521 |
+
gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)"),
|
| 522 |
+
gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000"),
|
| 523 |
+
gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
|
| 524 |
],
|
| 525 |
outputs=[
|
| 526 |
gr.Dataframe(label="Results", interactive=False),
|
|
|
|
| 540 |
|
| 541 |
1. Upload a file (optional) or use the default files in the system.
|
| 542 |
2. Enter a search query.
|
| 543 |
+
3. Enter embedding models as a comma-separated list (e.g., HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002).
|
| 544 |
+
4. Optionally, specify a custom embedding model in the format type:name.
|
| 545 |
+
5. Choose a text splitting strategy and set chunk size and overlap.
|
| 546 |
+
6. Select a vector store type and search type.
|
| 547 |
+
7. Set the number of top results to retrieve.
|
| 548 |
+
8. Choose the language of your documents.
|
| 549 |
+
9. Optionally, optimize vocabulary or adjust phonetic matching weight.
|
| 550 |
+
10. If you have a custom tokenizer, upload the file and specify its attributes.
|
| 551 |
|
| 552 |
The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
|
| 553 |
"""
|
|
|
|
| 557 |
["Embedding Comparison", "Tutorial"]
|
| 558 |
)
|
| 559 |
|
| 560 |
+
iface.launch(share=share)
|
|
|
|
|
|
|
|
|