Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -20,15 +20,16 @@ def embed_sparse(text: str):
|
|
| 20 |
if not text.strip():
|
| 21 |
return {"error": "Input text is empty."}
|
| 22 |
tokens = text.split()
|
|
|
|
| 23 |
bm25 = BM25Okapi([tokens])
|
| 24 |
-
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
indices = list(range(len(
|
| 30 |
-
values = [term_weights[term] for term in
|
| 31 |
-
return {"indices": indices, "values": values, "terms":
|
| 32 |
|
| 33 |
# 3. Late-interaction embedding model (ColBERT)
|
| 34 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|
|
|
|
| 20 |
if not text.strip():
|
| 21 |
return {"error": "Input text is empty."}
|
| 22 |
tokens = text.split()
|
| 23 |
+
# Treat the input as a single document and also as the query
|
| 24 |
bm25 = BM25Okapi([tokens])
|
| 25 |
+
unique_terms = sorted(set(tokens))
|
| 26 |
+
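# BM25 scores documents against a query, so the unique terms are passed as the query.
# NOTE(review): if this is rank_bm25's BM25Okapi, get_scores returns one score per document (one here), not one per query term - confirm before zipping with unique_terms.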
|
| 27 |
+
scores = bm25.get_scores(unique_terms)
|
| 28 |
+
term_weights = {term: float(score) for term, score in zip(unique_terms, scores)}
|
| 29 |
+
# Build the Qdrant-style sparse output: parallel "indices" and "values" lists, plus the terms themselves
|
| 30 |
+
indices = list(range(len(unique_terms)))
|
| 31 |
+
values = [term_weights[term] for term in unique_terms]
|
| 32 |
+
return {"indices": indices, "values": values, "terms": unique_terms}
|
| 33 |
|
| 34 |
# 3. Late-interaction embedding model (ColBERT)
|
| 35 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|