Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import torch | |
| import sentence_transformers as sent | |
| import datasets as ds | |
# Load the "20220301.simple" configuration of the "wikipedia" dataset and keep
# its train split. The `corpus_id` values produced by the search further down
# are used as positions into both `t` (full rows) and `titles`.
d = ds.load_dataset(path="wikipedia", name="20220301.simple")
t = d["train"]
# The whole title column, fetched once up front so each hit is a plain lookup.
titles = t["title"]
@st.cache_resource
def load_model():
    """Return the multilingual sentence encoder used to embed search queries.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching the model would be rebuilt for every query typed by the
    user. ``st.cache_resource`` keeps one shared instance alive across reruns
    and sessions.

    NOTE(review): ``st.cache_resource`` needs Streamlit >= 1.18 -- confirm the
    version pinned for this app.

    Returns:
        A ``sentence_transformers.SentenceTransformer`` loaded from the
        ``distiluse-base-multilingual-cased-v1`` checkpoint.
    """
    # An earlier experiment used "all-MiniLM-L6-v2" here. Presumably the stored
    # embeddings were produced with the checkpoint below, so the two must be
    # changed together -- TODO confirm.
    return sent.SentenceTransformer("distiluse-base-multilingual-cased-v1")
@st.cache_resource
def load_wikipedia_embeddings():
    """Return the precomputed corpus embeddings stored in ``titles-simple-0.pt``.

    The file is deserialized onto the CPU regardless of the device it was
    saved from. The result is cached with ``st.cache_resource`` so the file is
    read once instead of on every Streamlit rerun.

    NOTE(review): ``st.cache_resource`` needs Streamlit >= 1.18 -- confirm the
    version pinned for this app.

    Returns:
        Whatever object was saved in ``titles-simple-0.pt`` (presumably a
        tensor with one embedding per dataset title -- TODO confirm).
    """
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code. That is acceptable only because the file is shipped with the app;
    # never point this at a user-supplied path.
    return torch.load("titles-simple-0.pt", map_location=torch.device("cpu"))
# Page header and introductory help text.
st.title("Multilingual Semantic Search for Wikipedia Simple English")

# The blank lines around each bullet list are significant: without them
# Markdown treats the following paragraph as a lazy continuation of the last
# bullet and renders it inside the list.
st.markdown("""
Use semantic search to find related articles in Wikipedia Simple English: using a language model (sentence-transformers/distiluse-base-multilingual-cased-v1) we can find the closest titles from Wikipedia Simple English (wikipedia) queried in any of the model's trained languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish:

- colesterol
- développement humain
- Crise dos mísseis de Cuba

Also, "near natural language" queries are usually enough to bring up relevant results. Try:

- ¿cuál es el edificio más alto del mundo?
- comment préparer du poulet frit
- melhores películas de pixar

(note: search is done only on the article titles, not the content)
""")
# Encoder for the user's query and the precomputed corpus embeddings it is
# compared against.
model = load_model()
embeddings = load_wikipedia_embeddings()

query = st.text_input("Query (es, fr, pt, ...)")

# Skip the search for empty or whitespace-only input; the script is re-run on
# every change to the text box, including the initial empty state.
if query.strip():
    queries = [query]
    queries_emb = model.encode(queries, convert_to_tensor=True)
    # One list of hits per query; each hit carries the matched row position
    # under "corpus_id".
    hits = sent.util.semantic_search(queries_emb, embeddings, top_k=5)

    for q, query_hits in zip(queries, hits):
        # Explicit st.write calls are used instead of bare expressions so the
        # output does not depend on Streamlit's "magic" feature being enabled.
        st.write(f"----\n{q}:\n")
        for hit in query_hits:
            cid = hit['corpus_id']
            title = titles[cid]
            # Fetch the dataset row once and reuse it for both fields.
            row = t[cid]
            url = row['url']
            full_text = row['text']
            # Show a 500-character preview; add the ellipsis only when the
            # article text was actually cut.
            text = full_text[:500]
            if len(full_text) > 500:
                text += "..."
            st.header(f"{title}")
            st.write(url)
            st.write(text)
            st.write(hit)