Spaces:
Sleeping
Sleeping
| import http.client as http_client | |
| import json | |
| import logging | |
| import os | |
| import pprint | |
| import re | |
| import time | |
| import string | |
| import streamlit as st | |
| import streamlit.components.v1 as components | |
| from typing import Callable, Optional, Tuple, Union | |
| from pyserini import util | |
| from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder | |
# Version label for this demo script.
VERSION = '1.0'
# Browser tab title and wide page layout.
# NOTE(review): Streamlit expects set_page_config to run before other st.* calls - keep it first.
st.set_page_config(page_title="Miracl Search - Finnish", layout="wide")
# Write a Streamlit config file under the current working directory that
# selects the light base theme. The file is (re)written every time this
# script runs because it is opened in "w" mode at module level.
os.makedirs(os.path.join(os.getcwd(),".streamlit"), exist_ok = True)
with open(os.path.join(os.getcwd(),".streamlit/config.toml"), "w") as file:
    file.write(
        '[theme]\nbase="light"'
    )
# Type alias covering both searcher classes imported from pyserini.
Searcher = Union[FaissSearcher, LuceneSearcher]
# Maps the language display name to the code passed to the searcher loader.
LANG_MAPPING = {'Finnish':'fi'}
# Sidebar header: centered bold title plus a short description of MIRACL,
# rendered as raw HTML (unsafe_allow_html=True).
# NOTE(review): the "πππ" line looks like mis-encoded emoji - confirm the
# intended characters against the original file.
st.sidebar.markdown(
    """
<style>
.aligncenter {
text-align: center;
font-weight: bold;
font-size: 30px;
}
</style>
<p class="aligncenter">MIRACL Finnish Demo</p>
<p class="aligncenter">πππ</p>
<p style="text-align: center;"> MIRACL is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.</p>
""",
    unsafe_allow_html=True,
)
# Sidebar links to the project's GitHub organisation and the arXiv paper.
# The paragraph below is centered via its inline style attribute; the
# .aligncenter rule declared here is not referenced by this snippet's markup.
st.sidebar.markdown(
    """
<style>
.aligncenter {
text-align: center;
}
</style>
<p style='text-align: center'>
<a href="https://github.com/project-miracl" >GitHub</a> | <a href="https://arxiv.org/abs/2210.09984" >Paper</a>
</p>
""",
    unsafe_allow_html=True,
)
# Free-text query box in the sidebar; empty by default.
query = st.sidebar.text_input(label='Search query', value='')
# The demo is fixed to a single language; this must be a key of LANG_MAPPING.
language = 'Finnish'
# Upper bound on the number of documents requested from the searcher
# (1-1000, default 10).
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=1000,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)
def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Searcher:
    """Build a Lucene (BM25) searcher for the MIRACL index of *language*.

    Args:
        language: Language code embedded in the index name
            ``lucene-index.miracl-v1.0-{language}.20221004.2b2856`` and
            passed to ``searcher.set_language``.
        k1: Optional BM25 ``k1`` parameter.
        b: Optional BM25 ``b`` parameter.

    Returns:
        The configured ``LuceneSearcher``.

    BM25 parameters are overridden only when BOTH ``k1`` and ``b`` are
    given; if just one is supplied it is ignored and the searcher keeps
    whatever parameters it was constructed with.
    """
    searcher = LuceneSearcher(f'lucene-index.miracl-v1.0-{language}.20221004.2b2856')
    searcher.set_language(language)
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
    return searcher
def search(query, language, num_results=10):
    """Run *query* against the sparse index for *language*.

    Args:
        query: Query string handed to ``searcher.search``.
        language: Display name of the language; must be a key of
            ``LANG_MAPPING``. It is also echoed back under ``"lang"``.
        num_results: Maximum number of hits to request (``k``).

    Returns:
        A ``(results_dict, search_time)`` tuple. ``results_dict`` holds
        parallel lists under ``"docs"``, ``"doc_ids"`` and ``"score"`` plus
        the ``"lang"`` value; ``search_time`` is the elapsed time in seconds
        of the ``searcher.search`` call only (searcher loading is excluded).

    Raises:
        KeyError: If *language* is not in ``LANG_MAPPING``.
    """
    searcher = _load_sparse_searcher(language=LANG_MAPPING[language])

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    t_0 = time.perf_counter()
    search_results = searcher.search(query, k=num_results)
    search_time = time.perf_counter() - t_0

    results_dict = {"docs": [], "doc_ids": [], "score": [], "lang": language}
    for hit in search_results:
        # Each hit's raw payload is parsed as JSON; "text" and "docid" are read from it.
        doc = json.loads(hit.raw)
        results_dict["docs"].append(doc["text"])
        results_dict["doc_ids"].append(doc["docid"])
        results_dict["score"].append(hit.score)
    return results_dict, search_time
def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Wrap whole-word, case-insensitive occurrences of the terms in ``<b>`` tags.

    Args:
        paragraph: Text to annotate.
        highlight_terms: Literal terms to highlight. Empty terms are ignored.

    Returns:
        *paragraph* with every match wrapped as ``<b>match</b>``. The matched
        text keeps its original casing.

    All terms are matched in a single pass, so a later term can never match
    inside the ``<b>`` markup inserted for an earlier one. Longer terms take
    precedence over shorter ones starting at the same position.
    """
    # Drop empty terms (they would match at every boundary) and duplicates,
    # then try longer alternatives first.
    terms = sorted(
        dict.fromkeys(term for term in highlight_terms if term),
        key=len,
        reverse=True,
    )
    if not terms:
        return paragraph

    # Terms are escaped so regex metacharacters are treated literally.
    # Lookarounds act like \b for ordinary words but also work for terms
    # that start or end with a non-word character (e.g. "c++").
    alternation = "|".join(re.escape(term) for term in terms)
    pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)", flags=re.IGNORECASE)
    return pattern.sub(r"<b>\g<0></b>", paragraph)
def process_results(hits: dict, highlight_terms: list) -> str:
    """Render search hits as a string of HTML result cards.

    Args:
        hits: Mapping with parallel lists under ``'doc_ids'``, ``'score'``
            and ``'docs'``, plus a ``'lang'`` value shown on every card.
        highlight_terms: Terms forwarded to ``highlight_string`` for each
            document text.

    Returns:
        One ``<div class='searchresult'>`` card per hit (numbered from 1),
        each followed by ``<hr>``, joined with single spaces. Returns an
        empty string when there are no hits.
    """
    # NOTE(review): document ids and text are interpolated without HTML
    # escaping - confirm the indexed corpus is plain, trusted text.
    hit_list = []
    rows = zip(hits['doc_ids'], hits['score'], hits['docs'])
    for rank, (doc_id, score, text) in enumerate(rows, start=1):
        hit_list.append(
            "\n"
            "<div class='searchresult'>\n"
            f"<h2>{rank}. Document ID: {doc_id}</h2>\n"
            # <strong> replaces the non-existent <string> element.
            f"<p>Language: <strong>{hits['lang']}</strong>, Score: {round(score, 2)}</p>\n"
            f"<p>{highlight_string(text, highlight_terms)}</p>\n"
            "</div>\n"
            "<hr>\n"
        )
    return " ".join(hit_list)
# Runs one search per click of the sidebar "Search" button and renders the
# hits inside an embedded HTML component.
if st.sidebar.button("Search"):
    # search_time is returned by search() but is not displayed below.
    hits, search_time = search(query, language, max_results)
    # No highlight terms are passed, so document text is rendered as-is.
    html_results = process_results(hits, [])
    # Report the number of hits actually returned rather than the requested
    # maximum, which over-states the count when fewer documents match.
    num_hits = len(hits["doc_ids"])
    rendered_results = f"""
<div id="searchresultsarea">
<br>
<p id="searchresultsnumber">About {num_hits} results</p>
{html_results}
</div>
"""
    # Stylesheets (Bootstrap, Font Awesome) injected into the main page.
    st.markdown("""
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
""",
        unsafe_allow_html=True)
    st.markdown(
        """
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
""",
        unsafe_allow_html=True)
    st.markdown(
        """
<div class="row no-gutters mt-3 align-items-center">
<h2> Search Results </h2>
</div>
""",
        unsafe_allow_html=True)
    # The result cards are rendered through components.html together with
    # their own CSS and the dark-mode toggle script.
    components.html(
        """
<style>
#searchresultsarea {
font-family: 'Arial';
}
#searchresultsnumber {
font-size: 0.8rem;
color: gray;
}
.searchresult h2 {
font-size: 19px;
line-height: 18px;
font-weight: normal;
color: rgb(7, 111, 222);
margin-bottom: 0px;
margin-top: 25px;
}
.searchresult a {
font-size: 12px;
line-height: 12px;
color: green;
margin-bottom: 0px;
}
.dark-mode {
color: white;
}
</style>
<script>
function load_image(id){
console.log(id)
var x = document.getElementById(id);
console.log(x)
if (x.style.display === "none") {
x.style.display = "block";
} else {
x.style.display = "none";
}
};
function myFunction() {
var element = document.body;
element.classList.toggle("dark-mode");
}
</script>
<button onclick="myFunction()">Toggle dark mode</button>
""" + rendered_results, height=800, scrolling=True
    )