Spaces:

domenicrosati
/

scite-qa-demo

Runtime error

App Files Community

domenicrosati committed on Sep 27, 2022

Commit

3f1f616

1 Parent(s): bdb2b00

update to use api

Browse files

Files changed (1) hide show

app.py +165 -135

app.py CHANGED Viewed

@@ -12,15 +12,15 @@ import torch
 SCITE_API_KEY = st.secrets["SCITE_API_KEY"]
-class CrossEncoder:
-    def __init__(self, model_path: str, **kwargs):
-        self.model = CE(model_path, **kwargs)
-    def predict(self, sentences: List[Tuple[str,str]], batch_size: int = 32, show_progress_bar: bool = True) -> List[float]:
-        return self.model.predict(
-            sentences=sentences,
-            batch_size=batch_size,
-            show_progress_bar=show_progress_bar)
 def remove_html(x):
@@ -134,23 +134,23 @@ def find_source(text, docs, matched):
     return None
-@st.experimental_singleton
-def init_models():
-    nltk.download('stopwords')
-    nltk.download('punkt')
-    from nltk.corpus import stopwords
-    stop = set(stopwords.words('english') + list(string.punctuation))
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    question_answerer = pipeline(
-        "question-answering", model='nlpconnect/roberta-base-squad2-nq',
-        device=device, handle_impossible_answer=False,
-    )
-    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)
-    # queryexp_tokenizer = AutoTokenizer.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
-    # queryexp_model = AutoModelWithLMHead.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
-    return question_answerer, reranker, stop, device
-qa_model, reranker, stop, device = init_models() # queryexp_model, queryexp_tokenizer
 def clean_query(query, strict=True, clean=True):
@@ -206,32 +206,32 @@ Answers are linked to source documents containing citations where users can expl
 For example try: Do tanning beds cause cancer?
 """)
-st.markdown("""
-<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
-""", unsafe_allow_html=True)
-with st.expander("Settings (strictness, context limit, top hits)"):
-    concat_passages = st.radio(
-        "Concatenate passages as one long context?",
-        ('yes', 'no'))
-    present_impossible = st.radio(
-        "Present impossible answers? (if the model thinks its impossible to answer should it still try?)",
-        ('yes', 'no'))
-    support_all = st.radio(
-        "Use abstracts and titles as a ranking signal (if the words are matched in the abstract then the document is more relevant)?",
-        ('no', 'yes'))
-    support_abstracts = st.radio(
-        "Use abstracts as a source document?",
-        ('yes', 'no', 'abstract only'))
-    strict_lenient_mix = st.radio(
-        "Type of strict+lenient combination: Fallback or Mix? If fallback, strict is run first then if the results are less than context_lim we also search lenient. Mix will search them both and let reranking sort em out",
-        ('mix', 'fallback'))
-    confidence_threshold = st.slider('Confidence threshold for answering questions? This number represents how confident the model should be in the answers it gives. The number is out of 100%', 0, 100, 1)
-    use_reranking = st.radio(
-        "Use Reranking? Reranking will rerank the top hits using semantic similarity of document and query.",
-        ('yes', 'no'))
-    top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 100)
-    context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 25)
 # def paraphrase(text, max_length=128):
 #     input_ids = queryexp_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
@@ -272,38 +272,120 @@ def matched_context(start_i, end_i, contexts_string, seperator='---'):
     return None
-def run_query(query, progress_bar):
-#     if use_query_exp == 'yes':
-#         query_exp = paraphrase(f"question2question: {query}")
-#         st.markdown(f"""
-# If you are not getting good results try one of:
-# * {query_exp}
-# """)
-    # could also try fallback if there are no good answers by score...
-    limit = top_hits_limit or 100
-    context_limit = context_lim or 10
-    contexts_strict, orig_docs_strict = search(query, limit=limit, strict=True, all_mode=support_all == 'yes', abstracts= support_abstracts == 'yes', abstract_only=support_abstracts == 'abstract only')
-    if strict_lenient_mix == 'fallback' and len(contexts_strict) < context_limit:
-        contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False, all_mode=support_all == 'yes',  abstracts= support_abstracts == 'yes', abstract_only= support_abstracts == 'abstract only')
-        contexts = list(
-            set(contexts_strict + contexts_lenient)
-        )
-        orig_docs = orig_docs_strict + orig_docs_lenient
-    elif strict_lenient_mix == 'mix':
-        contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False)
-        contexts = list(
-            set(contexts_strict + contexts_lenient)
-        )
-        orig_docs = orig_docs_strict + orig_docs_lenient
-    else:
-        contexts = list(
-            set(contexts_strict)
-        )
-        orig_docs = orig_docs_strict
-    progress_bar.progress(25)
-    if len(contexts) == 0 or not ''.join(contexts).strip():
         return st.markdown("""
         <div class="container-fluid">
         <div class="row align-items-start">
@@ -314,58 +396,7 @@ def run_query(query, progress_bar):
      </div>
         """, unsafe_allow_html=True)
-    if use_reranking == 'yes':
-        sentence_pairs = [[query, context] for context in contexts]
-        scores = reranker.predict(sentence_pairs, batch_size=len(sentence_pairs), show_progress_bar=False)
-        hits = {contexts[idx]: scores[idx] for idx in range(len(scores))}
-        sorted_contexts = [k for k,v in sorted(hits.items(), key=lambda x: x[0], reverse=True)]
-        contexts = sorted_contexts[:context_limit]
-    else:
-        contexts = contexts[:context_limit]
-    progress_bar.progress(50)
-    if concat_passages == 'yes':
-        context = '\n---'.join(contexts)
-        model_results = qa_model(question=query, context=context, top_k=10, doc_stride=512 // 2, max_answer_len=128, max_seq_len=512, handle_impossible_answer=present_impossible=='yes')
-    else:
-        context = ['\n---\n'+ctx for ctx in contexts]
-        model_results = qa_model(question=[query]*len(contexts), context=context, handle_impossible_answer=present_impossible=='yes')
-    results = []
-    progress_bar.progress(75)
-    for i, result in enumerate(model_results):
-        if concat_passages == 'yes':
-            matched = matched_context(result['start'], result['end'], context)
-        else:
-            matched = matched_context(result['start'], result['end'], context[i])
-        support = find_source(result['answer'], orig_docs, matched)
-        if not support:
-            continue
-        results.append({
-            "answer": support['text'],
-            "title": support['source_title'],
-            "link": support['source_link'],
-            "context": support['citation_statement'],
-            "score": result['score'],
-            "doi": support["supporting"]
-        })
-    grouped_results = group_results_by_context(results)
-    sorted_result = sorted(grouped_results, key=lambda x: x['score'], reverse=True)
-    if confidence_threshold == 0:
-        threshold = 0
-    else:
-        threshold = (confidence_threshold or 10) / 100
-    sorted_result = list(filter(
-        lambda x: x['score'] > threshold,
-        sorted_result
-    ))
-    progress_bar.progress(100)
-    for r in sorted_result:
         ctx = remove_html(r["context"])
         for answer in r['texts']:
             ctx = ctx.replace(answer.strip(), f"<mark>{answer.strip()}</mark>")
@@ -377,5 +408,4 @@ def run_query(query, progress_bar):
 query = st.text_input("Ask scientific literature a question", "")
 if query != "":
     with st.spinner('Loading...'):
-        progress_bar = st.progress(0)
-        run_query(query, progress_bar)

 SCITE_API_KEY = st.secrets["SCITE_API_KEY"]
+# class CrossEncoder:
+#     def __init__(self, model_path: str, **kwargs):
+#         self.model = CE(model_path, **kwargs)
+#     def predict(self, sentences: List[Tuple[str,str]], batch_size: int = 32, show_progress_bar: bool = True) -> List[float]:
+#         return self.model.predict(
+#             sentences=sentences,
+#             batch_size=batch_size,
+#             show_progress_bar=show_progress_bar)
 def remove_html(x):
     return None
+# @st.experimental_singleton
+# def init_models():
+#     nltk.download('stopwords')
+#     nltk.download('punkt')
+#     from nltk.corpus import stopwords
+#     stop = set(stopwords.words('english') + list(string.punctuation))
+#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#     question_answerer = pipeline(
+#         "question-answering", model='nlpconnect/roberta-base-squad2-nq',
+#         device=0 if torch.cuda.is_available() else -1, handle_impossible_answer=False,
+#     )
+#     reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)
+#     # queryexp_tokenizer = AutoTokenizer.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
+#     # queryexp_model = AutoModelWithLMHead.from_pretrained("doc2query/all-with_prefix-t5-base-v1")
+#     return question_answerer, reranker, stop, device
+# qa_model, reranker, stop, device = init_models() # queryexp_model, queryexp_tokenizer
 def clean_query(query, strict=True, clean=True):
 For example try: Do tanning beds cause cancer?
 """)
+# st.markdown("""
+# <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
+# """, unsafe_allow_html=True)
+# with st.expander("Settings (strictness, context limit, top hits)"):
+#     concat_passages = st.radio(
+#         "Concatenate passages as one long context?",
+#         ('yes', 'no'))
+#     present_impossible = st.radio(
+#         "Present impossible answers? (if the model thinks its impossible to answer should it still try?)",
+#         ('yes', 'no'))
+#     support_all = st.radio(
+#         "Use abstracts and titles as a ranking signal (if the words are matched in the abstract then the document is more relevant)?",
+#         ('no', 'yes'))
+#     support_abstracts = st.radio(
+#         "Use abstracts as a source document?",
+#         ('yes', 'no', 'abstract only'))
+#     strict_lenient_mix = st.radio(
+#         "Type of strict+lenient combination: Fallback or Mix? If fallback, strict is run first then if the results are less than context_lim we also search lenient. Mix will search them both and let reranking sort em out",
+#         ('mix', 'fallback'))
+#     confidence_threshold = st.slider('Confidence threshold for answering questions? This number represents how confident the model should be in the answers it gives. The number is out of 100%', 0, 100, 1)
+#     use_reranking = st.radio(
+#         "Use Reranking? Reranking will rerank the top hits using semantic similarity of document and query.",
+#         ('yes', 'no'))
+#     top_hits_limit = st.slider('Top hits? How many documents to use for reranking. Larger is slower but higher quality', 10, 300, 100)
+#     context_lim = st.slider('Context limit? How many documents to use for answering from. Larger is slower but higher quality', 10, 300, 25)
 # def paraphrase(text, max_length=128):
 #     input_ids = queryexp_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
     return None
+# def run_query_full(query, progress_bar):
+# #     if use_query_exp == 'yes':
+# #         query_exp = paraphrase(f"question2question: {query}")
+# #         st.markdown(f"""
+# # If you are not getting good results try one of:
+# # * {query_exp}
+# # """)
+#     # could also try fallback if there are no good answers by score...
+#     limit = top_hits_limit or 100
+#     context_limit = context_lim or 10
+#     contexts_strict, orig_docs_strict = search(query, limit=limit, strict=True, all_mode=support_all == 'yes', abstracts= support_abstracts == 'yes', abstract_only=support_abstracts == 'abstract only')
+#     if strict_lenient_mix == 'fallback' and len(contexts_strict) < context_limit:
+#         contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False, all_mode=support_all == 'yes',  abstracts= support_abstracts == 'yes', abstract_only= support_abstracts == 'abstract only')
+#         contexts = list(
+#             set(contexts_strict + contexts_lenient)
+#         )
+#         orig_docs = orig_docs_strict + orig_docs_lenient
+#     elif strict_lenient_mix == 'mix':
+#         contexts_lenient, orig_docs_lenient = search(query, limit=limit, strict=False)
+#         contexts = list(
+#             set(contexts_strict + contexts_lenient)
+#         )
+#         orig_docs = orig_docs_strict + orig_docs_lenient
+#     else:
+#         contexts = list(
+#             set(contexts_strict)
+#         )
+#         orig_docs = orig_docs_strict
+#     progress_bar.progress(25)
+#     if len(contexts) == 0 or not ''.join(contexts).strip():
+#         return st.markdown("""
+#         <div class="container-fluid">
+#         <div class="row align-items-start">
+#              <div  class="col-md-12 col-sm-12">
+    #         Sorry... no results for that question! Try another...
+    #      </div>
+    #     </div>
+    #  </div>
+    #     """, unsafe_allow_html=True)
+    # if use_reranking == 'yes':
+    #     sentence_pairs = [[query, context] for context in contexts]
+    #     scores = reranker.predict(sentence_pairs, batch_size=len(sentence_pairs), show_progress_bar=False)
+    #     hits = {contexts[idx]: scores[idx] for idx in range(len(scores))}
+    #     sorted_contexts = [k for k,v in sorted(hits.items(), key=lambda x: x[0], reverse=True)]
+    #     contexts = sorted_contexts[:context_limit]
+    # else:
+    #     contexts = contexts[:context_limit]
+    # progress_bar.progress(50)
+    # if concat_passages == 'yes':
+    #     context = '\n---'.join(contexts)
+    #     model_results = qa_model(question=query, context=context, top_k=10, doc_stride=512 // 2, max_answer_len=128, max_seq_len=512, handle_impossible_answer=present_impossible=='yes')
+    # else:
+    #     context = ['\n---\n'+ctx for ctx in contexts]
+    #     model_results = qa_model(question=[query]*len(contexts), context=context, handle_impossible_answer=present_impossible=='yes')
+    # results = []
+    # progress_bar.progress(75)
+    # for i, result in enumerate(model_results):
+    #     if concat_passages == 'yes':
+    #         matched = matched_context(result['start'], result['end'], context)
+    #     else:
+    #         matched = matched_context(result['start'], result['end'], context[i])
+    #     support = find_source(result['answer'], orig_docs, matched)
+    #     if not support:
+    #         continue
+    #     results.append({
+    #         "answer": support['text'],
+    #         "title": support['source_title'],
+    #         "link": support['source_link'],
+    #         "context": support['citation_statement'],
+    #         "score": result['score'],
+    #         "doi": support["supporting"]
+    #     })
+    # grouped_results = group_results_by_context(results)
+    # sorted_result = sorted(grouped_results, key=lambda x: x['score'], reverse=True)
+    # if confidence_threshold == 0:
+    #     threshold = 0
+    # else:
+    #     threshold = (confidence_threshold or 10) / 100
+    # sorted_result = list(filter(
+    #     lambda x: x['score'] > threshold,
+    #     sorted_result
+    # ))
+    # progress_bar.progress(100)
+    # for r in sorted_result:
+    #     ctx = remove_html(r["context"])
+    #     for answer in r['texts']:
+    #         ctx = ctx.replace(answer.strip(), f"<mark>{answer.strip()}</mark>")
+    #     # .replace( '<cite', '<a').replace('</cite', '</a').replace('data-doi="', 'href="https://scite.ai/reports/')
+    #     title = r.get("title", '')
+    #     score = round(round(r["score"], 4) * 100, 2)
+    #     card(title, ctx, score, r['link'], r['doi'])
+def run_query(query):
+    api_location = 'http://74.82.31.93'
+    resp_raw = requests.get(
+        f'{api_location}/question-answer?query={query}'
+    )
+    try:
+        resp = resp_raw.json()
+    except:
+        resp = {'results': []}
+    if len(resp.get('results', [])) == 0:
         return st.markdown("""
         <div class="container-fluid">
         <div class="row align-items-start">
      </div>
         """, unsafe_allow_html=True)
+    for r in resp['results']:
         ctx = remove_html(r["context"])
         for answer in r['texts']:
             ctx = ctx.replace(answer.strip(), f"<mark>{answer.strip()}</mark>")
 query = st.text_input("Ask scientific literature a question", "")
 if query != "":
     with st.spinner('Loading...'):
+        run_query(query)