Commit 964c419
Parent(s): dd426a1

add proper matching

app.py CHANGED
@@ -6,7 +6,7 @@ import nltk
 import string
 from streamlit.components.v1 import html
 from sentence_transformers.cross_encoder import CrossEncoder as CE
-import
+import re
 from typing import List, Tuple
 import torch

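The removed line was a bare `import`, which is a SyntaxError and would have crashed the app at startup; the replacement `import re` is needed by the new matched_context helper further down. The old failure mode, reproduced:

# A bare 'import' never parses, so the old app.py failed before running a single line:
try:
    compile('import', 'app.py', 'exec')
except SyntaxError as err:
    print(err.msg)  # "invalid syntax"
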
@@ -26,7 +26,7 @@ class CrossEncoder:
 def remove_html(x):
     soup = BeautifulSoup(x, 'html.parser')
     text = soup.get_text()
-    return text
+    return text.strip()


 # 4 searches: strict y/n, supported y/n
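remove_html flattens markup with BeautifulSoup; the new .strip() matters later, because find_source now compares cleaned snippets against the matched context for exact equality. A minimal check (bs4 is already a dependency of the app):

from bs4 import BeautifulSoup

def remove_html(x):
    soup = BeautifulSoup(x, 'html.parser')
    text = soup.get_text()
    return text.strip()

# Whitespace left behind by stripped tags no longer breaks equality checks:
assert remove_html('<p> Collagen regulates tumor growth. </p>') == 'Collagen regulates tumor growth.'
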
@@ -58,7 +58,7 @@ def search(term, limit=10, clean=True, strict=True, all_mode=True, abstracts=Tru
         except:
             pass

-    contexts += [remove_html('\n'.join([cite['snippet'] for cite in doc['citations']])) for doc in req.json()['hits']]
+    contexts += [remove_html('\n'.join([cite['snippet'] for cite in doc['citations'] if cite['lang'] == 'en'])) for doc in req.json()['hits']]
     docs += [(doc['doi'], doc['citations'], doc['title'], doc['abstract'] or '')
              for doc in req.json()['hits']]

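The comprehension now keeps only English-language snippets before joining them into one context per document. Note it indexes cite['lang'] directly, so every citation record is assumed to carry a lang field. A sketch with a mocked payload, reusing remove_html from the check above (field names are taken from this diff; the real scite API response shape is an assumption):

hits = [{                      # hypothetical stand-in for req.json()['hits']
    'doi': '10.1000/xyz',
    'title': 'Example paper',
    'abstract': '',
    'citations': [
        {'snippet': 'Collagen regulates tumor growth.', 'lang': 'en'},
        {'snippet': 'Le collagène régule la croissance tumorale.', 'lang': 'fr'},
    ],
}]

contexts = [remove_html('\n'.join([cite['snippet'] for cite in doc['citations'] if cite['lang'] == 'en']))
            for doc in hits]
print(contexts)  # ['Collagen regulates tumor growth.']
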
@@ -85,10 +85,12 @@ def search(term, limit=10, clean=True, strict=True, all_mode=True, abstracts=Tru
 )


-def find_source(text, docs):
+def find_source(text, docs, matched):
     for doc in docs:
         for snippet in doc[1]:
             if text in remove_html(snippet.get('snippet', '')):
+                if matched and remove_html(snippet.get('snippet', '')).strip() != matched.strip():
+                    continue
                 new_text = text
                 for sent in nltk.sent_tokenize(remove_html(snippet.get('snippet', ''))):
                     if text in sent:
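This is the "proper matching" of the commit message. Previously, the first snippet containing the answer text was credited as the source, even when several documents contain the same phrase; the new matched argument (the context the answer actually came from, recovered below by matched_context) skips any snippet whose cleaned text differs from it. The filter in isolation, with toy strings:

answer = 'reduces inflammation'
matched = 'Drug A reduces inflammation in mice.'
snippets = [
    'Drug B also reduces inflammation, via another pathway.',
    'Drug A reduces inflammation in mice.',
]

for s in snippets:
    if answer in s:
        # old behaviour: the first hit (Drug B) would have been credited
        if matched and s.strip() != matched.strip():
            continue
        print('credited to:', s)  # credited to: Drug A reduces inflammation in mice.
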
@@ -98,10 +100,12 @@ def find_source(text, docs):
                     'text': new_text,
                     'from': snippet['source'],
                     'supporting': snippet['target'],
-                    'source_title': remove_html(doc[2]),
+                    'source_title': remove_html(doc[2] or ''),
                     'source_link': f"https://scite.ai/reports/{doc[0]}"
                 }
         if text in remove_html(doc[3]):
+            if matched and remove_html(doc[3]).strip() != matched.strip():
+                continue
             new_text = text
             for sent in nltk.sent_tokenize(remove_html(doc[3])):
                 if text in sent:
@@ -111,7 +115,7 @@ def find_source(text, docs):
                 'text': new_text,
                 'from': doc[0],
                 'supporting': doc[0],
-                'source_title': "ABSTRACT of " + remove_html(doc[2]),
+                'source_title': "ABSTRACT of " + remove_html(doc[2] or ''),
                 'source_link': f"https://scite.ai/reports/{doc[0]}"
             }
     return None
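The two source_title changes guard against records with a null title: doc[2] can be None, and feeding None to BeautifulSoup inside remove_html raises a TypeError rather than returning an empty string. Coercing with `or ''` keeps the query alive (remove_html as in the sketch above):

title = None                      # e.g. an API record with no title
# remove_html(title) would raise inside BeautifulSoup;
# remove_html(title or '') safely yields ''.
print(remove_html(title or ''))   # prints an empty string
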
@@ -233,6 +237,22 @@ def group_results_by_context(results):
     return list(result_groups.values())


+def matched_context(start_i, end_i, contexts_string, seperator='---'):
+    # find seperators to identify start and end
+    doc_starts = [0]
+    for match in re.finditer(seperator, contexts_string):
+        doc_starts.append(match.end())
+
+    for i in range(len(doc_starts)):
+        if i == len(doc_starts) - 1:
+            if start_i >= doc_starts[i]:
+                return contexts_string[doc_starts[i]:len(contexts_string)].replace(seperator, '')
+
+        if start_i >= doc_starts[i] and end_i <= doc_starts[i+1]:
+            return contexts_string[doc_starts[i]:doc_starts[i+1]].replace(seperator, '')
+    return None
+
+
 def run_query(query):
     # if use_query_exp == 'yes':
     #     query_exp = paraphrase(f"question2question: {query}")
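matched_context inverts the '\n---' join performed below in run_query: given the character span the QA pipeline reports for an answer, it returns the one joined context that span falls inside, with separators removed. Two caveats worth knowing: non-final contexts come back with the '\n' from the join still attached, which is why find_source strips both sides before comparing, and a context that itself contains '---' would be split in two (presumably rare in this data). A quick check of the boundary logic, with the function copied from the hunk above:

import re

def matched_context(start_i, end_i, contexts_string, seperator='---'):
    # find seperators to identify start and end
    doc_starts = [0]
    for match in re.finditer(seperator, contexts_string):
        doc_starts.append(match.end())

    for i in range(len(doc_starts)):
        if i == len(doc_starts) - 1:
            if start_i >= doc_starts[i]:
                return contexts_string[doc_starts[i]:len(contexts_string)].replace(seperator, '')

        if start_i >= doc_starts[i] and end_i <= doc_starts[i+1]:
            return contexts_string[doc_starts[i]:doc_starts[i+1]].replace(seperator, '')
    return None

context = '\n---'.join(['first snippet text', 'second snippet text'])
start = context.index('second')                       # 22: just past the separator
assert matched_context(start, start + 6, context) == 'second snippet text'
assert matched_context(0, 5, context) == 'first snippet text\n'   # trailing '\n' survives
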
@@ -278,19 +298,21 @@ def run_query(query):
     </div>
     </div>
     """, unsafe_allow_html=True)
+
     if use_reranking == 'yes':
         sentence_pairs = [[query, context] for context in contexts]
         scores = reranker.predict(sentence_pairs, batch_size=len(sentence_pairs), show_progress_bar=False)
         hits = {contexts[idx]: scores[idx] for idx in range(len(scores))}
         sorted_contexts = [k for k,v in sorted(hits.items(), key=lambda x: x[0], reverse=True)]
-        context = '\n'.join(sorted_contexts[:context_limit])
+        context = '\n---'.join(sorted_contexts[:context_limit])
     else:
-        context = '\n'.join(contexts[:context_limit])
+        context = '\n---'.join(contexts[:context_limit])

     results = []
     model_results = qa_model(question=query, context=context, top_k=10)
     for result in model_results:
-        support = find_source(result['answer'], orig_docs)
+        matched = matched_context(result['start'], result['end'], context)
+        support = find_source(result['answer'], orig_docs, matched)
         if not support:
             continue
         results.append({
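Joining with '\n---' rather than '\n' is what makes that recovery possible: the transformers question-answering pipeline reports each answer with start/end character offsets into the context string it received, so the offsets identify a single source context. (Unrelated to this commit, the rerank sort above orders by x[0], the context text, rather than x[1], the score; x[1] looks like the intended key.) Reusing context and matched_context from the previous sketch:

# shape of one model_results entry from the QA pipeline
result = {'score': 0.71, 'start': 22, 'end': 41, 'answer': 'second snippet text'}

matched = matched_context(result['start'], result['end'], context)
assert matched == 'second snippet text'
# then, as in the diff: support = find_source(result['answer'], orig_docs, matched)
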
@@ -316,10 +338,9 @@ def run_query(query):
     )

     for r in sorted_result:
-        answer = r["answer"]
         ctx = remove_html(r["context"])
         for answer in r['texts']:
-            ctx = ctx.replace(answer, f"<mark>{answer}</mark>")
+            ctx = ctx.replace(answer.strip(), f"<mark>{answer.strip()}</mark>")
             # .replace( '<cite', '<a').replace('</cite', '</a').replace('data-doi="', 'href="https://scite.ai/reports/')
         title = r.get("title", '')
         score = round(round(r["score"], 4) * 100, 2)
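The last hunk drops the dead answer = r["answer"] assignment (the loop variable below shadowed it anyway) and strips answers before highlighting: since remove_html now strips the context, an answer carrying edge whitespace would no longer match verbatim and the <mark> wrapping would silently do nothing. A small check:

ctx = 'Collagen regulates tumor growth.'
answer = 'tumor growth. '   # trailing whitespace, e.g. from sentence splitting

assert ctx.replace(answer, f"<mark>{answer}</mark>") == ctx   # unstripped: no-op

stripped = answer.strip()
assert ctx.replace(stripped, f"<mark>{stripped}</mark>") \
       == 'Collagen regulates <mark>tumor growth.</mark>'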