Update ✨Entity Linking Application✨.py
✨Entity Linking Application✨.py · +27 -52 · CHANGED
@@ -1,4 +1,3 @@
-import pandas as pd
 import json
 import numpy as np
 import re
@@ -15,20 +14,18 @@ from openai import OpenAI
 import sys
 import time
 from bs4 import BeautifulSoup
-from fake_useragent import UserAgent
 import requests
-
-
+import nest_asyncio
+import httpx
+
+
+nest_asyncio.apply()
 
-headers = {
-    "User-Agent": f"{ua.random}"
-}
 
 folder_path = '/home/user/app/qids_folder'
 
 if not os.path.exists(folder_path):
     os.mkdir(folder_path)
-    print(f"folder created at {folder_path}")
 else:
     pass
 
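The commit drops the scraping setup (fake_useragent request headers) in favour of direct API calls, and calls nest_asyncio.apply() at import time, presumably so that the asyncio.run() call later in the file keeps working inside Streamlit's execution environment. A minimal, self-contained sketch of that pattern, independent of this app's code:

    import asyncio
    import nest_asyncio

    # nest_asyncio patches asyncio so that asyncio.run() can be called
    # re-entrantly, even when an event loop is already active.
    nest_asyncio.apply()

    async def demo():
        await asyncio.sleep(0.1)
        return "done"

    print(asyncio.run(demo()))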
@@ -54,14 +51,11 @@ async def combination_method(name, session):
     x = itertools_combinations(new_name, 2)
     for i in x:
         new_word = (i[0] + " " + i[1])
-        url = f"https://
-
-
-
-
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={new_word}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
     return data
 
 async def single_method(name, session):
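The rewritten helpers all call a shared fetch_json(url, session) coroutine that is defined elsewhere in the file and not visible in this diff; the session argument is the aiohttp.ClientSession opened in mains(). A plausible minimal version with aiohttp, shown only as an assumption about its shape rather than the app's actual implementation:

    # Hypothetical stand-in for the file's fetch_json helper:
    # GET the URL with the shared aiohttp session and decode the JSON body.
    async def fetch_json(url, session):
        async with session.get(url) as response:
            return await response.json()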
@@ -69,30 +63,24 @@ async def single_method(name, session):
     data = set()
     new_name = name.replace("-", " ").replace("/", " ").split()
     for i in new_name:
-        url = f"https://
-
-
-
-
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={i}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
     return data
 
-async def mains(name,
+async def mains(name, deep_search):
     data = set()
     disam_data = set()
     qids = set()
 
     async with aiohttp.ClientSession() as session:
-        url = f"https://
-
-
-
-
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
 
         wikipedia_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=1&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
         json_data = await fetch_json(wikipedia_url, session)
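All three retrieval paths now query the MediaWiki search API (action=query&list=search) and collect the title field of each hit under query.search, instead of scraping Wikipedia links from search-result pages. A quick synchronous check of that endpoint with requests, mirroring the parameters in the diff (the search string here is only an illustration):

    import requests

    params = {
        "action": "query",
        "list": "search",
        "srsearch": "entity linking",  # placeholder query, not from the app
        "srlimit": 20,
        "srprop": "",
        "srenablerewrites": "True",
        "format": "json",
    }
    resp = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=10)
    for hit in resp.json().get("query", {}).get("search", []):
        print(hit["title"])  # the same field the new code adds to its data set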
@@ -126,14 +114,14 @@ async def mains(name, single, combi):
                 disam_data.add(ids)
 
         # Makes combinations of the name
-        if
+        if deep_search == "Yes":
             if len(name.replace("-", " ").split()) >= 3:
                 combination_names = await combination_method(name, session)
                 for i in combination_names:
                     disam_data.add(i)
 
         # Checks every word alone
-        if
+        if deep_search == "Yes":
             if len(name.replace("-", " ").replace("/", " ").split()) >= 2:
                 singles = await single_method(name, session)
                 for i in singles:
@@ -270,18 +258,6 @@ async def main(name):
     with open(f"/home/user/app/info_extraction/{name}.json", "w", encoding="utf-8") as flast:
         json.dump(final_list, flast)
 
-#def check_sentence(sentence):
-#    two_consecutive_uppercase = r"[A-Z]{2}"
-#    uppercase_followed_by_fullstop = r"[A-Z]\."
-
-#    if re.search(two_consecutive_uppercase, sentence):
-#        return True
-
-#    if re.search(uppercase_followed_by_fullstop, sentence):
-#        return True
-
-#    return False
-
 def main_cli():
     st.title("✨ Entity Linking Application ✨")
     st.caption("This web application is part of my master’s dissertation.")
@@ -306,8 +282,7 @@ def main_cli():
 
     input_sentence_user = st.text_input("Enter a sentence:", "", disabled=st.session_state.running)
    input_mention_user = st.text_input("Enter a textural reference (mention) that is inside the sentence:", "", disabled=st.session_state.running)
-
-    combi = st.selectbox("Make combinations of each word? (Useful for difficult mentions)", ['Yes', 'No'], index=1, disabled=st.session_state.running)
+    deep_search = st.selectbox("Perform deep search? (Useful for difficult mentions)", ['Yes', 'No'], index=1, disabled=st.session_state.running)
     disambi = st.selectbox("Run acronym disambiguation? (Enable it if the mention include an acronym or if it is nested)", ['Yes', 'No'], index=0, disabled=st.session_state.running)
 
     if st.button("Run Entity Linking", key="run_button", disabled=st.session_state.running):
@@ -426,18 +401,18 @@ def main_cli():
                 list_with_contexts.append(context)
             st.write("✅ Applied Data Normilzation module (1/5)")
             # Candidate Retrieval & Information Gathering
-            async def big_main(mention,
+            async def big_main(mention, deep_search):
                 mention = mention.split(",")
                 with st.spinner("Applying Candidate Retrieval module... (2/5)"):
                     for i in mention:
-                        await mains(i,
+                        await mains(i, deep_search)
                 st.write("✅ Applied Candidate Retrieval module (2/5)")
                 with st.spinner("Applying Information Gathering module... (3/5)"):
                     for i in mention:
                         await main(i)
                 st.write("✅ Applied Information Gathering module (3/5)")
 
-            asyncio.run(big_main(name,
+            asyncio.run(big_main(name, deep_search))
 
             number = 0
             for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
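The deep_search selection is threaded from the selectbox through big_main into mains, and the pipeline is still driven by asyncio.run() from inside the Streamlit button handler, which is the situation the nest_asyncio.apply() call at the top of the file appears to cover. A stripped-down sketch of that driving pattern (hypothetical names, not the app's full pipeline):

    import asyncio
    import nest_asyncio
    import streamlit as st

    nest_asyncio.apply()

    async def pipeline(mention, deep_search):
        # Placeholder for the app's retrieval and gathering steps.
        await asyncio.sleep(0)
        return f"{mention} (deep_search={deep_search})"

    mention = st.text_input("Mention:")
    deep_search = st.selectbox("Perform deep search?", ["Yes", "No"], index=1)
    if st.button("Run"):
        st.write(asyncio.run(pipeline(mention, deep_search)))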