Spaces:
Sleeping
Sleeping
Commit
·
558c227
1
Parent(s):
273bc17
Add system
Browse files
- system/augmented_searching.py +4 -3
- system/date_verifier.py +58 -0
system/augmented_searching.py
CHANGED
|
@@ -6,10 +6,11 @@ import pandas as pd
|
|
| 6 |
from datetime import datetime
|
| 7 |
from pathlib import Path
|
| 8 |
import spacy
|
|
|
|
| 9 |
|
| 10 |
def google_search(query, api_key, search_engine_id, start_date, end_date):
|
| 11 |
# print(f"[SYSTEM] Calling Google Search API for: {query}")
|
| 12 |
-
sort = f"date:r:{start_date}:{end_date}"
|
| 13 |
url = "https://www.googleapis.com/customsearch/v1"
|
| 14 |
params = {
|
| 15 |
"q": query,
|
|
@@ -80,7 +81,7 @@ def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_
|
|
| 80 |
|
| 81 |
results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
|
| 82 |
for result in results:
|
| 83 |
-
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
|
| 84 |
string_values.append("claim")
|
| 85 |
urls.append(result["link"])
|
| 86 |
queries.append(f"{pledge_author}: {claim_text}")
|
|
@@ -88,7 +89,7 @@ def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_
|
|
| 88 |
for question in questions:
|
| 89 |
results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
|
| 90 |
for result in results:
|
| 91 |
-
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
|
| 92 |
string_values.append("question")
|
| 93 |
urls.append(result["link"])
|
| 94 |
queries.append(f"{question}")
|
|
|
|
| 6 |
from datetime import datetime
|
| 7 |
from pathlib import Path
|
| 8 |
import spacy
|
| 9 |
+
from date_verifier import is_after_start
|
| 10 |
|
| 11 |
def google_search(query, api_key, search_engine_id, start_date, end_date):
|
| 12 |
# print(f"[SYSTEM] Calling Google Search API for: {query}")
|
| 13 |
+
sort = f"date:r:{start_date}:{end_date}" #20241230:20250130
|
| 14 |
url = "https://www.googleapis.com/customsearch/v1"
|
| 15 |
params = {
|
| 16 |
"q": query,
|
|
|
|
| 81 |
|
| 82 |
results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
|
| 83 |
for result in results:
|
| 84 |
+
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
|
| 85 |
string_values.append("claim")
|
| 86 |
urls.append(result["link"])
|
| 87 |
queries.append(f"{pledge_author}: {claim_text}")
|
|
|
|
| 89 |
for question in questions:
|
| 90 |
results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
|
| 91 |
for result in results:
|
| 92 |
+
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
|
| 93 |
string_values.append("question")
|
| 94 |
urls.append(result["link"])
|
| 95 |
queries.append(f"{question}")
|
system/date_verifier.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
import re, trafilatura
|
| 3 |
+
from trafilatura.settings import DEFAULT_CONFIG
|
| 4 |
+
|
| 5 |
+
# Cap the size of each fetched page at 50 kB before trafilatura processes it.
# NOTE(review): presumably to keep per-URL fetches cheap during search-result
# filtering — confirm against trafilatura's MAX_FILE_SIZE semantics.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
|
| 6 |
+
|
| 7 |
+
_URL_DATE_PATS = [
|
| 8 |
+
re.compile(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})"), # 2025-07-03
|
| 9 |
+
re.compile(r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})"), # 2025/07/03
|
| 10 |
+
re.compile(r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})"), # 20250703
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
def _meta_date(url: str):
|
| 14 |
+
|
| 15 |
+
page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
|
| 16 |
+
if not page:
|
| 17 |
+
return None
|
| 18 |
+
meta = trafilatura.extract_metadata(page)
|
| 19 |
+
if not meta or not meta.date:
|
| 20 |
+
return None
|
| 21 |
+
try:
|
| 22 |
+
return datetime.fromisoformat(meta.date)
|
| 23 |
+
except ValueError:
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
return datetime.fromisoformat(meta.date.split("T")[0])
|
| 27 |
+
except Exception:
|
| 28 |
+
return None
|
| 29 |
+
|
| 30 |
+
def _regex_date(url: str):
|
| 31 |
+
|
| 32 |
+
for pat in _URL_DATE_PATS:
|
| 33 |
+
m = pat.search(url)
|
| 34 |
+
if m:
|
| 35 |
+
try:
|
| 36 |
+
return datetime(
|
| 37 |
+
int(m.group("y")), int(m.group("m")), int(m.group("d"))
|
| 38 |
+
)
|
| 39 |
+
except ValueError:
|
| 40 |
+
pass
|
| 41 |
+
return None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def is_after_start(url: str, start_ymd: str) -> bool:
    """Decide whether *url* was published on or after ``start_ymd``.

    - start_ymd: 'YYYYMMDD'

    Page metadata is consulted first, then a date embedded in the URL
    itself. Pages whose date cannot be determined at all are kept
    (``True``) rather than discarded.
    """
    threshold = datetime.strptime(start_ymd, "%Y%m%d")

    published = _meta_date(url)
    if published is None:
        published = _regex_date(url)

    # Permissive default: undatable results pass through the filter.
    return published is None or published >= threshold
|