Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,9 @@ import phonenumbers
|
|
| 7 |
import pandas as pd
|
| 8 |
import urllib.parse
|
| 9 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
import torch
|
| 12 |
from transformers import (
|
|
@@ -105,15 +108,96 @@ def dedup_by_url(items):
|
|
| 105 |
# ============================
|
| 106 |
# SEARCH & SCRAPING
|
| 107 |
# ============================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
def google_search(query, num_results=5):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
if not API_KEY or not CX or "YOUR_GOOGLE_API_KEY" in API_KEY or "YOUR_CSE_ID" in CX:
|
| 110 |
raise RuntimeError("Google API key and CSE ID must be set as environment variables.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
url = "https://www.googleapis.com/customsearch/v1"
|
| 112 |
-
params = {"q":query, "key":API_KEY, "cx":CX, "num":num_results}
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
def extract_phones(text, region="GH"):
|
| 119 |
phones = []
|
|
@@ -345,9 +429,23 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
|
|
| 345 |
region = get_region_for_country(country)
|
| 346 |
queries, profs = build_queries(story, country)
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
for q in queries:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
try:
|
| 352 |
items = google_search(q, num_results=results_per_query)
|
| 353 |
for it in items:
|
|
@@ -356,10 +454,6 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
|
|
| 356 |
except Exception as e:
|
| 357 |
print("[search error]", q, e)
|
| 358 |
|
| 359 |
-
search_results = dedup_by_url(search_results)
|
| 360 |
-
if not search_results:
|
| 361 |
-
return {"summary":"No results found. Try a different country or wording.",
|
| 362 |
-
"professionals":[], "queries_used":queries}
|
| 363 |
|
| 364 |
# NER on titles/snippets
|
| 365 |
all_people, all_orgs, all_locs = [], [], []
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
import urllib.parse
|
| 9 |
from bs4 import BeautifulSoup
|
| 10 |
+
import json
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
|
| 14 |
import torch
|
| 15 |
from transformers import (
|
|
|
|
| 108 |
# ============================
|
| 109 |
# SEARCH & SCRAPING
|
| 110 |
# ============================
|
| 111 |
+
# On-disk JSON cache for Google CSE responses (relative "tmp/" dir; created on first save).
CACHE_PATH = Path("tmp/google_cse_cache.json")
# How long a cached query result stays valid, in seconds.
CACHE_TTL = 60 * 60 * 24  # 24 hours cache; adjust as needed
# Attempts per query before google_search gives up (re-raises or returns []).
MAX_GOOGLE_RETRIES = 5
MIN_SECONDS_BETWEEN_CALLS = 1.0  # throttle: 1 sec between Google calls to avoid bursts
|
| 115 |
+
|
| 116 |
+
def _load_cache():
    """Load the file-based Google CSE response cache.

    Best-effort: a missing, unreadable, or corrupt cache file simply
    yields an empty dict so that search continues uncached.
    """
    try:
        return json.loads(CACHE_PATH.read_text(encoding="utf-8"))
    except Exception:
        # Missing file, permission error, or invalid JSON -> start fresh.
        return {}
|
| 124 |
+
|
| 125 |
+
def _save_cache(cache):
    """Persist the CSE response cache to disk, ignoring all I/O errors.

    Best-effort: a failed write must never break a search, so every
    exception is swallowed deliberately.
    """
    try:
        CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(cache)
        CACHE_PATH.write_text(serialized, encoding="utf-8")
    except Exception:
        pass
|
| 131 |
+
|
| 132 |
+
# track last call time so we can throttle
# (a one-key dict, mutated in place by google_search, so no `global` is needed)
_last_google_call = {"t": 0.0}
|
| 134 |
+
|
| 135 |
def google_search(query, num_results=5):
    """
    Robust Google CSE caller with caching, exponential backoff on 429, and simple throttling.

    Parameters:
        query (str): the search query string.
        num_results (int): number of results to request from the CSE API.

    Returns:
        list[dict]: parsed results, each {"title", "link", "snippet"}.
        Returns [] if every attempt was rate-limited (best-effort; already logged).

    Raises:
        RuntimeError: if the API key / CSE ID are unset or still placeholders.
        requests.HTTPError, Exception: re-raised after MAX_GOOGLE_RETRIES failed attempts.
    """
    # NOTE: the original declared `global _last_google_call`; removed as a no-op —
    # the dict is only mutated via key assignment, never rebound.
    if not API_KEY or not CX or "YOUR_GOOGLE_API_KEY" in API_KEY or "YOUR_CSE_ID" in CX:
        raise RuntimeError("Google API key and CSE ID must be set as environment variables.")

    cache = _load_cache()
    cache_key = f"gse::{query}::n{num_results}"
    now = time.time()

    # Serve from cache while the entry is within its TTL.
    if cache_key in cache:
        entry = cache[cache_key]
        if now - entry.get("ts", 0) < CACHE_TTL:
            return entry.get("items", [])

    # Throttle: keep at least MIN_SECONDS_BETWEEN_CALLS between live API calls.
    elapsed = now - _last_google_call["t"]
    if elapsed < MIN_SECONDS_BETWEEN_CALLS:
        time.sleep(MIN_SECONDS_BETWEEN_CALLS - elapsed)

    url = "https://www.googleapis.com/customsearch/v1"
    params = {"q": query, "key": API_KEY, "cx": CX, "num": num_results}

    backoff = 1.0
    for attempt in range(1, MAX_GOOGLE_RETRIES + 1):
        try:
            r = requests.get(url, params=params, timeout=15)
            _last_google_call["t"] = time.time()
            if r.status_code == 200:
                items = r.json().get("items", []) or []
                parsed = [{"title": i.get("title", ""),
                           "link": i.get("link", ""),
                           "snippet": i.get("snippet", "")} for i in items]
                # Cache the successful response before returning.
                cache[cache_key] = {"ts": time.time(), "items": parsed}
                _save_cache(cache)
                return parsed
            elif r.status_code == 429:
                # Rate-limited: honour Retry-After when it parses as a non-negative
                # number of seconds (FIX: the old `ra.isdigit()` check rejected
                # fractional values like "1.5"); otherwise use exponential backoff.
                # HTTP-date Retry-After values also fall back to backoff here.
                ra = r.headers.get("Retry-After")
                try:
                    wait = max(float(ra), 0.0)
                except (TypeError, ValueError):
                    wait = backoff
                print(f"[google_search] 429 -> sleeping {wait}s (attempt {attempt})")
                time.sleep(wait)
                backoff *= 2
                continue
            else:
                # Any other non-200 status becomes an HTTPError handled below.
                r.raise_for_status()
        except requests.HTTPError as e:
            print(f"[google_search HTTPError] {e} (attempt {attempt})")
            if attempt == MAX_GOOGLE_RETRIES:
                raise
            time.sleep(backoff)
            backoff *= 2
        except Exception as e:
            # Network errors, timeouts, JSON decode failures, etc.
            print(f"[google_search error] {e} (attempt {attempt})")
            if attempt == MAX_GOOGLE_RETRIES:
                raise
            time.sleep(backoff)
            backoff *= 2

    # All attempts were rate-limited (429 never raises): deliberate best-effort
    # empty result rather than an exception — callers treat [] as "no results".
    return []
|
| 201 |
|
| 202 |
def extract_phones(text, region="GH"):
|
| 203 |
phones = []
|
|
|
|
| 429 |
region = get_region_for_country(country)
|
| 430 |
queries, profs = build_queries(story, country)
|
| 431 |
|
| 432 |
+
# Build queries earlier as you already do:
|
| 433 |
+
queries, profs = build_queries(story, country)
|
| 434 |
+
|
| 435 |
+
# Deduplicate and limit queries to e.g. top 3
|
| 436 |
+
unique_queries = []
|
| 437 |
+
seen_q = set()
|
| 438 |
for q in queries:
|
| 439 |
+
if q not in seen_q:
|
| 440 |
+
seen_q.add(q)
|
| 441 |
+
unique_queries.append(q)
|
| 442 |
+
# limit number of queries to reduce CSE calls
|
| 443 |
+
MAX_QUERIES_PER_STORY = 3
|
| 444 |
+
unique_queries = unique_queries[:MAX_QUERIES_PER_STORY]
|
| 445 |
+
|
| 446 |
+
# Search (serialized, cached, with error handling)
|
| 447 |
+
search_results = []
|
| 448 |
+
for q in unique_queries:
|
| 449 |
try:
|
| 450 |
items = google_search(q, num_results=results_per_query)
|
| 451 |
for it in items:
|
|
|
|
| 454 |
except Exception as e:
|
| 455 |
print("[search error]", q, e)
|
| 456 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
# NER on titles/snippets
|
| 459 |
all_people, all_orgs, all_locs = [], [], []
|