Gamortsey committed
Commit c4b8b95 · verified · 1 Parent(s): 511a82e

Update app.py

Files changed (1): app.py (+105, -11)
app.py CHANGED
@@ -7,6 +7,9 @@ import phonenumbers
 import pandas as pd
 import urllib.parse
 from bs4 import BeautifulSoup
+import json
+from pathlib import Path
+

 import torch
 from transformers import (
@@ -105,15 +108,96 @@ def dedup_by_url(items):
 # ============================
 # SEARCH & SCRAPING
 # ============================
+CACHE_PATH = Path("tmp/google_cse_cache.json")
+CACHE_TTL = 60 * 60 * 24  # 24 hours cache; adjust as needed
+MAX_GOOGLE_RETRIES = 5
+MIN_SECONDS_BETWEEN_CALLS = 1.0  # throttle: 1 sec between Google calls to avoid bursts
+
+# load cache (simple file-based)
+def _load_cache():
+    try:
+        if CACHE_PATH.exists():
+            return json.loads(CACHE_PATH.read_text(encoding="utf-8"))
+    except Exception:
+        pass
+    return {}
+
+def _save_cache(cache):
+    try:
+        CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
+        CACHE_PATH.write_text(json.dumps(cache), encoding="utf-8")
+    except Exception:
+        pass
+
+# track last call time so we can throttle
+_last_google_call = {"t": 0.0}
+
 def google_search(query, num_results=5):
+    """
+    Robust Google CSE caller with caching, exponential backoff on 429, and simple throttling.
+    Returns list of {"title","link","snippet"}.
+    """
+    global _last_google_call
     if not API_KEY or not CX or "YOUR_GOOGLE_API_KEY" in API_KEY or "YOUR_CSE_ID" in CX:
         raise RuntimeError("Google API key and CSE ID must be set as environment variables.")
+
+    cache = _load_cache()
+    cache_key = f"gse::{query}::n{num_results}"
+    now = time.time()
+
+    # Check cache and TTL
+    if cache_key in cache:
+        entry = cache[cache_key]
+        if now - entry.get("ts", 0) < CACHE_TTL:
+            # cached
+            return entry.get("items", [])
+
+    # throttle to avoid bursts
+    elapsed = now - _last_google_call["t"]
+    if elapsed < MIN_SECONDS_BETWEEN_CALLS:
+        time.sleep(MIN_SECONDS_BETWEEN_CALLS - elapsed)
+
     url = "https://www.googleapis.com/customsearch/v1"
-    params = {"q":query, "key":API_KEY, "cx":CX, "num":num_results}
-    r = requests.get(url, params=params, timeout=20)
-    r.raise_for_status()
-    items = r.json().get("items", []) or []
-    return [{"title":i.get("title",""), "link":i.get("link",""), "snippet":i.get("snippet","")} for i in items]
+    params = {"q": query, "key": API_KEY, "cx": CX, "num": num_results}
+
+    backoff = 1.0
+    for attempt in range(1, MAX_GOOGLE_RETRIES + 1):
+        try:
+            r = requests.get(url, params=params, timeout=15)
+            _last_google_call["t"] = time.time()
+            if r.status_code == 200:
+                items = r.json().get("items", []) or []
+                parsed = [{"title": i.get("title",""), "link": i.get("link",""), "snippet": i.get("snippet","")} for i in items]
+                # write to cache
+                cache[cache_key] = {"ts": time.time(), "items": parsed}
+                _save_cache(cache)
+                return parsed
+            elif r.status_code == 429:
+                # handle Retry-After if present
+                ra = r.headers.get("Retry-After")
+                wait = float(ra) if ra and ra.isdigit() else backoff
+                print(f"[google_search] 429 -> sleeping {wait}s (attempt {attempt})")
+                time.sleep(wait)
+                backoff *= 2
+                continue
+            else:
+                r.raise_for_status()
+        except requests.HTTPError as e:
+            print(f"[google_search HTTPError] {e} (attempt {attempt})")
+            if attempt == MAX_GOOGLE_RETRIES:
+                raise
+            time.sleep(backoff)
+            backoff *= 2
+        except Exception as e:
+            print(f"[google_search error] {e} (attempt {attempt})")
+            if attempt == MAX_GOOGLE_RETRIES:
+                raise
+            time.sleep(backoff)
+            backoff *= 2
+
+    # If we exit loop without return, return empty list
+    return []
+    # ----------------- end google_search replacement -----------------

 def extract_phones(text, region="GH"):
     phones = []
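
For reference, a minimal usage sketch of the reworked google_search (the query string is illustrative; API_KEY, CX, and the time module are assumed to already be available at module level in app.py, as the function above expects):

    # First call goes to the Custom Search API: throttled to one request per
    # second, retried with exponential backoff on HTTP 429, then cached.
    results = google_search("maternal health professionals Accra", num_results=5)
    for item in results:
        print(item["title"], "-", item["link"])

    # Repeating the same query within CACHE_TTL (24 h) is served from
    # tmp/google_cse_cache.json, so it costs no API quota and does not sleep.
    results_again = google_search("maternal health professionals Accra", num_results=5)
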
@@ -345,9 +429,23 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
     region = get_region_for_country(country)
     queries, profs = build_queries(story, country)

-    # Search
-    search_results = []
+    # Build queries earlier as you already do:
+    queries, profs = build_queries(story, country)
+
+    # Deduplicate and limit queries to e.g. top 3
+    unique_queries = []
+    seen_q = set()
     for q in queries:
+        if q not in seen_q:
+            seen_q.add(q)
+            unique_queries.append(q)
+    # limit number of queries to reduce CSE calls
+    MAX_QUERIES_PER_STORY = 3
+    unique_queries = unique_queries[:MAX_QUERIES_PER_STORY]
+
+    # Search (serialized, cached, with error handling)
+    search_results = []
+    for q in unique_queries:
         try:
             items = google_search(q, num_results=results_per_query)
             for it in items:
@@ -356,10 +454,6 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
         except Exception as e:
             print("[search error]", q, e)

-    search_results = dedup_by_url(search_results)
-    if not search_results:
-        return {"summary":"No results found. Try a different country or wording.",
-                "professionals":[], "queries_used":queries}

     # NER on titles/snippets
     all_people, all_orgs, all_locs = [], [], []
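
The query de-duplication and cap added above can be exercised on its own; a small standalone sketch (the sample queries are made up for illustration):

    queries = [
        "midwife Accra contact",
        "midwife Accra contact",         # duplicate, collapses to one entry
        "maternal health NGO Ghana",
        "obstetrician Kumasi phone",
        "community health nurse Tamale",
    ]

    unique_queries, seen_q = [], set()
    for q in queries:
        if q not in seen_q:
            seen_q.add(q)
            unique_queries.append(q)

    MAX_QUERIES_PER_STORY = 3
    unique_queries = unique_queries[:MAX_QUERIES_PER_STORY]
    print(unique_queries)  # at most 3 queries -> at most 3 CSE calls per story
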
 
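Because the cache is a plain JSON file keyed by "gse::<query>::n<num_results>", it can also be inspected or pruned outside the app; a minimal sketch, assuming the same tmp/google_cse_cache.json path and 24-hour TTL used above:

    import json, time
    from pathlib import Path

    cache_path = Path("tmp/google_cse_cache.json")
    if cache_path.exists():
        cache = json.loads(cache_path.read_text(encoding="utf-8"))
        now = time.time()
        # keep only entries younger than the 24-hour TTL the app uses
        fresh = {k: v for k, v in cache.items() if now - v.get("ts", 0) < 60 * 60 * 24}
        cache_path.write_text(json.dumps(fresh), encoding="utf-8")
        print(f"kept {len(fresh)} of {len(cache)} cached queries")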