Gamortsey committed
Commit 511a82e · verified · 1 Parent(s): f0a56b8

Update app.py

Files changed (1)
  1. app.py +151 -15
app.py CHANGED
@@ -124,19 +124,150 @@ def extract_phones(text, region="GH"):
             pass
     return list(set(phones))
 
+# ---------- REPLACE scrape_contacts WITH THIS FUNCTION ----------
+def _fetch_url_text(url, timeout=10):
+    """Fetch url and return BeautifulSoup-parsed object and raw text (or (None, ""))"""
+    try:
+        r = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
+        if not r.ok or not r.text:
+            return None, ""
+        soup = BeautifulSoup(r.text, "html.parser")
+        text = soup.get_text(separator=" ")
+        text = " ".join(text.split())[:300000]
+        return soup, text
+    except Exception as e:
+        # network/DNS errors will be logged by caller
+        return None, ""
+
+def _extract_emails_from_soup(soup, text):
+    """Return list of unique candidate emails found in anchors, JSON-LD, meta, and text."""
+    emails = set()
+
+    # 1) mailto: links
+    try:
+        for a in soup.find_all("a", href=True):
+            href = a["href"].strip()
+            if href.startswith("mailto:"):
+                # mailto may contain name and params -> split
+                mail = href.split("mailto:")[1].split("?")[0]
+                if EMAIL_REGEX.fullmatch(mail):
+                    emails.add(mail)
+    except Exception:
+        pass
+
+    # 2) JSON-LD structured data (common for org pages)
+    try:
+        for script in soup.find_all("script", type="application/ld+json"):
+            try:
+                import json
+                data = json.loads(script.string or "{}")
+                # walk data for email fields (simple)
+                def walk(o):
+                    if isinstance(o, dict):
+                        for k,v in o.items():
+                            if isinstance(v, (dict,list)):
+                                walk(v)
+                            else:
+                                if isinstance(v, str) and EMAIL_REGEX.search(v):
+                                    emails.add(EMAIL_REGEX.search(v).group(0))
+                    elif isinstance(o, list):
+                        for it in o:
+                            walk(it)
+                walk(data)
+            except Exception:
+                continue
+    except Exception:
+        pass
+
+    # 3) meta tags
+    try:
+        for meta in soup.find_all("meta"):
+            for attr in ("content","name"):
+                if meta.get(attr) and isinstance(meta.get(attr), str):
+                    m = EMAIL_REGEX.search(meta.get(attr))
+                    if m:
+                        emails.add(m.group(0))
+    except Exception:
+        pass
+
+    # 4) text regex fallback
+    try:
+        for m in EMAIL_REGEX.findall(text or ""):
+            emails.add(m)
+    except Exception:
+        pass
+
+    return list(emails)
+
 def scrape_contacts(url, region="GH"):
+    """
+    Robustly scrape the given URL for emails and phones.
+    Strategy:
+      1) Fetch the page, extract mailto and regex emails.
+      2) If none found, try common contact/about/team URLs (bounded attempts).
+      3) Return {"emails": [..], "phones": [..]}
+    """
+    urls_tried = set()
     try:
-        res = requests.get(url, headers=HEADERS, timeout=12)
-        if not res.ok or not res.text:
+        # normalize url
+        orig = url or ""
+        if not orig:
             return {"emails": [], "phones": []}
-        text = BeautifulSoup(res.text, "html.parser").get_text(separator=" ")
-        text = " ".join(text.split())[:300000]
-        emails = list(set(EMAIL_REGEX.findall(text)))
-        phones = extract_phones(text, region)
-        return {"emails": emails, "phones": phones}
+        # ensure scheme
+        if not orig.startswith("http"):
+            orig = "http://" + orig
+        # first fetch main page
+        soup, text = _fetch_url_text(orig)
+        urls_tried.add(orig)
+        emails = []
+        phones = []
+
+        if soup or text:
+            emails = _extract_emails_from_soup(soup if soup else BeautifulSoup("", "html.parser"), text)
+            phones = extract_phones(text or "", region)
+
+        # If we have no emails, attempt a small set of common contact pages (bounded)
+        if not emails:
+            contact_paths = ["/contact", "/contact-us", "/contact-us/", "/contact.html",
+                             "/about", "/about-us", "/team", "/staff", "/contactus"]
+            # prefer same host; build base url
+            try:
+                from urllib.parse import urljoin
+                for p in contact_paths:
+                    next_url = urljoin(orig, p)
+                    if next_url in urls_tried:
+                        continue
+                    soup2, text2 = _fetch_url_text(next_url)
+                    urls_tried.add(next_url)
+                    if not soup2 and not text2:
+                        continue
+                    emails2 = _extract_emails_from_soup(soup2 if soup2 else BeautifulSoup("", "html.parser"), text2)
+                    phones2 = extract_phones(text2 or "", region)
+                    if emails2:
+                        emails = emails2
+                    if phones2 and not phones:
+                        phones = phones2
+                    # stop early if found emails
+                    if emails:
+                        break
+            except Exception:
+                pass
+
+        # Final dedup & sanitization: prefer readable emails
+        final_emails = []
+        for e in emails:
+            if isinstance(e, str) and EMAIL_REGEX.fullmatch(e):
+                final_emails.append(e.strip())
+        final_emails = list(dict.fromkeys(final_emails))  # preserve order, unique
+
+        final_phones = list(dict.fromkeys(phones))
+
+        return {"emails": final_emails, "phones": final_phones}
     except Exception as e:
         print(f"[scrape error] {url} -> {e}")
         return {"emails": [], "phones": []}
+# ---------- END scrape_contacts replacement ----------
+
 
 # ============================
 # NER + STORY → PROFESSIONS
@@ -256,13 +387,17 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
                 "source_query": r.get("query","")
             })
 
-    summary = generate_summary("; ".join(queries[:3]) + (" ..." if len(queries)>3 else ""),
-                               list(set(all_people)), list(set(all_orgs)), list(set(all_locs)))
-
-    # Sort by availability of email/phone
-    professionals.sort(key=lambda it: (0 if it["email"]!="Not found" else 1,
-                                       0 if it["phone"]!="Not found" else 1))
-    return {"summary": summary, "professionals": professionals, "queries_used": queries}
+    # Second pass: for entries with "Not found", try a focused contact path (sequentially, bounded)
+    for p in professionals:
+        if p["email"] == "Not found":
+            try:
+                contacts = scrape_contacts(p["url"], region)
+                if contacts["emails"]:
+                    p["email"] = contacts["emails"][0]
+                if contacts["phones"]:
+                    p["phone"] = contacts["phones"][0]
+            except Exception:
+                pass
 
 # ============================
 # DRAFT (mailto + .eml)
@@ -285,7 +420,8 @@ def build_mailto_and_eml(to_addr, subject, body, default_from="noreply@ally.ai")
         f.write(msg.as_bytes())
 
     # Create mailto link (this part is fine)
-    mailto = f"mailto:{to_addr}?subject={subject}&body={body}"
+    mailto = f"mailto:{urllib.parse.quote(to_addr)}?subject={urllib.parse.quote(subject or '')}&body={urllib.parse.quote(body or '')}"
+
 
     return mailto, fname
 
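For context on the new helpers: _extract_emails_from_soup layers four sources in priority order (mailto: anchors, JSON-LD blocks, meta tags, then a plain-text regex fallback). Below is a minimal, self-contained sketch of the first and last layers run against inline HTML; the EMAIL_REGEX pattern is an assumed stand-in for the one app.py defines earlier, and the HTML is invented.

import re
from bs4 import BeautifulSoup

# Assumed stand-in for the EMAIL_REGEX defined earlier in app.py.
EMAIL_REGEX = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

html = """
<a href="mailto:info@example.com?subject=Hello">Email us</a>
<p>Support: support@example.com</p>
"""
soup = BeautifulSoup(html, "html.parser")

emails = set()
# Layer 1: mailto anchors -- strip the scheme and any ?subject=... params.
for a in soup.find_all("a", href=True):
    if a["href"].startswith("mailto:"):
        emails.add(a["href"].split("mailto:")[1].split("?")[0])
# Layer 4: regex over the visible text catches addresses outside anchors.
emails.update(EMAIL_REGEX.findall(soup.get_text(" ")))
print(sorted(emails))  # ['info@example.com', 'support@example.com']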
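The contact-page fallback leans on urllib.parse.urljoin: each candidate path is absolute, so it replaces the path of whatever page was fetched first, and a deep landing page still resolves to site-level contact URLs. A quick sketch with an invented base URL:

from urllib.parse import urljoin

base = "http://example.com/news/2024/story.html"
for p in ["/contact", "/about-us", "/team"]:
    print(urljoin(base, p))
# http://example.com/contact
# http://example.com/about-us
# http://example.com/team

Keeping the path list short and tracking urls_tried bounds the extra requests, so the fallback never turns into a crawl.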
 
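The second pass added to find_professionals_from_story only re-scrapes entries still carrying the "Not found" placeholder. A sketch with scrape_contacts stubbed out (stub return values invented) shows the fill-in behavior without touching the network:

def scrape_contacts(url, region="GH"):  # stub standing in for the real scraper
    return {"emails": ["info@example.com"], "phones": ["+233201234567"]}

professionals = [
    {"url": "http://example.com", "email": "Not found", "phone": "Not found"},
    {"url": "http://other.example", "email": "known@other.example", "phone": "Not found"},
]
for p in professionals:
    if p["email"] == "Not found":  # guard keys on the email field only
        try:
            contacts = scrape_contacts(p["url"], "GH")
            if contacts["emails"]:
                p["email"] = contacts["emails"][0]
            if contacts["phones"]:
                p["phone"] = contacts["phones"][0]
        except Exception:
            pass
print(professionals[0])  # email and phone filled in
print(professionals[1])  # untouched: email was already known

Because the guard checks only the email field, an entry with a known email keeps its missing phone; that matches the loop in the diff.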
 
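The rewritten mailto line percent-encodes the address, subject, and body; without urllib.parse.quote, spaces, commas, and newlines would produce a malformed link. A stdlib-only sketch with invented values:

import urllib.parse

to_addr, subject, body = "ally@example.com", "Quick question", "Hi there,\nThanks!"
mailto = (f"mailto:{urllib.parse.quote(to_addr)}"
          f"?subject={urllib.parse.quote(subject or '')}"
          f"&body={urllib.parse.quote(body or '')}")
print(mailto)
# mailto:ally%40example.com?subject=Quick%20question&body=Hi%20there%2C%0AThanks%21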