Spaces:

Gamortsey
/

AllyAI_Help_finder

Sleeping

App Files Files Community

Gamortsey committed on Aug 19

Commit

511a82e

verified ·

1 Parent(s): f0a56b8

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -15

app.py CHANGED Viewed

@@ -124,19 +124,150 @@ def extract_phones(text, region="GH"):
             pass
     return list(set(phones))
 def scrape_contacts(url, region="GH"):
     try:
-        res = requests.get(url, headers=HEADERS, timeout=12)
-        if not res.ok or not res.text:
             return {"emails": [], "phones": []}
-        text = BeautifulSoup(res.text, "html.parser").get_text(separator=" ")
-        text = " ".join(text.split())[:300000]
-        emails = list(set(EMAIL_REGEX.findall(text)))
-        phones = extract_phones(text, region)
-        return {"emails": emails, "phones": phones}
     except Exception as e:
         print(f"[scrape error] {url} -> {e}")
         return {"emails": [], "phones": []}
 # ============================
 # NER + STORY → PROFESSIONS
@@ -256,13 +387,17 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
                 "source_query": r.get("query","")
             })
-    summary = generate_summary("; ".join(queries[:3]) + (" ..." if len(queries)>3 else ""),
-                               list(set(all_people)), list(set(all_orgs)), list(set(all_locs)))
-    # Sort by availability of email/phone
-    professionals.sort(key=lambda it: (0 if it["email"]!="Not found" else 1,
-                                       0 if it["phone"]!="Not found" else 1))
-    return {"summary": summary, "professionals": professionals, "queries_used": queries}
 # ============================
 # DRAFT (mailto + .eml)
@@ -285,7 +420,8 @@ def build_mailto_and_eml(to_addr, subject, body, default_from="noreply@ally.ai")
         f.write(msg.as_bytes())
     # Create mailto link (this part is fine)
-    mailto = f"mailto:{to_addr}?subject={subject}&body={body}"
     return mailto, fname

             pass
     return list(set(phones))
+# ---------- REPLACE scrape_contacts WITH THIS FUNCTION ----------
def _fetch_url_text(url, timeout=10, max_chars=300000):
    """Fetch *url* and return its parsed HTML together with its visible text.

    Args:
        url: Address to request (sent with the module-level ``HEADERS``).
        timeout: Timeout in seconds handed to ``requests.get``.
        max_chars: Maximum length of the returned text; longer pages are
            truncated.

    Returns:
        A ``(soup, text)`` tuple. ``soup`` is the ``BeautifulSoup`` tree built
        with the ``html.parser`` backend; ``text`` is the page text with every
        run of whitespace collapsed to a single space, cut to ``max_chars``.
        ``(None, "")`` is returned when the response is not OK, has an empty
        body, or any exception is raised while fetching or parsing.
    """
    try:
        response = requests.get(
            url, headers=HEADERS, timeout=timeout, allow_redirects=True
        )
        if not response.ok or not response.text:
            return None, ""
        soup = BeautifulSoup(response.text, "html.parser")
        text = " ".join(soup.get_text(separator=" ").split())[:max_chars]
        return soup, text
    except Exception as e:
        # The caller only ever sees (None, ""), so the failure has to be
        # reported here or it is lost entirely.
        print(f"[scrape error] {url} -> {e}")
        return None, ""
def _extract_emails_from_soup(soup, text):
    """Collect candidate e-mail addresses from a parsed page and its text.

    Sources are scanned in this order, and the returned list keeps that
    discovery order (first occurrence wins) so callers that pick ``[0]`` get a
    deterministic result:

      1. ``mailto:`` links in ``<a href=...>`` anchors
      2. JSON-LD ``<script type="application/ld+json">`` payloads
      3. ``content`` / ``name`` attributes of ``<meta>`` tags
      4. a regex scan of ``text``

    Args:
        soup: Parsed document exposing ``find_all`` (a ``BeautifulSoup``
            object), or ``None`` to skip the markup-based sources.
        text: Plain text of the page; ``None`` is treated as empty.

    Returns:
        List of unique address strings matched by ``EMAIL_REGEX``.
    """
    import json
    from urllib.parse import unquote

    found = {}  # dict used as an insertion-ordered set

    def add_matches(value):
        # Record every address occurring anywhere inside *value*.
        for match in EMAIL_REGEX.finditer(value):
            found.setdefault(match.group(0), None)

    def walk(node):
        # Depth-first visit of decoded JSON; strings may sit directly inside
        # lists (e.g. "email": ["a@x.org", "b@x.org"]), not only in dicts.
        if isinstance(node, dict):
            for child in node.values():
                walk(child)
        elif isinstance(node, list):
            for child in node:
                walk(child)
        elif isinstance(node, str):
            add_matches(node)

    if soup is not None:
        # 1) mailto: links. The scheme is case-insensitive, the address part
        #    may be percent-encoded and may list several comma-separated
        #    recipients; anything after "?" is header fields (subject, body).
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if not isinstance(href, str):
                continue
            href = href.strip()
            if href[:7].lower() != "mailto:":
                continue
            recipients = unquote(href[7:].split("?", 1)[0])
            for candidate in recipients.split(","):
                candidate = candidate.strip()
                if EMAIL_REGEX.fullmatch(candidate):
                    found.setdefault(candidate, None)

        # 2) JSON-LD structured data (common on organisation pages).
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                walk(json.loads(script.string or "{}"))
            except (ValueError, TypeError, RecursionError):
                # Malformed or pathologically nested payload: skip this
                # script and keep going with the remaining sources.
                continue

        # 3) meta tags
        for meta in soup.find_all("meta"):
            for attr in ("content", "name"):
                value = meta.get(attr)
                if isinstance(value, str):
                    add_matches(value)

    # 4) text regex fallback
    add_matches(text or "")

    return list(found)
 def scrape_contacts(url, region="GH"):
+    """
+    Robustly scrape the given URL for emails and phones.
+    Strategy:
+      1) Fetch the page, extract mailto and regex emails.
+      2) If none found, try common contact/about/team URLs (bounded attempts).
+      3) Return {"emails": [..], "phones": [..]}
+    """
+    urls_tried = set()
     try:
+        # normalize url
+        orig = url or ""
+        if not orig:
             return {"emails": [], "phones": []}
+        # ensure scheme
+        if not orig.startswith("http"):
+            orig = "http://" + orig
+        # first fetch main page
+        soup, text = _fetch_url_text(orig)
+        urls_tried.add(orig)
+        emails = []
+        phones = []
+        if soup or text:
+            emails = _extract_emails_from_soup(soup if soup else BeautifulSoup("", "html.parser"), text)
+            phones = extract_phones(text or "", region)
+        # If we have no emails, attempt a small set of common contact pages (bounded)
+        if not emails:
+            contact_paths = ["/contact", "/contact-us", "/contact-us/", "/contact.html",
+                             "/about", "/about-us", "/team", "/staff", "/contactus"]
+            # prefer same host; build base url
+            try:
+                from urllib.parse import urljoin
+                for p in contact_paths:
+                    next_url = urljoin(orig, p)
+                    if next_url in urls_tried:
+                        continue
+                    soup2, text2 = _fetch_url_text(next_url)
+                    urls_tried.add(next_url)
+                    if not soup2 and not text2:
+                        continue
+                    emails2 = _extract_emails_from_soup(soup2 if soup2 else BeautifulSoup("", "html.parser"), text2)
+                    phones2 = extract_phones(text2 or "", region)
+                    if emails2:
+                        emails = emails2
+                    if phones2 and not phones:
+                        phones = phones2
+                    # stop early if found emails
+                    if emails:
+                        break
+            except Exception:
+                pass
+        # Final dedup & sanitization: prefer readable emails
+        final_emails = []
+        for e in emails:
+            if isinstance(e, str) and EMAIL_REGEX.fullmatch(e):
+                final_emails.append(e.strip())
+        final_emails = list(dict.fromkeys(final_emails))  # preserve order unique
+        final_phones = list(dict.fromkeys(phones))
+        return {"emails": final_emails, "phones": final_phones}
     except Exception as e:
         print(f"[scrape error] {url} -> {e}")
         return {"emails": [], "phones": []}
+# ---------- END scrape_contacts replacement ----------
 # ============================
 # NER + STORY → PROFESSIONS
                 "source_query": r.get("query","")
             })
+    # Second pass: for entries with "Not found", try a focused contact path (sequentially, bounded)
+    for p in professionals:
+        if p["email"] == "Not found":
+            try:
+                contacts = scrape_contacts(p["url"], region)
+                if contacts["emails"]:
+                    p["email"] = contacts["emails"][0]
+                if contacts["phones"]:
+                    p["phone"] = contacts["phones"][0]
+            except Exception:
+                pass
 # ============================
 # DRAFT (mailto + .eml)
         f.write(msg.as_bytes())
     # Create mailto link (this part is fine)
+    mailto = f"mailto:{urllib.parse.quote(to_addr)}?subject={urllib.parse.quote(subject or '')}&body={urllib.parse.quote(body or '')}"
     return mailto, fname