MANOJSEQ committed
Commit d9314fb · verified · 1 Parent(s): 3566a89

Upload main.py

Files changed (1)
  1. main.py +6 -40
main.py CHANGED
@@ -28,9 +28,9 @@ import threading
import difflib
from starlette.middleware.gzip import GZipMiddleware
from transformers import pipeline as hf_pipeline
+ import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
- from fastapi.responses import PlainTextResponse
- from fastapi import Response
+ from fastapi import Path

import torch
torch.set_num_threads(2)
@@ -53,17 +53,6 @@ _local_pipes = {}
_news_clf = None
_sbert = None

-
- # set a writable cache for tldextract and avoid network PSL fetches
- _TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
- try:
-     # suffix_list_urls=None => use cached public suffix list only (no HTTP on startup)
-     _tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
- except Exception:
-     # safe fallback: still parses domains without PSL refresh
-     _tld = tldextract.extract
-
-
# --- Translation runtime flags / caches ---
ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1"  # default OFF
_hf_bad_models: Set[str] = set()
@@ -807,21 +796,10 @@ def cluster_id(cluster, enriched_articles):


# ----------------- NLTK / VADER -----------------
- NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
-
- # Make sure NLTK looks in the baked, writable dir first
- if NLTK_DATA_DIR not in nltk.data.path:
-     nltk.data.path.insert(0, NLTK_DATA_DIR)
-
try:
    nltk.data.find("sentiment/vader_lexicon")
except LookupError:
-     # As a fallback, try downloading into the writable dir (won't run if already baked)
-     try:
-         os.makedirs(NLTK_DATA_DIR, exist_ok=True)
-         nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
-     except Exception:
-         pass  # don't crash if download is blocked
+     nltk.download("vader_lexicon")  # one-time fetch in a fresh container

try:
    _vader = SentimentIntensityAnalyzer()
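Note on the hunk above: the fallback is now a plain nltk.download() at import time, which needs network access and a writable default NLTK data path in a fresh container. If the old offline behaviour is ever wanted back, a hypothetical build-time pre-bake (reusing the /app/nltk_data path from the removed lines; not part of this commit) could look like:

# Hypothetical build-time step: bake the VADER lexicon into the image so the
# runtime nltk.download() fallback never fires.
import os
import nltk

NLTK_DATA_DIR = "/app/nltk_data"  # assumed path, taken from the removed code
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
# At runtime, exporting NLTK_DATA=/app/nltk_data lets nltk.data.find() see it.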
@@ -897,7 +875,7 @@ def geocode_source(source_text: str, domain: str = "", do_network: bool = False)
    if cache_key in domain_geo_cache:
        return domain_geo_cache[cache_key]

-     ext = _tld(domain or "")
+     ext = tldextract.extract(domain or "")
    fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""

    # 0) Major outlets / domain map
@@ -1478,7 +1456,7 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
    # Canonicalize URL & derive domain
    article_url = _canonical_url(a.get("url") or "")
    try:
-         ext = _tld(article_url)
+         ext = tldextract.extract(article_url)
        domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
    except Exception:
        domain = ""
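The two call sites above now go through the module-level tldextract.extract instead of the _tld wrapper removed in the earlier hunk. Per the comment on the removed lines, the default extractor may refresh the public suffix list over HTTP on first use, whereas the wrapper only read a local cache. A rough sketch of the two styles (the cache path mirrors the removed code; exact keyword values vary by tldextract release):

import tldextract

# Module-level helper (used after this commit): may fetch an updated public
# suffix list on the first call, then caches it for later calls.
ext = tldextract.extract("https://news.example.co.uk/story")
print(ext.domain, ext.suffix)  # -> example co.uk

# Offline-only variant (roughly what this commit removed): read the suffix
# list from a local cache directory and never hit the network at runtime.
offline = tldextract.TLDExtract(cache_dir="/data/tld_cache", suffix_list_urls=())
print(offline("https://news.example.co.uk/story").suffix)  # -> co.uk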
@@ -1721,8 +1699,7 @@ def get_events(
    max_events: int = Query(15, ge=5, le=50),
    min_countries: int = Query(2, ge=1, le=50),
    min_articles: int = Query(2, ge=1, le=200),
-     speed: Speed = Query(Speed.fast),
-     response: Response = None
+     speed: Speed = Query(Speed.balanced),
):

    # always build cache on untranslated data
@@ -1744,7 +1721,6 @@ def get_events(
    events = [e for e in events if (e["country_count"] >= min_countries and e["article_count"] >= min_articles)]
    events.sort(key=lambda e: e["article_count"], reverse=True)

-     response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
    return {"events": events[:max_events], "cache_key": "|".join(map(str, cache_key))}

@app.get("/event/{event_id}")
@@ -1817,7 +1793,6 @@ def get_news(
    speed: Speed = Query(Speed.balanced),
    page: int = Query(1, ge=1),
    page_size: int = Query(120, ge=5, le=300),
-     response: Response = None
):
    enriched: List[Dict[str, Any]] = []

@@ -1885,7 +1860,6 @@ def get_news(
    for k in drop:
        i.pop(k, None)

-     response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
    return {
        "items": items,
        "total": total,
@@ -2046,11 +2020,3 @@ def diag_translate():
        "libre_ok": bool(libre),
        "sample": libre or remote or local
    }
-
- @app.get("/", include_in_schema=False)
- def root():
-     return {"ok": True, "service": "newsglobe-backend"}
-
- @app.get("/favicon.ico", include_in_schema=False)
- def favicon():
-     return PlainTextResponse("", status_code=204)
 