Upload main.py
main.py
CHANGED
@@ -28,9 +28,9 @@ import threading
 import difflib
 from starlette.middleware.gzip import GZipMiddleware
 from transformers import pipeline as hf_pipeline
+import os
 os.environ.setdefault("OMP_NUM_THREADS", "1")
-from fastapi
-from fastapi import Response
+from fastapi import Path
 
 import torch
 torch.set_num_threads(2)
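For context on the thread settings kept in this hunk, here is a minimal sketch (not part of the diff) of the CPU-capping pattern; the exact values are illustrative, and the environment variable is usually set before the libraries that read it are loaded.

import os
os.environ.setdefault("OMP_NUM_THREADS", "1")  # read by OpenMP-backed libs (BLAS, tokenizers)

import torch
torch.set_num_threads(2)                       # intra-op parallelism for torch CPU ops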
@@ -53,17 +53,6 @@ _local_pipes = {}
 _news_clf = None
 _sbert = None
 
-
-# set a writable cache for tldextract and avoid network PSL fetches
-_TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
-try:
-    # suffix_list_urls=None => use cached public suffix list only (no HTTP on startup)
-    _tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
-except Exception:
-    # safe fallback: still parses domains without PSL refresh
-    _tld = tldextract.extract
-
-
 # --- Translation runtime flags / caches ---
 ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1"  # default OFF
 _hf_bad_models: Set[str] = set()
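The block removed above configured tldextract for offline use. A minimal sketch of that pattern, taking the cache path and suffix_list_urls=None from the deleted code; everything else here is illustrative:

import tldextract

# Local cache plus no public-suffix-list download at startup.
_tld = tldextract.TLDExtract(
    cache_dir="/data/tld_cache",   # writable cache location (assumed)
    suffix_list_urls=None,         # use only the cached/bundled suffix list
)

ext = _tld("https://news.bbc.co.uk/world")
print(ext.domain, ext.suffix)      # bbc co.uk

The diff drops this helper in favor of calling tldextract.extract(...) directly at the call sites further down.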
@@ -807,21 +796,10 @@ def cluster_id(cluster, enriched_articles):
 
 
 # ----------------- NLTK / VADER -----------------
-NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
-
-# Make sure NLTK looks in the baked, writable dir first
-if NLTK_DATA_DIR not in nltk.data.path:
-    nltk.data.path.insert(0, NLTK_DATA_DIR)
-
 try:
     nltk.data.find("sentiment/vader_lexicon")
 except LookupError:
-    #
-    try:
-        os.makedirs(NLTK_DATA_DIR, exist_ok=True)
-        nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
-    except Exception:
-        pass  # don't crash if download is blocked
+    nltk.download("vader_lexicon")  # one-time fetch in a fresh container
 
 try:
     _vader = SentimentIntensityAnalyzer()
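The new line downloads the lexicon to NLTK's default data directory. If that location is read-only in the container, a hedged variant of the removed approach still applies: point NLTK at a writable directory (the /app/nltk_data default below is taken from the deleted code) and download into it.

import os
import nltk

NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
nltk.data.path.insert(0, NLTK_DATA_DIR)      # search the writable dir first

try:
    nltk.data.find("sentiment/vader_lexicon")
except LookupError:
    os.makedirs(NLTK_DATA_DIR, exist_ok=True)
    nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)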
@@ -897,7 +875,7 @@ def geocode_source(source_text: str, domain: str = "", do_network: bool = False)
     if cache_key in domain_geo_cache:
         return domain_geo_cache[cache_key]
 
-    ext = _tld(domain or "")
+    ext = tldextract.extract(domain or "")
     fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
 
     # 0) Major outlets / domain map
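A small illustration (not from the diff, URL chosen arbitrarily) of what the join over ext.domain and ext.suffix produces:

import tldextract

ext = tldextract.extract("https://www.lemonde.fr/international/")
fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
print(ext.subdomain, ext.domain, ext.suffix)  # www lemonde fr
print(fqdn)                                   # lemonde.fr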
@@ -1478,7 +1456,7 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
     # Canonicalize URL & derive domain
     article_url = _canonical_url(a.get("url") or "")
     try:
-        ext = _tld(article_url)
+        ext = tldextract.extract(article_url)
         domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
     except Exception:
         domain = ""
@@ -1721,8 +1699,7 @@ def get_events(
     max_events: int = Query(15, ge=5, le=50),
     min_countries: int = Query(2, ge=1, le=50),
     min_articles: int = Query(2, ge=1, le=200),
-    speed: Speed = Query(Speed.
-    response: Response = None
+    speed: Speed = Query(Speed.balanced),
 ):
 
     # always build cache on untranslated data
@@ -1744,7 +1721,6 @@ def get_events(
     events = [e for e in events if (e["country_count"] >= min_countries and e["article_count"] >= min_articles)]
     events.sort(key=lambda e: e["article_count"], reverse=True)
 
-    response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
     return {"events": events[:max_events], "cache_key": "|".join(map(str, cache_key))}
 
 @app.get("/event/{event_id}")
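This hunk (and the matching one in get_news below) drops the Cache-Control header along with the injected Response parameter. For reference, a minimal sketch of the injected-Response pattern the old code relied on; the route path and payload here are illustrative only:

from fastapi import FastAPI, Response

app = FastAPI()

@app.get("/events")
def get_events(response: Response):
    # FastAPI injects `response` so headers can be attached to the JSON it returns.
    response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
    return {"events": []}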
@@ -1817,7 +1793,6 @@ def get_news(
     speed: Speed = Query(Speed.balanced),
     page: int = Query(1, ge=1),
     page_size: int = Query(120, ge=5, le=300),
-    response: Response = None
 ):
     enriched: List[Dict[str, Any]] = []
 
@@ -1885,7 +1860,6 @@ def get_news(
         for k in drop:
             i.pop(k, None)
 
-    response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
     return {
         "items": items,
         "total": total,
@@ -2046,11 +2020,3 @@ def diag_translate():
         "libre_ok": bool(libre),
         "sample": libre or remote or local
     }
-
-@app.get("/", include_in_schema=False)
-def root():
-    return {"ok": True, "service": "newsglobe-backend"}
-
-@app.get("/favicon.ico", include_in_schema=False)
-def favicon():
-    return PlainTextResponse("", status_code=204)