Upload main.py
main.py
CHANGED
@@ -28,9 +28,9 @@ import threading
 import difflib
 from starlette.middleware.gzip import GZipMiddleware
 from transformers import pipeline as hf_pipeline
+import os
 os.environ.setdefault("OMP_NUM_THREADS", "1")
-from fastapi
-from fastapi import Response
+from fastapi import Path
 
 import torch
 torch.set_num_threads(2)
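For context on the thread settings kept in this hunk, here is a minimal sketch (not part of the diff) of the CPU-capping pattern; the exact values are illustrative, and the environment variable is usually set before the libraries that read it are loaded.

import os
os.environ.setdefault("OMP_NUM_THREADS", "1")  # read by OpenMP-backed libs (BLAS, tokenizers)

import torch
torch.set_num_threads(2)                       # intra-op parallelism for torch CPU ops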
@@ -53,17 +53,6 @@ _local_pipes = {}
 _news_clf = None
 _sbert = None
 
-
-# set a writable cache for tldextract and avoid network PSL fetches
-_TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
-try:
-    # suffix_list_urls=None => use cached public suffix list only (no HTTP on startup)
-    _tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
-except Exception:
-    # safe fallback: still parses domains without PSL refresh
-    _tld = tldextract.extract
-
-
 # --- Translation runtime flags / caches ---
 ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1"  # default OFF
 _hf_bad_models: Set[str] = set()
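The block removed above configured tldextract for offline use. A minimal sketch of that pattern, taking the cache path and suffix_list_urls=None from the deleted code; everything else here is illustrative:

import tldextract

# Local cache plus no public-suffix-list download at startup.
_tld = tldextract.TLDExtract(
    cache_dir="/data/tld_cache",   # writable cache location (assumed)
    suffix_list_urls=None,         # use only the cached/bundled suffix list
)

ext = _tld("https://news.bbc.co.uk/world")
print(ext.domain, ext.suffix)      # bbc co.uk

The diff drops this helper in favor of calling tldextract.extract(...) directly at the call sites further down.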
@@ -807,21 +796,10 @@ def cluster_id(cluster, enriched_articles):
 
 
 # ----------------- NLTK / VADER -----------------
-NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
-
-# Make sure NLTK looks in the baked, writable dir first
-if NLTK_DATA_DIR not in nltk.data.path:
-    nltk.data.path.insert(0, NLTK_DATA_DIR)
-
 try:
     nltk.data.find("sentiment/vader_lexicon")
 except LookupError:
-    #
-    try:
-        os.makedirs(NLTK_DATA_DIR, exist_ok=True)
-        nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
-    except Exception:
-        pass  # don't crash if download is blocked
+    nltk.download("vader_lexicon")  # one-time fetch in a fresh container
 
 try:
     _vader = SentimentIntensityAnalyzer()
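The new line downloads the lexicon to NLTK's default data directory. If that location is read-only in the container, a hedged variant of the removed approach still applies: point NLTK at a writable directory (the /app/nltk_data default below is taken from the deleted code) and download into it.

import os
import nltk

NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
nltk.data.path.insert(0, NLTK_DATA_DIR)      # search the writable dir first

try:
    nltk.data.find("sentiment/vader_lexicon")
except LookupError:
    os.makedirs(NLTK_DATA_DIR, exist_ok=True)
    nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)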
@@ -897,7 +875,7 @@ def geocode_source(source_text: str, domain: str = "", do_network: bool = False)
     if cache_key in domain_geo_cache:
         return domain_geo_cache[cache_key]
 
-    ext = _tld(domain or "")
+    ext = tldextract.extract(domain or "")
     fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
 
     # 0) Major outlets / domain map
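A small illustration (not from the diff, URL chosen arbitrarily) of what the join over ext.domain and ext.suffix produces:

import tldextract

ext = tldextract.extract("https://www.lemonde.fr/international/")
fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
print(ext.subdomain, ext.domain, ext.suffix)  # www lemonde fr
print(fqdn)                                   # lemonde.fr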
@@ -1478,7 +1456,7 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
     # Canonicalize URL & derive domain
     article_url = _canonical_url(a.get("url") or "")
     try:
-        ext = _tld(article_url)
+        ext = tldextract.extract(article_url)
         domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
     except Exception:
         domain = ""
@@ -1721,8 +1699,7 @@ def get_events(
     max_events: int = Query(15, ge=5, le=50),
     min_countries: int = Query(2, ge=1, le=50),
     min_articles: int = Query(2, ge=1, le=200),
-    speed: Speed = Query(Speed.
-    response: Response = None
+    speed: Speed = Query(Speed.balanced),
 ):
 
     # always build cache on untranslated data
@@ -1744,7 +1721,6 @@ def get_events(
     events = [e for e in events if (e["country_count"] >= min_countries and e["article_count"] >= min_articles)]
     events.sort(key=lambda e: e["article_count"], reverse=True)
 
-    response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
     return {"events": events[:max_events], "cache_key": "|".join(map(str, cache_key))}
 
 @app.get("/event/{event_id}")
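This hunk (and the matching one in get_news below) drops the Cache-Control header along with the injected Response parameter. For reference, a minimal sketch of the injected-Response pattern the old code relied on; the route path and payload here are illustrative only:

from fastapi import FastAPI, Response

app = FastAPI()

@app.get("/events")
def get_events(response: Response):
    # FastAPI injects `response` so headers can be attached to the JSON it returns.
    response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
    return {"events": []}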
@@ -1817,7 +1793,6 @@ def get_news(
     speed: Speed = Query(Speed.balanced),
     page: int = Query(1, ge=1),
     page_size: int = Query(120, ge=5, le=300),
-    response: Response = None
 ):
     enriched: List[Dict[str, Any]] = []
 
@@ -1885,7 +1860,6 @@ def get_news(
         for k in drop:
             i.pop(k, None)
 
-    response.headers["Cache-Control"] = "public, max-age=30, s-maxage=60, stale-while-revalidate=300"
     return {
         "items": items,
         "total": total,
@@ -2046,11 +2020,3 @@ def diag_translate():
         "libre_ok": bool(libre),
         "sample": libre or remote or local
     }
-
-@app.get("/", include_in_schema=False)
-def root():
-    return {"ok": True, "service": "newsglobe-backend"}
-
-@app.get("/favicon.ico", include_in_schema=False)
-def favicon():
-    return PlainTextResponse("", status_code=204)