MANOJSEQ committed on
Commit
54cb7c1
·
verified ·
1 Parent(s): ab6969d

Upload 2 files

Browse files
Files changed (2) hide show
  1. Dockerfile +20 -8
  2. main.py +19 -2
Dockerfile CHANGED
@@ -5,13 +5,14 @@ ENV PYTHONUNBUFFERED=1 \
5
  PIP_NO_CACHE_DIR=1 \
6
  HF_HUB_DISABLE_TELEMETRY=1 \
7
  PORT=7860 \
8
- # ✅ Writable + persistent on HF Spaces:
9
  HF_HOME=/data/hf_cache \
10
- TRANSFORMERS_CACHE=/data/hf_cache \
11
  SENTENCE_TRANSFORMERS_HOME=/data/hf_cache \
12
- NLTK_DATA=/data/nltk_data
 
 
13
 
14
- # small tools
15
  RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
16
  rm -rf /var/lib/apt/lists/*
17
 
@@ -20,6 +21,7 @@ WORKDIR /app
20
  # ---- Python deps ----
21
  COPY requirements.txt ./
22
  RUN python -m pip install --upgrade pip && \
 
23
  pip install torch --index-url https://download.pytorch.org/whl/cpu && \
24
  pip install -r requirements.txt && \
25
  pip install sentencepiece
@@ -28,23 +30,33 @@ RUN python -m pip install --upgrade pip && \
28
  COPY . .
29
 
30
  # ✅ Make caches writable for the runtime user
31
- RUN mkdir -p /data/hf_cache /data/nltk_data && chmod -R 777 /data
32
 
33
- # (optional) pre-warm models into /data caches to speed first run
 
34
  RUN python - <<'PY'
35
  from sentence_transformers import SentenceTransformer
36
  SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
37
  print("✅ SBERT cached")
38
  PY
39
 
 
40
  RUN python - <<'PY'
41
  import os, nltk
42
  os.makedirs(os.getenv("NLTK_DATA","/data/nltk_data"), exist_ok=True)
43
- nltk.download("vader_lexicon")
44
  print("✅ VADER cached")
45
  PY
46
 
47
- # ensure everything under /data is writable after warm
 
 
 
 
 
 
 
 
48
  RUN chmod -R 777 /data
49
 
50
  EXPOSE 7860
 
5
  PIP_NO_CACHE_DIR=1 \
6
  HF_HUB_DISABLE_TELEMETRY=1 \
7
  PORT=7860 \
8
+ # ✅ Writable + persistent on Spaces
9
  HF_HOME=/data/hf_cache \
 
10
  SENTENCE_TRANSFORMERS_HOME=/data/hf_cache \
11
+ NLTK_DATA=/data/nltk_data \
12
+ TLDEXTRACT_CACHE=/data/tld_cache \
13
+ HOME=/data
14
 
15
+ # Handy tools
16
  RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
17
  rm -rf /var/lib/apt/lists/*
18
 
 
21
  # ---- Python deps ----
22
  COPY requirements.txt ./
23
  RUN python -m pip install --upgrade pip && \
24
+ # CPU-only PyTorch first
25
  pip install torch --index-url https://download.pytorch.org/whl/cpu && \
26
  pip install -r requirements.txt && \
27
  pip install sentencepiece
 
30
  COPY . .
31
 
32
  # ✅ Make caches writable for the runtime user
33
+ RUN mkdir -p /data/hf_cache /data/nltk_data /data/tld_cache && chmod -R 777 /data
34
 
35
+ # ---- Warm caches into the image layer ----
36
+ # 1) Cache SBERT
37
  RUN python - <<'PY'
38
  from sentence_transformers import SentenceTransformer
39
  SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
40
  print("✅ SBERT cached")
41
  PY
42
 
43
+ # 2) Cache NLTK VADER
44
  RUN python - <<'PY'
45
  import os, nltk
46
  os.makedirs(os.getenv("NLTK_DATA","/data/nltk_data"), exist_ok=True)
47
+ nltk.download("vader_lexicon", download_dir=os.getenv("NLTK_DATA","/data/nltk_data"))
48
  print("✅ VADER cached")
49
  PY
50
 
51
+ # 3) (Recommended) Pre-warm tweet-topic model so first request is instant
52
+ RUN python - <<'PY'
53
+ from transformers import pipeline
54
+ p = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi", top_k=1)
55
+ p("warmup")
56
+ print("✅ Topic model cached")
57
+ PY
58
+
59
+ # Ensure everything under /data is writable after warms
60
  RUN chmod -R 777 /data
61
 
62
  EXPOSE 7860
main.py CHANGED
@@ -29,6 +29,7 @@ import difflib
29
  from starlette.middleware.gzip import GZipMiddleware
30
  from transformers import pipeline as hf_pipeline
31
  os.environ.setdefault("OMP_NUM_THREADS", "1")
 
32
 
33
  import torch
34
  torch.set_num_threads(2)
@@ -52,6 +53,14 @@ _news_clf = None
52
  _sbert = None
53
 
54
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  # --- Translation runtime flags / caches ---
@@ -887,7 +896,7 @@ def geocode_source(source_text: str, domain: str = "", do_network: bool = False)
887
  if cache_key in domain_geo_cache:
888
  return domain_geo_cache[cache_key]
889
 
890
- ext = tldextract.extract(domain or "")
891
  fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
892
 
893
  # 0) Major outlets / domain map
@@ -1468,7 +1477,7 @@ def enrich_article(a, language=None, translate=False, target_lang=None):
1468
  # Canonicalize URL & derive domain
1469
  article_url = _canonical_url(a.get("url") or "")
1470
  try:
1471
- ext = tldextract.extract(article_url)
1472
  domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
1473
  except Exception:
1474
  domain = ""
@@ -2032,3 +2041,11 @@ def diag_translate():
2032
  "libre_ok": bool(libre),
2033
  "sample": libre or remote or local
2034
  }
 
 
 
 
 
 
 
 
 
29
  from starlette.middleware.gzip import GZipMiddleware
30
  from transformers import pipeline as hf_pipeline
31
  os.environ.setdefault("OMP_NUM_THREADS", "1")
32
+ from fastapi.responses import PlainTextResponse
33
 
34
  import torch
35
  torch.set_num_threads(2)
 
53
  _sbert = None
54
 
55
 
56
+ # set a writable cache for tldextract and avoid network PSL fetches
57
+ _TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
58
+ try:
59
+ # suffix_list_urls=None => use cached public suffix list only (no HTTP on startup)
60
+ _tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
61
+ except Exception:
62
+ # safe fallback: still parses domains without PSL refresh
63
+ _tld = tldextract.extract
64
 
65
 
66
  # --- Translation runtime flags / caches ---
 
896
  if cache_key in domain_geo_cache:
897
  return domain_geo_cache[cache_key]
898
 
899
+ ext = _tld(domain or "")
900
  fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
901
 
902
  # 0) Major outlets / domain map
 
1477
  # Canonicalize URL & derive domain
1478
  article_url = _canonical_url(a.get("url") or "")
1479
  try:
1480
+ ext = _tld(article_url)
1481
  domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
1482
  except Exception:
1483
  domain = ""
 
2041
  "libre_ok": bool(libre),
2042
  "sample": libre or remote or local
2043
  }
2044
+
2045
+ @app.get("/", include_in_schema=False)
2046
+ def root():
2047
+ return {"ok": True, "service": "newsglobe-backend"}
2048
+
2049
+ @app.get("/favicon.ico", include_in_schema=False)
2050
+ def favicon():
2051
+ return PlainTextResponse("", status_code=204)