Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Upload 2 files
Browse files- Dockerfile +20 -8
- main.py +19 -2
    	
        Dockerfile
    CHANGED
    
    | @@ -5,13 +5,14 @@ ENV PYTHONUNBUFFERED=1 \ | |
| 5 | 
             
                PIP_NO_CACHE_DIR=1 \
         | 
| 6 | 
             
                HF_HUB_DISABLE_TELEMETRY=1 \
         | 
| 7 | 
             
                PORT=7860 \
         | 
| 8 | 
            -
    # ✅ Writable + persistent on  | 
| 9 | 
             
                HF_HOME=/data/hf_cache \
         | 
| 10 | 
            -
                TRANSFORMERS_CACHE=/data/hf_cache \
         | 
| 11 | 
             
                SENTENCE_TRANSFORMERS_HOME=/data/hf_cache \
         | 
| 12 | 
            -
                NLTK_DATA=/data/nltk_data
         | 
|  | |
|  | |
| 13 |  | 
| 14 | 
            -
            #  | 
| 15 | 
             
            RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
         | 
| 16 | 
             
                rm -rf /var/lib/apt/lists/*
         | 
| 17 |  | 
| @@ -20,6 +21,7 @@ WORKDIR /app | |
| 20 | 
             
            # ---- Python deps ----
         | 
| 21 | 
             
            COPY requirements.txt ./
         | 
| 22 | 
             
            RUN python -m pip install --upgrade pip && \
         | 
|  | |
| 23 | 
             
                pip install torch --index-url https://download.pytorch.org/whl/cpu && \
         | 
| 24 | 
             
                pip install -r requirements.txt && \
         | 
| 25 | 
             
                pip install sentencepiece
         | 
| @@ -28,23 +30,33 @@ RUN python -m pip install --upgrade pip && \ | |
| 28 | 
             
            COPY . .
         | 
| 29 |  | 
| 30 | 
             
            # ✅ Make caches writable for the runtime user
         | 
| 31 | 
            -
            RUN mkdir -p /data/hf_cache /data/nltk_data && chmod -R 777 /data
         | 
| 32 |  | 
| 33 | 
            -
            #  | 
|  | |
| 34 | 
             
            RUN python - <<'PY'
         | 
| 35 | 
             
            from sentence_transformers import SentenceTransformer
         | 
| 36 | 
             
            SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
         | 
| 37 | 
             
            print("✅ SBERT cached")
         | 
| 38 | 
             
            PY
         | 
| 39 |  | 
|  | |
| 40 | 
             
            RUN python - <<'PY'
         | 
| 41 | 
             
            import os, nltk
         | 
| 42 | 
             
            os.makedirs(os.getenv("NLTK_DATA","/data/nltk_data"), exist_ok=True)
         | 
| 43 | 
            -
            nltk.download("vader_lexicon")
         | 
| 44 | 
             
            print("✅ VADER cached")
         | 
| 45 | 
             
            PY
         | 
| 46 |  | 
| 47 | 
            -
            #  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 48 | 
             
            RUN chmod -R 777 /data
         | 
| 49 |  | 
| 50 | 
             
            EXPOSE 7860
         | 
|  | |
| 5 | 
             
                PIP_NO_CACHE_DIR=1 \
         | 
| 6 | 
             
                HF_HUB_DISABLE_TELEMETRY=1 \
         | 
| 7 | 
             
                PORT=7860 \
         | 
| 8 | 
            +
                # ✅ Writable + persistent on Spaces
         | 
| 9 | 
             
                HF_HOME=/data/hf_cache \
         | 
|  | |
| 10 | 
             
                SENTENCE_TRANSFORMERS_HOME=/data/hf_cache \
         | 
| 11 | 
            +
                NLTK_DATA=/data/nltk_data \
         | 
| 12 | 
            +
                TLDEXTRACT_CACHE=/data/tld_cache \
         | 
| 13 | 
            +
                HOME=/data
         | 
| 14 |  | 
| 15 | 
            +
            # Handy tools
         | 
| 16 | 
             
            RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
         | 
| 17 | 
             
                rm -rf /var/lib/apt/lists/*
         | 
| 18 |  | 
|  | |
| 21 | 
             
            # ---- Python deps ----
         | 
| 22 | 
             
            COPY requirements.txt ./
         | 
| 23 | 
             
            RUN python -m pip install --upgrade pip && \
         | 
| 24 | 
            +
                # CPU-only PyTorch first
         | 
| 25 | 
             
                pip install torch --index-url https://download.pytorch.org/whl/cpu && \
         | 
| 26 | 
             
                pip install -r requirements.txt && \
         | 
| 27 | 
             
                pip install sentencepiece
         | 
|  | |
| 30 | 
             
            COPY . .
         | 
| 31 |  | 
| 32 | 
             
            # ✅ Make caches writable for the runtime user
         | 
| 33 | 
            +
            RUN mkdir -p /data/hf_cache /data/nltk_data /data/tld_cache && chmod -R 777 /data
         | 
| 34 |  | 
| 35 | 
            +
            # ---- Warm caches into the image layer ----
         | 
| 36 | 
            +
            # 1) Cache SBERT
         | 
| 37 | 
             
            RUN python - <<'PY'
         | 
| 38 | 
             
            from sentence_transformers import SentenceTransformer
         | 
| 39 | 
             
            SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
         | 
| 40 | 
             
            print("✅ SBERT cached")
         | 
| 41 | 
             
            PY
         | 
| 42 |  | 
| 43 | 
            +
            # 2) Cache NLTK VADER
         | 
| 44 | 
             
            RUN python - <<'PY'
         | 
| 45 | 
             
            import os, nltk
         | 
| 46 | 
             
            os.makedirs(os.getenv("NLTK_DATA","/data/nltk_data"), exist_ok=True)
         | 
| 47 | 
            +
            nltk.download("vader_lexicon", download_dir=os.getenv("NLTK_DATA","/data/nltk_data"))
         | 
| 48 | 
             
            print("✅ VADER cached")
         | 
| 49 | 
             
            PY
         | 
| 50 |  | 
| 51 | 
            +
            # 3) (Recommended) Pre-warm tweet-topic model so first request is instant
         | 
| 52 | 
            +
            RUN python - <<'PY'
         | 
| 53 | 
            +
            from transformers import pipeline
         | 
| 54 | 
            +
            p = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi", top_k=1)
         | 
| 55 | 
            +
            p("warmup")
         | 
| 56 | 
            +
            print("✅ Topic model cached")
         | 
| 57 | 
            +
            PY
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            # Ensure everything under /data is writable after warms
         | 
| 60 | 
             
            RUN chmod -R 777 /data
         | 
| 61 |  | 
| 62 | 
             
            EXPOSE 7860
         | 
    	
        main.py
    CHANGED
    
    | @@ -29,6 +29,7 @@ import difflib | |
| 29 | 
             
            from starlette.middleware.gzip import GZipMiddleware
         | 
| 30 | 
             
            from transformers import pipeline as hf_pipeline
         | 
| 31 | 
             
            os.environ.setdefault("OMP_NUM_THREADS", "1")
         | 
|  | |
| 32 |  | 
| 33 | 
             
            import torch
         | 
| 34 | 
             
            torch.set_num_threads(2)
         | 
| @@ -52,6 +53,14 @@ _news_clf = None | |
| 52 | 
             
            _sbert = None
         | 
| 53 |  | 
| 54 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 55 |  | 
| 56 |  | 
| 57 | 
             
            # --- Translation runtime flags / caches ---
         | 
| @@ -887,7 +896,7 @@ def geocode_source(source_text: str, domain: str = "", do_network: bool = False) | |
| 887 | 
             
                if cache_key in domain_geo_cache:
         | 
| 888 | 
             
                    return domain_geo_cache[cache_key]
         | 
| 889 |  | 
| 890 | 
            -
                ext =  | 
| 891 | 
             
                fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
         | 
| 892 |  | 
| 893 | 
             
                # 0) Major outlets / domain map
         | 
| @@ -1468,7 +1477,7 @@ def enrich_article(a, language=None, translate=False, target_lang=None): | |
| 1468 | 
             
                # Canonicalize URL & derive domain
         | 
| 1469 | 
             
                article_url = _canonical_url(a.get("url") or "")
         | 
| 1470 | 
             
                try:
         | 
| 1471 | 
            -
                    ext =  | 
| 1472 | 
             
                    domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
         | 
| 1473 | 
             
                except Exception:
         | 
| 1474 | 
             
                    domain = ""
         | 
| @@ -2032,3 +2041,11 @@ def diag_translate(): | |
| 2032 | 
             
                    "libre_ok": bool(libre),
         | 
| 2033 | 
             
                    "sample": libre or remote or local
         | 
| 2034 | 
             
                }
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 29 | 
             
            from starlette.middleware.gzip import GZipMiddleware
         | 
| 30 | 
             
            from transformers import pipeline as hf_pipeline
         | 
| 31 | 
             
            os.environ.setdefault("OMP_NUM_THREADS", "1")
         | 
| 32 | 
            +
            from fastapi.responses import PlainTextResponse
         | 
| 33 |  | 
| 34 | 
             
            import torch
         | 
| 35 | 
             
            torch.set_num_threads(2)
         | 
|  | |
| 53 | 
             
            _sbert = None
         | 
| 54 |  | 
| 55 |  | 
| 56 | 
            +
            # set a writable cache for tldextract and avoid network PSL fetches
         | 
| 57 | 
            +
            _TLD_CACHE = os.getenv("TLDEXTRACT_CACHE", "/data/tld_cache")
         | 
| 58 | 
            +
            try:
         | 
| 59 | 
            +
                # suffix_list_urls=None => use cached public suffix list only (no HTTP on startup)
         | 
| 60 | 
            +
                _tld = tldextract.TLDExtract(cache_dir=_TLD_CACHE, suffix_list_urls=None)
         | 
| 61 | 
            +
            except Exception:
         | 
| 62 | 
            +
                # safe fallback: still parses domains without PSL refresh
         | 
| 63 | 
            +
                _tld = tldextract.extract
         | 
| 64 |  | 
| 65 |  | 
| 66 | 
             
            # --- Translation runtime flags / caches ---
         | 
|  | |
| 896 | 
             
                if cache_key in domain_geo_cache:
         | 
| 897 | 
             
                    return domain_geo_cache[cache_key]
         | 
| 898 |  | 
| 899 | 
            +
                ext = _tld(domain or "")
         | 
| 900 | 
             
                fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
         | 
| 901 |  | 
| 902 | 
             
                # 0) Major outlets / domain map
         | 
|  | |
| 1477 | 
             
                # Canonicalize URL & derive domain
         | 
| 1478 | 
             
                article_url = _canonical_url(a.get("url") or "")
         | 
| 1479 | 
             
                try:
         | 
| 1480 | 
            +
                    ext = _tld(article_url)
         | 
| 1481 | 
             
                    domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
         | 
| 1482 | 
             
                except Exception:
         | 
| 1483 | 
             
                    domain = ""
         | 
|  | |
| 2041 | 
             
                    "libre_ok": bool(libre),
         | 
| 2042 | 
             
                    "sample": libre or remote or local
         | 
| 2043 | 
             
                }
         | 
| 2044 | 
            +
             | 
| 2045 | 
            +
            @app.get("/", include_in_schema=False)
         | 
| 2046 | 
            +
            def root():
         | 
| 2047 | 
            +
                return {"ok": True, "service": "newsglobe-backend"}
         | 
| 2048 | 
            +
             | 
| 2049 | 
            +
            @app.get("/favicon.ico", include_in_schema=False)
         | 
| 2050 | 
            +
            def favicon():
         | 
| 2051 | 
            +
                return PlainTextResponse("", status_code=204)
         | 
