Spaces:

rayymaxx
/

DirectEd-AI-LoRA-API

Sleeping

App Files Community

rayymaxx committed on Aug 23

Commit

509b346

1 Parent(s): 29e3684

Made more fixes

Browse files

Files changed (1) hide show

app.py → app..py +17 -19

app.py → app..py RENAMED Viewed

@@ -1,11 +1,10 @@
-# app.py — robust pattern (use this as your main app)
 import os
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-import logging
-# Configure writable cache for HF libs (prevents '/.cache' PermissionError in Spaces)
-CACHE_DIR = "/app/cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
 os.environ["HF_HOME"] = CACHE_DIR
@@ -24,45 +23,44 @@ class Request(BaseModel):
     max_new_tokens: int = 150
     temperature: float = 0.7
-# global placeholders set at startup
-model = None
-tokenizer = None
 pipe = None
-# delayed import & model load on startup
 @app.on_event("startup")
 def load_model():
-    global model, tokenizer, pipe
     try:
-        # import heavy libraries here so import(app) is lightweight
         from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
         from peft import PeftModel
-        BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
-        ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # replace
-        # load tokenizer and base model (wrap in try/except to catch runtime issues)
         tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
         base_model = AutoModelForCausalLM.from_pretrained(
-            BASE_MODEL, device_map="auto", torch_dtype="auto"
         )
         model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)
         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
         logging.info("Model and adapter loaded successfully.")
     except Exception as e:
-        # Log the exception — the Space logs will show this traceback
         logging.exception("Failed to load model at startup: %s", e)
-        # keep app importable; /generate will return 500 until model loads
-        model = None
-        tokenizer = None
         pipe = None
 @app.post("/generate")
 def generate(req: Request):
     if pipe is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet. Check logs.")
     try:
         out = pipe(req.prompt, max_new_tokens=req.max_new_tokens, temperature=req.temperature, do_sample=True)
         return {"response": out[0]["generated_text"]}

 import os
+import logging
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
+# --- Use a writable cache directory (current working dir) ---
+CACHE_DIR = os.path.join(os.getcwd(), "cache")  # /code/cache in the Dockerfile layout
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
 os.environ["HF_HOME"] = CACHE_DIR
     max_new_tokens: int = 150
     temperature: float = 0.7
+# Globals to be initialized on startup
 pipe = None
 @app.on_event("startup")
 def load_model():
+    global pipe
     try:
+        # heavy imports inside startup so module import stays lightweight
         from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
         from peft import PeftModel
+        BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"   # unchanged
+        ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"          # <<< replace with your adapter repo
+        # load tokenizer + base model then attach adapter
         tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
         base_model = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL,
+            device_map="auto",
+            low_cpu_mem_usage=True,
+            torch_dtype="auto",
         )
         model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)
+        model.eval()
         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
         logging.info("Model and adapter loaded successfully.")
     except Exception as e:
+        # Keep server up; logs will show why load failed
         logging.exception("Failed to load model at startup: %s", e)
         pipe = None
 @app.post("/generate")
 def generate(req: Request):
     if pipe is None:
+        raise HTTPException(status_code=503, detail="Model not loaded yet. Check Space logs.")
     try:
         out = pipe(req.prompt, max_new_tokens=req.max_new_tokens, temperature=req.temperature, do_sample=True)
         return {"response": out[0]["generated_text"]}