rayymaxx committed
Commit 509b346 · 1 Parent(s): 29e3684

Made more fixes

Files changed (1)
1. app.py → app..py  +17 -19

app.py → app..py  RENAMED
@@ -1,11 +1,10 @@
-# app.py - robust pattern (use this as your main app)
 import os
+import logging
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-import logging
 
-# Configure writable cache for HF libs (prevents '/.cache' PermissionError in Spaces)
-CACHE_DIR = "/app/cache"
+# --- Use a writable cache directory (current working dir) ---
+CACHE_DIR = os.path.join(os.getcwd(), "cache")  # /code/cache in the Dockerfile layout
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
 os.environ["HF_HOME"] = CACHE_DIR
@@ -24,45 +23,44 @@ class Request(BaseModel):
     max_new_tokens: int = 150
     temperature: float = 0.7
 
-# global placeholders set at startup
-model = None
-tokenizer = None
+# Globals to be initialized on startup
 pipe = None
 
-# delayed import & model load on startup
 @app.on_event("startup")
 def load_model():
-    global model, tokenizer, pipe
+    global pipe
     try:
-        # import heavy libraries here so import(app) is lightweight
+        # heavy imports inside startup so module import stays lightweight
         from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
         from peft import PeftModel
 
-        BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
-        ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # replace
+        BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"  # unchanged
+        ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # <<< replace with your adapter repo
 
-        # load tokenizer and base model (wrap in try/except to catch runtime issues)
+        # load tokenizer + base model then attach adapter
         tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
         base_model = AutoModelForCausalLM.from_pretrained(
-            BASE_MODEL, device_map="auto", torch_dtype="auto"
+            BASE_MODEL,
+            device_map="auto",
+            low_cpu_mem_usage=True,
+            torch_dtype="auto",
        )
+
        model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)
+        model.eval()
 
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
 
        logging.info("Model and adapter loaded successfully.")
    except Exception as e:
-        # Log the exception - the Space logs will show this traceback
+        # Keep server up; logs will show why load failed
        logging.exception("Failed to load model at startup: %s", e)
-        # keep app importable; /generate will return 500 until model loads
-        model = None
-        tokenizer = None
        pipe = None
 
 @app.post("/generate")
 def generate(req: Request):
     if pipe is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet. Check logs.")
+        raise HTTPException(status_code=503, detail="Model not loaded yet. Check Space logs.")
     try:
        out = pipe(req.prompt, max_new_tokens=req.max_new_tokens, temperature=req.temperature, do_sample=True)
        return {"response": out[0]["generated_text"]}