rayymaxx committed on
Commit 6103ef6 · verified · 1 Parent(s): b66d06d

Updated app again

Files changed (1)
  1. app.py +69 -3
app.py CHANGED
@@ -1,5 +1,25 @@
- from fastapi import FastAPI
- app = FastAPI()
+ # app.py (safe, use /tmp for cache)
+ import os
+ import logging
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import tempfile
+
+ # --- Put caches in a writable temp dir to avoid permission errors ---
+ TMP_CACHE = os.environ.get("HF_CACHE_DIR", os.path.join(tempfile.gettempdir(), "hf_cache"))
+ try:
+     os.makedirs(TMP_CACHE, exist_ok=True)
+ except Exception as e:
+     # if even this fails, fall back to tempfile.gettempdir()
+     TMP_CACHE = tempfile.gettempdir()
+
+ # export environment vars before importing transformers
+ os.environ["TRANSFORMERS_CACHE"] = TMP_CACHE
+ os.environ["HF_HOME"] = TMP_CACHE
+ os.environ["HF_DATASETS_CACHE"] = TMP_CACHE
+ os.environ["HF_METRICS_CACHE"] = TMP_CACHE
+
+ app = FastAPI(title="DirectEd LoRA API (safe startup)")
 
  @app.get("/health")
  def health():
@@ -7,4 +27,50 @@ def health():
 
  @app.get("/")
  def root():
-     return {"Minimal code running"}
+     return {"Status": "AI backend is running"}
+
+ class Request(BaseModel):
+     prompt: str
+     max_new_tokens: int = 150
+     temperature: float = 0.7
+
+ pipe = None
+
+ @app.on_event("startup")
+ def load_model():
+     global pipe
+     try:
+         # heavy imports done during startup
+         from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+         from peft import PeftModel
+
+         BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
+         ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # <-- replace with your adapter repo
+
+         tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+         base_model = AutoModelForCausalLM.from_pretrained(
+             BASE_MODEL,
+             device_map="auto",
+             low_cpu_mem_usage=True,
+             torch_dtype="auto",
+         )
+
+         model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)
+         model.eval()
+
+         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
+         logging.info("Model and adapter loaded successfully.")
+     except Exception as e:
+         logging.exception("Failed to load model at startup: %s", e)
+         pipe = None
+
+ @app.post("/generate")
+ def generate(req: Request):
+     if pipe is None:
+         raise HTTPException(status_code=503, detail="Model not loaded. Check logs.")
+     try:
+         out = pipe(req.prompt, max_new_tokens=req.max_new_tokens, temperature=req.temperature, do_sample=True)
+         return {"response": out[0]["generated_text"]}
+     except Exception as e:
+         logging.exception("Generation failed: %s", e)
+         raise HTTPException(status_code=500, detail=str(e))
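
For reference, a minimal client sketch against the endpoints this commit adds. It assumes the app is served locally, e.g. with uvicorn app:app --port 8000; the host, port, script name, and the requests dependency are assumptions for illustration, while the /health, / and /generate paths and the prompt, max_new_tokens and temperature fields come from the diff above.

# client_check.py - illustrative sketch only; assumes a local uvicorn instance on port 8000
import requests  # third-party dependency, assumed to be installed

BASE_URL = "http://localhost:8000"  # assumption: where the FastAPI app is running

# Liveness checks against the two GET routes defined in the diff
print(requests.get(f"{BASE_URL}/health").status_code)
print(requests.get(f"{BASE_URL}/").json())

# POST /generate takes the fields of the Request model
payload = {
    "prompt": "Explain what a LoRA adapter is in one sentence.",
    "max_new_tokens": 150,
    "temperature": 0.7,
}
resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
if resp.ok:
    print(resp.json()["response"])
else:
    # 503 means the startup hook could not load the model; 500 is a generation error
    print(resp.status_code, resp.json().get("detail"))

The error branch mirrors the guard in /generate: if load_model fails during startup, pipe stays None and the API reports a 503 instead of crashing the process.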