ThongCoding committed
Commit 74d601a · 1 Parent(s): 2ccca99
Files changed (1)
  1. app.py +36 -37
app.py CHANGED
@@ -1,51 +1,50 @@
-from fastapi import FastAPI
-from pydantic import BaseModel
-from llama_cpp import Llama
 import os
 import requests
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+from llama_cpp import Llama

-app = FastAPI()
-
-# === Constants ===
-MODEL_REPO="nilbot/gemma-2b-it-Q4_K.gguf"
-MODEL_FILE="gemma-2b-it-Q4_K.gguf"
-MODEL_URL = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
-MODEL_DIR = "./models"
-MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)
-
+REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+MODEL_PATH = f"./models/{MODEL_FILENAME}"
 HF_TOKEN = os.getenv("HF_TOKEN")

-# === Create model directory ===
-os.makedirs(MODEL_DIR, exist_ok=True)
+# Ensure models folder exists
+os.makedirs("./models", exist_ok=True)
+os.chmod("./models", 0o777) # ensure write access

-# === Manual download of GGUF ===
+# Download model if missing
 if not os.path.exists(MODEL_PATH):
-    print("📦 Downloading GGUF model manually from Hugging Face...")
+    print("📦 Downloading TinyLlama Q4_K_M model...")
+    url = f"https://huggingface.co/{REPO_ID}/resolve/main/{MODEL_FILENAME}"
     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-    response = requests.get(MODEL_URL, headers=headers, stream=True)
-    if response.status_code != 200:
-        raise RuntimeError(f"❌ Failed to download model. Status: {response.status_code}")
+    r = requests.get(url, headers=headers, stream=True)
+    if r.status_code != 200:
+        raise RuntimeError(f"❌ Download failed: {r.status_code} {r.text[:200]}")
     with open(MODEL_PATH, "wb") as f:
-        for chunk in response.iter_content(chunk_size=8192):
+        for chunk in r.iter_content(8192):
             f.write(chunk)
-    print(f"✅ Model downloaded to {MODEL_PATH}")
+    print("✅ Model downloaded")

-# === Load model ===
-print("🔧 Loading GGUF model...")
-llm = Llama(model_path=MODEL_PATH, n_ctx=512, n_threads=os.cpu_count())
+# Load into llama-cpp
+print("🔧 Loading TinyLlama model...")
+llm = Llama(
+    model_path=MODEL_PATH,
+    n_ctx=512,
+    n_threads=os.cpu_count() or 1
+)

-# === Inference ===
-class PromptRequest(BaseModel):
-    prompt: str
-    max_tokens: int = 256
-    temperature: float = 0.7
+app = FastAPI()
+
+@app.get("/")
+async def root():
+    return {"status": "🟢 TinyLlama-1.1B Q4_K_M is ready"}

 @app.post("/prompt")
-def generate_prompt(req: PromptRequest):
-    output = llm(
-        prompt=req.prompt,
-        max_tokens=req.max_tokens,
-        temperature=req.temperature,
-        stop=["</s>"],
-    )
-    return {"response": output["choices"][0]["text"].strip()}
+async def prompt(req: Request):
+    body = await req.json()
+    prompt = body.get("prompt") or ""
+    if not prompt:
+        return JSONResponse(status_code=400, content={"error": "Missing 'prompt' field"})
+    resp = llm(prompt, max_tokens=512, stop=["</s>"])
+    return {"response": resp["choices"][0]["text"].strip()}
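A minimal sketch of how the reworked /prompt endpoint can be exercised once the app is running. The base URL below is an assumption (this commit does not pin a host or port); substitute whatever uvicorn binds to, e.g. port 7860 on a Hugging Face Space.

import requests

# Assumed base URL -- adjust to wherever the FastAPI app is actually served.
API_URL = "http://localhost:7860"

# The endpoint expects a JSON body with a "prompt" field and returns
# {"response": "..."}; a missing prompt yields a 400 error.
payload = {"prompt": "Summarize what a GGUF model file is."}
r = requests.post(f"{API_URL}/prompt", json=payload, timeout=300)
r.raise_for_status()
print(r.json()["response"])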