from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import requests

app = FastAPI()

# === Constants ===
MODEL_REPO = "nilbot/gemma-2b-it-Q4_K.gguf"
MODEL_FILE = "gemma-2b-it-Q4_K.gguf"
MODEL_URL = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)

# Optional: only needed when the model repo is gated or private
HF_TOKEN = os.getenv("HF_TOKEN")

# === Create model directory ===
os.makedirs(MODEL_DIR, exist_ok=True)

# === Manual download of GGUF ===
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    # Only send an Authorization header when a token is actually configured
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    response = requests.get(MODEL_URL, headers=headers, stream=True)
    if response.status_code != 200:
        raise RuntimeError(f"❌ Failed to download model. Status: {response.status_code}")
    with open(MODEL_PATH, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    print(f"✅ Model downloaded to {MODEL_PATH}")

# === Load model ===
print("🔧 Loading GGUF model...")
llm = Llama(model_path=MODEL_PATH, n_ctx=512, n_threads=os.cpu_count())

# === Inference ===
class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    output = llm(
        prompt=req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["</s>"],
    )
    return {"response": output["choices"][0]["text"].strip()}