import os

import requests
from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# === Constants ===
MODEL_REPO = "nilbot/gemma-2b-it-Q4_K.gguf"
MODEL_FILE = "gemma-2b-it-Q4_K.gguf"
MODEL_URL = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)
HF_TOKEN = os.getenv("HF_TOKEN")
# === Create model directory ===
os.makedirs(MODEL_DIR, exist_ok=True)
# === Manual download of GGUF ===
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    # Only send an Authorization header when a token is actually set; a literal
    # "Bearer None" header can cause the request to be rejected even for public files.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    response = requests.get(MODEL_URL, headers=headers, stream=True)
    if response.status_code != 200:
        raise RuntimeError(f"❌ Failed to download model. Status: {response.status_code}")
    # Stream the file to disk in 8 KiB chunks instead of holding the whole model in memory.
    with open(MODEL_PATH, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"✅ Model downloaded to {MODEL_PATH}")
# === Load model ===
print("🔧 Loading GGUF model...")
# n_ctx sets the context window in tokens; n_threads matches the available CPU core count.
llm = Llama(model_path=MODEL_PATH, n_ctx=512, n_threads=os.cpu_count())

# === Inference ===
class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    # llama_cpp returns a completion dict; the generated text lives in choices[0].
    output = llm(
        prompt=req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["</s>"],
    )
    return {"response": output["choices"][0]["text"].strip()}
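
# Example usage, assuming this file is saved as app.py and served with uvicorn
# (the port is an arbitrary choice):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#   curl -X POST http://localhost:8000/prompt \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a haiku about GPUs.", "max_tokens": 64}'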