from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import requests
app = FastAPI()
# === Constants ===
MODEL_REPO = "nilbot/gemma-2b-it-Q4_K.gguf"
MODEL_FILE = "gemma-2b-it-Q4_K.gguf"
MODEL_URL = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)
HF_TOKEN = os.getenv("HF_TOKEN")
# === Create model directory ===
os.makedirs(MODEL_DIR, exist_ok=True)
# === Manual download of GGUF ===
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    # Only send an Authorization header when a token is actually set;
    # otherwise "Bearer None" would be sent even for public repos.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    response = requests.get(MODEL_URL, headers=headers, stream=True)
    if response.status_code != 200:
        raise RuntimeError(f"❌ Failed to download model. Status: {response.status_code}")
    with open(MODEL_PATH, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    print(f"✅ Model downloaded to {MODEL_PATH}")
# === Load model ===
print("🔧 Loading GGUF model...")
llm = Llama(model_path=MODEL_PATH, n_ctx=512, n_threads=os.cpu_count())
# === Inference ===
class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    output = llm(
        prompt=req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["</s>"],
    )
    return {"response": output["choices"][0]["text"].strip()}