from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os
import requests

app = FastAPI()

# === Constants ===
MODEL_REPO = "nilbot/gemma-2b-it-Q4_K.gguf"
MODEL_FILE = "gemma-2b-it-Q4_K.gguf"
MODEL_URL = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)
HF_TOKEN = os.getenv("HF_TOKEN")

# === Create model directory ===
os.makedirs(MODEL_DIR, exist_ok=True)

# === Manual download of GGUF ===
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    # Only send the Authorization header when a token is actually set.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    response = requests.get(MODEL_URL, headers=headers, stream=True)
    if response.status_code != 200:
        raise RuntimeError(f"❌ Failed to download model. Status: {response.status_code}")
    with open(MODEL_PATH, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"✅ Model downloaded to {MODEL_PATH}")

# === Load model ===
print("🔧 Loading GGUF model...")
llm = Llama(model_path=MODEL_PATH, n_ctx=512, n_threads=os.cpu_count())

# === Inference ===
class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7


@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    output = llm(
        prompt=req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        # The stop string was empty in the source (likely stripped as an HTML tag);
        # Gemma's end-of-turn marker is assumed here.
        stop=["<end_of_turn>"],
    )
    return {"response": output["choices"][0]["text"].strip()}
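
# --- Usage sketch (assumptions: the file is saved as main.py and the server
# --- listens on localhost:8000; adjust module name and port as needed) ---
#
# Run the API:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Call the /prompt endpoint, for example:
#   curl -X POST http://localhost:8000/prompt \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a haiku about the sea", "max_tokens": 64}'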