from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os

REPO_ID = "google/gemma-2b-it-GGUF"
FILENAME = "gemma-2b-it.gguf"
HF_TOKEN = os.environ.get("HF_TOKEN")  # must be set in HF Spaces Secrets
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# Step 1: Auto-download the model if it is not already on disk
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_DIR, exist_ok=True)
    try:
        print("📦 Downloading model from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            token=HF_TOKEN,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False,
        )
        print("✅ Model downloaded.")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        raise

# Step 2: Load the model using llama-cpp-python
print("🤖 Loading GGUF model...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=512,      # context window size in tokens
    n_threads=4,    # CPU threads used for inference
    n_batch=512,    # prompt-processing batch size
    verbose=False,
)

# Step 3: FastAPI app
app = FastAPI()


class PromptRequest(BaseModel):
    prompt: str


@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    # The prompt is passed verbatim; gemma-*-it models normally expect the
    # <start_of_turn>/<end_of_turn> chat template for best results.
    prompt = req.prompt.strip()
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.6,
        top_p=0.95,
        stop=["<|endoftext|>", "```"],  # halt generation on these sequences
        echo=False,
    )
    result = output["choices"][0]["text"].strip()
    return {"response": result}
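
# Example usage — a minimal sketch, assuming this file is saved as app.py and
# served on port 7860 (the Hugging Face Spaces default); adjust the module
# name, host, and port to your deployment:
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/prompt \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF quantization in one sentence."}'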