"""FastAPI service that downloads a GGUF build of Gemma from the Hugging Face
Hub and serves text completions with llama-cpp-python."""

import os

from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

# --- Model configuration ---
REPO_ID = "google/gemma-2b-it-GGUF"    # Hub repo that hosts the GGUF file
FILENAME = "gemma-2b-it.gguf"          # exact file to fetch from that repo
HF_TOKEN = os.environ.get("HF_TOKEN")  # Gemma repos are gated, so a token is required
MODEL_DIR = "./models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# Fetch the model once at startup; later runs reuse the local copy.
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_DIR, exist_ok=True)
    try:
        print("📦 Downloading model from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            token=HF_TOKEN,
            local_dir=MODEL_DIR,
            # Copy the real file instead of symlinking into the cache.
            # Deprecated (and ignored) in recent huggingface_hub releases.
            local_dir_use_symlinks=False,
        )
        print("✅ Model downloaded.")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        raise

print("🤖 Loading GGUF model...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=512,     # context window; prompt and completion share this token budget
    n_threads=4,   # CPU threads used for inference
    n_batch=512,   # prompt tokens processed per batch
    verbose=False,
)
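
# Note: gemma-2b-it is instruction-tuned, so raw completions can drift off
# course. If replies look poor, wrapping the prompt in Gemma's chat-turn
# markers usually helps (a sketch of the standard Gemma template; the endpoint
# below sends the prompt as-is):
#   f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"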

app = FastAPI()


class PromptRequest(BaseModel):
    prompt: str


@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    prompt = req.prompt.strip()

    output = llm(
        prompt,
        max_tokens=512,   # cap on generated tokens; also bounded by n_ctx
        temperature=0.6,  # mild randomness; lower is more deterministic
        top_p=0.95,       # nucleus sampling cutoff
        stop=["<|endoftext|>", "</s>", "```"],  # strings that end generation early
        echo=False,       # return only the completion, not the prompt
    )

    result = output["choices"][0]["text"].strip()
    return {"response": result}
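
# A sketch of running and calling the server, assuming this file is saved as
# main.py and uvicorn is installed:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl -X POST http://localhost:8000/prompt \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a haiku about llamas."}'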