File size: 1,617 Bytes
6bf37cd
deb83c9
d9d6b2c
1f23ef2
d9d6b2c
deb83c9
6bf37cd
 
1f23ef2
6bf37cd
1f23ef2
6bf37cd
 
1f23ef2
 
 
 
6bf37cd
 
 
 
 
 
 
1f23ef2
6bf37cd
1f23ef2
6bf37cd
1f23ef2
6bf37cd
 
 
deb83c9
6bf37cd
 
 
 
 
 
 
 
d9d6b2c
 
6bf37cd
 
 
 
231cb7b
deb83c9
072df7d
6bf37cd
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import threading

from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

# Hugging Face repo and file for the quantized Gemma 2B instruct model.
REPO_ID = "google/gemma-2b-it-GGUF"
FILENAME = "gemma-2b-it.gguf"
# Optional auth token; gated repos (like Gemma) require it — unset means anonymous.
HF_TOKEN = os.environ.get("HF_TOKEN")
MODEL_DIR = "./models"
# Separate hub cache under the model dir so everything lives in one place.
CACHE_DIR = "./models/.hf_cache"
# Final on-disk location the server loads the weights from.
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# Make sure directories exist
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)

# Step 1: Fetch the GGUF weights on first run; skip if already on disk.
if not os.path.exists(MODEL_PATH):
    try:
        print("📦 Downloading model from Hugging Face Hub...")
        # hf_hub_download returns the path it actually wrote — use it rather
        # than assuming the file landed exactly at MODEL_PATH.
        downloaded_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            token=HF_TOKEN,
            cache_dir=CACHE_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False  # deprecated no-op on recent hub versions; kept for older ones
        )
        print(f"✅ Model downloaded to {downloaded_path}")
        # Fail fast here with a clear message instead of letting Llama() blow
        # up later on a missing file.
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(
                f"Expected model file at {MODEL_PATH}, but download produced {downloaded_path}"
            )
    except Exception as e:
        # Log and re-raise: the server cannot start without the weights.
        print(f"❌ Download failed: {e}")
        raise

# Step 2: Load model using llama-cpp-python
print("🤖 Loading GGUF model...")
# One shared Llama instance for the whole process, created at startup.
_llama_opts = {
    "model_path": MODEL_PATH,
    "n_ctx": 512,      # context window size in tokens
    "n_threads": 4,    # CPU threads used for inference
    "n_batch": 512,    # prompt-evaluation batch size
    "verbose": False,  # silence llama.cpp's internal logging
}
llm = Llama(**_llama_opts)

# Step 3: FastAPI app
app = FastAPI()

class PromptRequest(BaseModel):
    """Request body for POST /prompt."""

    # Raw user text forwarded to the model (stripped before inference).
    prompt: str

# llama_cpp.Llama is not thread-safe, and FastAPI executes sync (`def`)
# endpoints in a threadpool, so concurrent requests could otherwise call
# into the model simultaneously. Serialize inference with a process lock.
_llm_lock = threading.Lock()

@app.post("/prompt")
def generate_prompt(req: PromptRequest):
    """Run one completion for the given prompt.

    Returns ``{"response": <generated text>}``.
    Raises HTTP 400 for an empty or whitespace-only prompt instead of
    spending an inference call on it.
    """
    prompt = req.prompt.strip()
    if not prompt:
        raise HTTPException(status_code=400, detail="prompt must not be empty")

    # Hold the lock only for the model call itself.
    with _llm_lock:
        output = llm(
            prompt,
            max_tokens=512,
            temperature=0.6,
            top_p=0.95,
            stop=["<|endoftext|>", "</s>", "```"],
            echo=False
        )

    result = output["choices"][0]["text"].strip()
    return {"response": result}