import os
import requests
from llama_cpp import Llama

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q2_K.gguf"
MODEL_PATH = f"./models/{MODEL_FILENAME}"

# Download the GGUF file manually from the Hub if it is not already cached locally
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
    # Only send an Authorization header if a token is actually set
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    os.makedirs("./models", exist_ok=True)
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print(f"✅ Model downloaded to {MODEL_PATH}")

# Load the model with llama-cpp
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=128,        # Limit context for smaller RAM/CPU
    n_batch=32,
    n_threads=2,      # Use 2 threads (you can try 1 if needed)
    n_gpu_layers=0,   # CPU-only
    chat_format="llama-2",
)

def generate_structure(prompt: str) -> str:
    """Run a plain completion against the local TinyLlama model and return the text."""
    output = llm.create_completion(
        prompt=prompt,
        temperature=0.7,
        max_tokens=512,
    )
    return output["choices"][0]["text"].strip()
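

# Illustrative usage sketch (not part of the original script): the sample prompt
# below is hypothetical and only shows how generate_structure() might be called
# when this file is run directly.
if __name__ == "__main__":
    sample_prompt = "List a simple folder structure for a small Python project."
    print(generate_structure(sample_prompt))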