import os

import requests
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

HF_TOKEN = os.environ.get("HF_TOKEN")
REPO_ID = "google/gemma-2b-it-GGUF"
MODEL_FILENAME = "gemma-2b-it.gguf"
LOCAL_MODEL_PATH = f"/code/models/{MODEL_FILENAME}"
CACHE_DIR = "/code/cache"

os.makedirs(os.path.dirname(LOCAL_MODEL_PATH), exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)


def download_model() -> str:
    """Download the GGUF model, preferring hf_hub_download with a manual HTTP fallback."""
    try:
        print("🔄 Attempting HF Hub download...")
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=MODEL_FILENAME,
            token=HF_TOKEN,
            cache_dir=CACHE_DIR,
        )
        print("✅ Downloaded via hf_hub_download:", model_path)
        return model_path
    except Exception as e:
        print("⚠️ hf_hub_download failed:", e)
        print("🔁 Falling back to manual download...")
        # Stream the file directly from the Hub's resolve endpoint.
        # Only send an Authorization header when a token is actually set,
        # so anonymous downloads don't send "Bearer None".
        headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
        url = f"https://huggingface.co/{REPO_ID}/resolve/main/{MODEL_FILENAME}"
        response = requests.get(url, headers=headers, stream=True)
        response.raise_for_status()
        with open(LOCAL_MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print("✅ Manual download completed:", LOCAL_MODEL_PATH)
        return LOCAL_MODEL_PATH


print("📦 Loading GGUF model...")
model_path = download_model()
llm = Llama(model_path=model_path)


def generate_structure(prompt: str) -> str:
    """Run a single completion against the loaded model and return the stripped text."""
    output = llm(prompt, max_tokens=512)
    return output["choices"][0]["text"].strip()
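

# --- Usage sketch (not part of the original script; the prompt below is an
# illustrative assumption). Shows one way to exercise generate_structure()
# when the file is run directly. Note that Gemma instruction-tuned models
# expect the <start_of_turn>/<end_of_turn> chat format, so a raw prompt like
# this is only a rough smoke test, not a production prompt template.
if __name__ == "__main__":
    demo_prompt = "Outline a directory structure for a small Python web service."
    print(generate_structure(demo_prompt))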