#!/usr/bin/env python3
"""
Use FastVLM-1.5B, the smaller FastVLM variant that fits on machines with limited RAM.
In float16 its 1.5B parameters need only ~3GB of memory while still performing well.
"""
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
# Use the smaller FastVLM model
MID = "apple/FastVLM-1.5B" # Smaller model - only 1.5B parameters
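# Sentinel id that the model's LLaVA-style remote code replaces with the projected
# image features (the value follows the upstream LLaVA convention)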
IMAGE_TOKEN_INDEX = -200
def load_fastvlm_small():
"""Load FastVLM-1.5B which works with limited RAM"""
print("Loading FastVLM-1.5B (optimized for limited RAM)...")
print("This model requires only ~3GB RAM\n")
# Load tokenizer
print("1. Loading tokenizer...")
tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
print(f" βœ“ Tokenizer loaded")
# Determine device
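    # Prefer CUDA/MPS with float16 to keep memory low; fall back to float32 on CPU,
    # where half precision is often slow or poorly supported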
if torch.cuda.is_available():
device = "cuda"
dtype = torch.float16
elif torch.backends.mps.is_available():
device = "mps"
dtype = torch.float16
else:
device = "cpu"
dtype = torch.float32
print(f"\n2. Loading model on {device}...")
print(" This will download ~3GB on first run...")
# Load model with memory optimization
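    # low_cpu_mem_usage=True loads the weights incrementally instead of materializing
    # a second full copy in RAM during initialization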
model = AutoModelForCausalLM.from_pretrained(
MID,
torch_dtype=dtype,
trust_remote_code=True,
low_cpu_mem_usage=True
)
# Move to device
model = model.to(device)
model.eval()
print(f" βœ“ FastVLM-1.5B loaded successfully!")
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
print(f" βœ“ Parameters: {total_params / 1e9:.2f}B")
return model, tok, device
def test_generation(model, tok, device):
"""Test the model with a sample image"""
print("\n3. Testing generation...")
# Create test image
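    # A solid-color image is enough for a smoke test; the image processor below
    # handles resizing to whatever the vision tower expects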
test_image = Image.new('RGB', (336, 336), color='blue')
# Prepare prompt
messages = [
{"role": "user", "content": "<image>\nDescribe this image."}
]
# Apply chat template
rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
pre, post = rendered.split("<image>", 1)
# Tokenize
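    # The tokenizer has no literal "<image>" token, so tokenize the text on either
    # side of the placeholder and splice the sentinel id in between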
pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(device)
    # Preprocess with the model's bundled image processor so resizing and normalization
    # match the vision tower (this mirrors the FastVLM model-card example) instead of a
    # hand-rolled CLIP-style transform
    pixel_values = model.get_vision_tower().image_processor(
        images=test_image, return_tensors="pt"
    )["pixel_values"].to(device, dtype=model.dtype)
print(" Generating response...")
# Generate
with torch.no_grad():
outputs = model.generate(
inputs=input_ids,
            images=pixel_values,  # the LLaVA-style generate() takes the image tensor via `images`
max_new_tokens=50,
temperature=0.7,
do_sample=True
)
# Decode
response = tok.decode(outputs[0], skip_special_tokens=True)
print(f" Response: {response[:100]}...")
print("\nβœ… FastVLM-1.5B is working correctly!")
if __name__ == "__main__":
print("="*60)
print("FastVLM-1.5B - Optimized for Limited RAM")
print("="*60)
print()
try:
model, tok, device = load_fastvlm_small()
test_generation(model, tok, device)
print("\n" + "="*60)
print("SUCCESS: FastVLM-1.5B is ready for use!")
print("="*60)
print("\nThis smaller model:")
print("β€’ Uses only ~3GB RAM")
print("β€’ Maintains good performance")
print("β€’ Works on your system")
print("β€’ Has same API as FastVLM-7B")
except Exception as e:
print(f"\nβœ— Error: {e}")
print("\nEven FastVLM-1.5B failed to load.")
print("Please close other applications and try again.")