crosse712
committed on
Commit
·
0bfacdd
1
Parent(s):
08511e7
Configure for Hugging Face Spaces deployment
Browse files
- Dockerfile +64 -37
- Dockerfile.original +61 -0
- README.md +50 -161
- README_ORIGINAL.md +167 -0
Dockerfile
CHANGED
|
@@ -1,61 +1,88 @@
|
|
| 1 |
-
#
|
| 2 |
-
FROM
|
| 3 |
|
| 4 |
-
WORKDIR /app
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
g++ \
|
| 10 |
-
git \
|
| 11 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
-
|
| 13 |
-
# Copy and install Python dependencies
|
| 14 |
-
COPY backend/requirements.txt .
|
| 15 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
| 17 |
-
#
|
| 18 |
FROM python:3.9-slim
|
| 19 |
|
| 20 |
WORKDIR /app
|
| 21 |
|
| 22 |
-
# Install
|
| 23 |
RUN apt-get update && apt-get install -y \
|
|
|
|
|
|
|
| 24 |
libgomp1 \
|
| 25 |
libglib2.0-0 \
|
| 26 |
libsm6 \
|
| 27 |
libxext6 \
|
| 28 |
libxrender1 \
|
| 29 |
-
|
| 30 |
-
wget \
|
| 31 |
&& rm -rf /var/lib/apt/lists/*
|
| 32 |
|
| 33 |
-
#
|
| 34 |
-
COPY
|
| 35 |
-
|
| 36 |
|
| 37 |
-
# Copy
|
| 38 |
COPY backend/ ./backend/
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
ENV OMP_NUM_THREADS=4
|
| 43 |
-
ENV MKL_NUM_THREADS=4
|
| 44 |
-
ENV NUMEXPR_NUM_THREADS=4
|
| 45 |
-
ENV TOKENIZERS_PARALLELISM=false
|
| 46 |
|
| 47 |
-
#
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
# Expose port
|
| 54 |
-
EXPOSE
|
| 55 |
|
| 56 |
# Health check
|
| 57 |
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 58 |
-
CMD curl -f http://localhost:
|
| 59 |
|
| 60 |
-
# Start
|
| 61 |
-
CMD ["
|
|
|
|
| 1 |
+
# Hugging Face Spaces Dockerfile - Frontend + Backend
|
| 2 |
+
FROM node:18-slim as frontend-builder
|
| 3 |
|
| 4 |
+
WORKDIR /app/frontend
|
| 5 |
+
COPY frontend/package*.json ./
|
| 6 |
+
RUN npm ci
|
| 7 |
+
COPY frontend/ ./
|
| 8 |
+
RUN npm run build
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
+
# Python backend stage
|
| 11 |
FROM python:3.9-slim
|
| 12 |
|
| 13 |
WORKDIR /app
|
| 14 |
|
| 15 |
+
# Install system dependencies
|
| 16 |
RUN apt-get update && apt-get install -y \
|
| 17 |
+
nginx \
|
| 18 |
+
supervisor \
|
| 19 |
libgomp1 \
|
| 20 |
libglib2.0-0 \
|
| 21 |
libsm6 \
|
| 22 |
libxext6 \
|
| 23 |
libxrender1 \
|
| 24 |
+
curl \
|
|
|
|
| 25 |
&& rm -rf /var/lib/apt/lists/*
|
| 26 |
|
| 27 |
+
# Install Python dependencies
|
| 28 |
+
COPY backend/requirements.txt ./backend/
|
| 29 |
+
RUN pip install --no-cache-dir -r backend/requirements.txt
|
| 30 |
|
| 31 |
+
# Copy backend code
|
| 32 |
COPY backend/ ./backend/
|
| 33 |
|
| 34 |
+
# Copy frontend build from builder stage
|
| 35 |
+
COPY --from=frontend-builder /app/frontend/dist /usr/share/nginx/html
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
# Configure nginx to serve frontend and proxy to backend
|
| 38 |
+
RUN echo 'server { \n\
|
| 39 |
+
listen 7860; \n\
|
| 40 |
+
root /usr/share/nginx/html; \n\
|
| 41 |
+
index index.html; \n\
|
| 42 |
+
\n\
|
| 43 |
+
location / { \n\
|
| 44 |
+
try_files $uri $uri/ /index.html; \n\
|
| 45 |
+
} \n\
|
| 46 |
+
\n\
|
| 47 |
+
location /api/ { \n\
|
| 48 |
+
proxy_pass http://127.0.0.1:8000/; \n\
|
| 49 |
+
proxy_http_version 1.1; \n\
|
| 50 |
+
proxy_set_header Upgrade $http_upgrade; \n\
|
| 51 |
+
proxy_set_header Connection "upgrade"; \n\
|
| 52 |
+
proxy_set_header Host $host; \n\
|
| 53 |
+
proxy_set_header X-Real-IP $remote_addr; \n\
|
| 54 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; \n\
|
| 55 |
+
proxy_set_header X-Forwarded-Proto $scheme; \n\
|
| 56 |
+
proxy_buffering off; \n\
|
| 57 |
+
} \n\
|
| 58 |
+
}' > /etc/nginx/sites-available/default
|
| 59 |
|
| 60 |
+
# Create supervisor config
|
| 61 |
+
RUN echo '[supervisord] \n\
|
| 62 |
+
nodaemon=true \n\
|
| 63 |
+
\n\
|
| 64 |
+
[program:nginx] \n\
|
| 65 |
+
command=nginx -g "daemon off;" \n\
|
| 66 |
+
autostart=true \n\
|
| 67 |
+
autorestart=true \n\
|
| 68 |
+
stderr_logfile=/var/log/nginx.err.log \n\
|
| 69 |
+
stdout_logfile=/var/log/nginx.out.log \n\
|
| 70 |
+
\n\
|
| 71 |
+
[program:backend] \n\
|
| 72 |
+
command=python -m uvicorn backend.app.main:app --host 127.0.0.1 --port 8000 \n\
|
| 73 |
+
directory=/app \n\
|
| 74 |
+
autostart=true \n\
|
| 75 |
+
autorestart=true \n\
|
| 76 |
+
stderr_logfile=/var/log/backend.err.log \n\
|
| 77 |
+
stdout_logfile=/var/log/backend.out.log \n\
|
| 78 |
+
environment=USE_EXTREME_OPTIMIZATION="true",MAX_MEMORY_GB="3"' > /etc/supervisor/conf.d/supervisord.conf
|
| 79 |
|
| 80 |
+
# Expose Hugging Face Spaces default port
|
| 81 |
+
EXPOSE 7860
|
| 82 |
|
| 83 |
# Health check
|
| 84 |
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 85 |
+
CMD curl -f http://localhost:7860/ || exit 1
|
| 86 |
|
| 87 |
+
# Start supervisor
|
| 88 |
+
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
Dockerfile.original
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build for optimized image size
|
| 2 |
+
FROM python:3.9-slim as builder
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install build dependencies
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
gcc \
|
| 9 |
+
g++ \
|
| 10 |
+
git \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
# Copy and install Python dependencies
|
| 14 |
+
COPY backend/requirements.txt .
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Production stage
|
| 18 |
+
FROM python:3.9-slim
|
| 19 |
+
|
| 20 |
+
WORKDIR /app
|
| 21 |
+
|
| 22 |
+
# Install runtime dependencies
|
| 23 |
+
RUN apt-get update && apt-get install -y \
|
| 24 |
+
libgomp1 \
|
| 25 |
+
libglib2.0-0 \
|
| 26 |
+
libsm6 \
|
| 27 |
+
libxext6 \
|
| 28 |
+
libxrender1 \
|
| 29 |
+
libgomp1 \
|
| 30 |
+
wget \
|
| 31 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 32 |
+
|
| 33 |
+
# Copy Python packages from builder
|
| 34 |
+
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
|
| 35 |
+
COPY --from=builder /usr/local/bin /usr/local/bin
|
| 36 |
+
|
| 37 |
+
# Copy application code
|
| 38 |
+
COPY backend/ ./backend/
|
| 39 |
+
|
| 40 |
+
# Set environment variables for memory optimization
|
| 41 |
+
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
|
| 42 |
+
ENV OMP_NUM_THREADS=4
|
| 43 |
+
ENV MKL_NUM_THREADS=4
|
| 44 |
+
ENV NUMEXPR_NUM_THREADS=4
|
| 45 |
+
ENV TOKENIZERS_PARALLELISM=false
|
| 46 |
+
|
| 47 |
+
# Enable extreme memory optimization
|
| 48 |
+
ENV USE_EXTREME_OPTIMIZATION=true
|
| 49 |
+
ENV MAX_MEMORY_GB=3
|
| 50 |
+
|
| 51 |
+
WORKDIR /app/backend
|
| 52 |
+
|
| 53 |
+
# Expose port
|
| 54 |
+
EXPOSE 8000
|
| 55 |
+
|
| 56 |
+
# Health check
|
| 57 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 58 |
+
CMD curl -f http://localhost:8000/ || exit 1
|
| 59 |
+
|
| 60 |
+
# Start the application with memory-limited configuration
|
| 61 |
+
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
|
README.md
CHANGED
|
@@ -1,167 +1,56 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
## Features
|
| 6 |
-
|
| 7 |
-
-
|
| 8 |
-
-
|
| 9 |
-
-
|
| 10 |
-
-
|
| 11 |
-
-
|
| 12 |
-
-
|
| 13 |
-
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
- **
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
- Python 3.8+
|
| 27 |
-
- Node.js 16+
|
| 28 |
-
- Chrome/Chromium browser (for automation demo)
|
| 29 |
-
- 14GB+ RAM (required for FastVLM-7B model weights)
|
| 30 |
-
- CUDA-capable GPU or Apple Silicon (recommended for FastVLM-7B)
|
| 31 |
-
|
| 32 |
-
## Installation
|
| 33 |
-
|
| 34 |
-
1. Clone this repository:
|
| 35 |
-
```bash
|
| 36 |
-
cd fastvlm-screen-observer
|
| 37 |
-
```
|
| 38 |
-
|
| 39 |
-
2. Install Python dependencies:
|
| 40 |
-
```bash
|
| 41 |
-
cd backend
|
| 42 |
-
python3 -m venv venv
|
| 43 |
-
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 44 |
-
pip install -r requirements.txt
|
| 45 |
-
```
|
| 46 |
-
|
| 47 |
-
3. Install Node.js dependencies:
|
| 48 |
-
```bash
|
| 49 |
-
cd ../frontend
|
| 50 |
-
npm install
|
| 51 |
-
```
|
| 52 |
-
|
| 53 |
-
## Running the Application
|
| 54 |
-
|
| 55 |
-
### Option 1: Using the start script (Recommended)
|
| 56 |
-
```bash
|
| 57 |
-
./start.sh
|
| 58 |
-
```
|
| 59 |
-
|
| 60 |
-
### Option 2: Manual start
|
| 61 |
-
|
| 62 |
-
Terminal 1 - Backend:
|
| 63 |
-
```bash
|
| 64 |
-
cd backend
|
| 65 |
-
source venv/bin/activate
|
| 66 |
-
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
|
| 67 |
-
```
|
| 68 |
-
|
| 69 |
-
Terminal 2 - Frontend:
|
| 70 |
-
```bash
|
| 71 |
-
cd frontend
|
| 72 |
-
npm run dev
|
| 73 |
-
```
|
| 74 |
-
|
| 75 |
-
## Usage
|
| 76 |
-
|
| 77 |
-
1. Open your browser and navigate to `http://localhost:5173`
|
| 78 |
-
2. Click "Capture Screen" to analyze the current screen
|
| 79 |
-
3. Enable "Auto Capture" for continuous monitoring
|
| 80 |
-
4. Use "Run Demo" to see browser automation in action
|
| 81 |
-
5. Click "Export Logs" to download analysis data
|
| 82 |
|
| 83 |
## API Endpoints
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
- `POST /demo` - Run automation demo
|
| 88 |
-
- `GET /export` - Export logs as ZIP
|
| 89 |
-
- `GET /logs/stream` - Stream logs via SSE
|
| 90 |
-
- `GET /docs` - Interactive API documentation
|
| 91 |
-
|
| 92 |
-
## Project Structure
|
| 93 |
-
|
| 94 |
-
```
|
| 95 |
-
fastvlm-screen-observer/
|
| 96 |
-
├── backend/
|
| 97 |
-
│ ├── app/
|
| 98 |
-
│ │ └── main.py # FastAPI application
|
| 99 |
-
│ ├── models/
|
| 100 |
-
│ │ ├── fastvlm_model.py # FastVLM-7B main integration
|
| 101 |
-
│ │ ├── fastvlm_optimized.py # Memory optimization strategies
|
| 102 |
-
│ │ ├── fastvlm_extreme.py # Extreme optimization (4-bit)
|
| 103 |
-
│ │ └── use_fastvlm_small.py # Alternative 1.5B model
|
| 104 |
-
│ ├── utils/
|
| 105 |
-
│ │ ├── screen_capture.py # Screen capture utilities
|
| 106 |
-
│ │ ├── automation.py # Browser automation
|
| 107 |
-
│ │ └── logger.py # NDJSON logging
|
| 108 |
-
│ └── requirements.txt
|
| 109 |
-
├── frontend/
|
| 110 |
-
│ ├── src/
|
| 111 |
-
│ │ ├── App.jsx # React main component (with error handling)
|
| 112 |
-
│ │ ├── ScreenCapture.jsx # WebRTC screen capture
|
| 113 |
-
│ │ └── App.css # Styling
|
| 114 |
-
│ ├── package.json
|
| 115 |
-
│ └── vite.config.js
|
| 116 |
-
├── logs/ # Generated logs and frames
|
| 117 |
-
├── start.sh # Startup script
|
| 118 |
-
└── README.md
|
| 119 |
-
|
| 120 |
-
```
|
| 121 |
-
|
| 122 |
-
## Model Notes
|
| 123 |
-
|
| 124 |
-
The application uses Apple's FastVLM-7B model with the following specifications:
|
| 125 |
-
- **Model ID**: `apple/FastVLM-7B` from HuggingFace
|
| 126 |
-
- **Tokenizer**: Qwen2Tokenizer (requires `transformers>=4.40.0`)
|
| 127 |
-
- **IMAGE_TOKEN_INDEX**: -200 (special token for image placeholders)
|
| 128 |
-
- **trust_remote_code**: True (required for model loading)
|
| 129 |
-
|
| 130 |
-
### Memory Requirements:
|
| 131 |
-
- **Minimum**: 14GB RAM for model weights
|
| 132 |
-
- **Recommended**: 16GB+ RAM for smooth operation
|
| 133 |
-
- The model will download automatically on first run (~14GB)
|
| 134 |
-
|
| 135 |
-
### Current Implementation:
|
| 136 |
-
The system includes multiple optimization strategies:
|
| 137 |
-
1. **Standard Mode**: Full precision (float16) - requires 14GB+ RAM
|
| 138 |
-
2. **Optimized Mode**: 8-bit quantization - requires 8-10GB RAM
|
| 139 |
-
3. **Extreme Mode**: 4-bit quantization with disk offloading - requires 6-8GB RAM
|
| 140 |
-
|
| 141 |
-
If the model fails to load due to memory constraints, the application will:
|
| 142 |
-
- Display a user-friendly error message
|
| 143 |
-
- Continue operating with graceful error handling
|
| 144 |
-
- NOT show "ANALYSIS_ERROR" in risk flags
|
| 145 |
-
|
| 146 |
-
## Acceptance Criteria
|
| 147 |
-
|
| 148 |
-
✅ Local web app running on localhost:5173
|
| 149 |
-
✅ FastAPI backend on localhost:8000
|
| 150 |
-
✅ FastVLM-7B integration with trust_remote_code=True
|
| 151 |
-
✅ IMAGE_TOKEN_INDEX = -200 configured
|
| 152 |
-
✅ JSON output format with required fields
|
| 153 |
-
✅ Demo automation functionality
|
| 154 |
-
✅ NDJSON logging with timestamps
|
| 155 |
-
✅ ZIP export with logs and frames
|
| 156 |
-
✅ Project structure matches specifications
|
| 157 |
-
|
| 158 |
-
## Troubleshooting
|
| 159 |
-
|
| 160 |
-
- **Model Loading Issues**: Check GPU memory and CUDA installation
|
| 161 |
-
- **Screen Capture Errors**: Ensure proper display permissions
|
| 162 |
-
- **Browser Automation**: Install Chrome/Chromium and check WebDriver
|
| 163 |
-
- **Port Conflicts**: Ensure ports 5173 and 8000 are available
|
| 164 |
-
|
| 165 |
-
## License
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: FastVLM Screen Observer
|
| 3 |
+
emoji: 🖥️👁️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
sdk_version: "3.9"
|
| 8 |
+
app_port: 7860
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
models:
|
| 12 |
+
- apple/FastVLM-7B
|
| 13 |
+
suggested_hardware: t4-small
|
| 14 |
+
custom_headers:
|
| 15 |
+
cross-origin-embedder-policy: require-corp
|
| 16 |
+
cross-origin-opener-policy: same-origin
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
# FastVLM Screen Observer 🖥️👁️
|
| 20 |
+
|
| 21 |
+
Real-time screen observation and analysis using Apple's FastVLM-7B model, optimized for low-RAM systems (3-8GB).
|
| 22 |
|
| 23 |
## Features
|
| 24 |
+
- 🎯 Real-time screen capture and analysis
|
| 25 |
+
- 🤖 FastVLM-7B vision-language model integration
|
| 26 |
+
- 🔍 UI element detection
|
| 27 |
+
- 📝 Text extraction from screenshots
|
| 28 |
+
- ⚠️ Risk detection for security concerns
|
| 29 |
+
- 🎮 Browser automation demo
|
| 30 |
+
- 💾 Export logs and captured frames
|
| 31 |
+
- 🚀 Optimized for 3-8GB RAM with 4-bit quantization
|
| 32 |
+
|
| 33 |
+
## How to Use
|
| 34 |
+
1. Click "Capture Screen" to analyze your current screen
|
| 35 |
+
2. Enable "Auto Capture" for continuous monitoring
|
| 36 |
+
3. Use "Run Demo" to see browser automation
|
| 37 |
+
4. Export logs as ZIP archive
|
| 38 |
+
|
| 39 |
+
## Model Information
|
| 40 |
+
- **Model**: Apple FastVLM-7B
|
| 41 |
+
- **Optimization**: Extreme memory optimization with 4-bit quantization
|
| 42 |
+
- **Memory**: Runs on 3-8GB RAM systems
|
| 43 |
+
- **Device**: Supports CPU, CUDA, and MPS (Apple Silicon)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
## API Endpoints
|
| 46 |
+
- `GET /api/` - Status check
|
| 47 |
+
- `POST /api/analyze` - Screen analysis
|
| 48 |
+
- `POST /api/demo` - Automation demo
|
| 49 |
+
- `GET /api/export` - Export logs
|
| 50 |
+
- `GET /api/logs/stream` - Stream logs via SSE
|
| 51 |
|
| 52 |
+
## GitHub Repository
|
| 53 |
+
https://github.com/crosse712/fastvlm-screen-observer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
---
|
| 56 |
+
Built with ❤️ using FastAPI, React, and FastVLM-7B
|
README_ORIGINAL.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FastVLM-7B Screen Observer
|
| 2 |
+
|
| 3 |
+
A local web application for real-time screen observation and analysis using Apple's FastVLM-7B model via HuggingFace.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Real-time Screen Capture**: Capture and analyze screen content on-demand or automatically
|
| 8 |
+
- **FastVLM-7B Integration**: Uses Apple's vision-language model for intelligent screen analysis
|
| 9 |
+
- **UI Element Detection**: Identifies buttons, links, forms, and other interface elements
|
| 10 |
+
- **Text Extraction**: Captures text snippets from the screen
|
| 11 |
+
- **Risk Detection**: Flags potential security or privacy concerns
|
| 12 |
+
- **Automation Demo**: Demonstrates browser automation capabilities
|
| 13 |
+
- **NDJSON Logging**: Comprehensive logging in NDJSON format with timestamps
|
| 14 |
+
- **Export Functionality**: Download logs and captured frames as ZIP archive
|
| 15 |
+
|
| 16 |
+
## Specifications
|
| 17 |
+
|
| 18 |
+
- **Frontend**: React + Vite on `http://localhost:5173`
|
| 19 |
+
- **Backend**: FastAPI on `http://localhost:8000`
|
| 20 |
+
- **Model**: Apple FastVLM-7B with `trust_remote_code=True`
|
| 21 |
+
- **Image Token**: `IMAGE_TOKEN_INDEX = -200`
|
| 22 |
+
- **Output Format**: JSON with summary, ui_elements, text_snippets, risk_flags
|
| 23 |
+
|
| 24 |
+
## Prerequisites
|
| 25 |
+
|
| 26 |
+
- Python 3.8+
|
| 27 |
+
- Node.js 16+
|
| 28 |
+
- Chrome/Chromium browser (for automation demo)
|
| 29 |
+
- 14GB+ RAM (required for FastVLM-7B model weights)
|
| 30 |
+
- CUDA-capable GPU or Apple Silicon (recommended for FastVLM-7B)
|
| 31 |
+
|
| 32 |
+
## Installation
|
| 33 |
+
|
| 34 |
+
1. Clone this repository:
|
| 35 |
+
```bash
|
| 36 |
+
cd fastvlm-screen-observer
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
2. Install Python dependencies:
|
| 40 |
+
```bash
|
| 41 |
+
cd backend
|
| 42 |
+
python3 -m venv venv
|
| 43 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 44 |
+
pip install -r requirements.txt
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
3. Install Node.js dependencies:
|
| 48 |
+
```bash
|
| 49 |
+
cd ../frontend
|
| 50 |
+
npm install
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## Running the Application
|
| 54 |
+
|
| 55 |
+
### Option 1: Using the start script (Recommended)
|
| 56 |
+
```bash
|
| 57 |
+
./start.sh
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Option 2: Manual start
|
| 61 |
+
|
| 62 |
+
Terminal 1 - Backend:
|
| 63 |
+
```bash
|
| 64 |
+
cd backend
|
| 65 |
+
source venv/bin/activate
|
| 66 |
+
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
Terminal 2 - Frontend:
|
| 70 |
+
```bash
|
| 71 |
+
cd frontend
|
| 72 |
+
npm run dev
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
## Usage
|
| 76 |
+
|
| 77 |
+
1. Open your browser and navigate to `http://localhost:5173`
|
| 78 |
+
2. Click "Capture Screen" to analyze the current screen
|
| 79 |
+
3. Enable "Auto Capture" for continuous monitoring
|
| 80 |
+
4. Use "Run Demo" to see browser automation in action
|
| 81 |
+
5. Click "Export Logs" to download analysis data
|
| 82 |
+
|
| 83 |
+
## API Endpoints
|
| 84 |
+
|
| 85 |
+
- `GET /` - API status check
|
| 86 |
+
- `POST /analyze` - Capture and analyze screen
|
| 87 |
+
- `POST /demo` - Run automation demo
|
| 88 |
+
- `GET /export` - Export logs as ZIP
|
| 89 |
+
- `GET /logs/stream` - Stream logs via SSE
|
| 90 |
+
- `GET /docs` - Interactive API documentation
|
| 91 |
+
|
| 92 |
+
## Project Structure
|
| 93 |
+
|
| 94 |
+
```
|
| 95 |
+
fastvlm-screen-observer/
|
| 96 |
+
├── backend/
|
| 97 |
+
│ ├── app/
|
| 98 |
+
│ │ └── main.py # FastAPI application
|
| 99 |
+
│ ├── models/
|
| 100 |
+
│ │ ├── fastvlm_model.py # FastVLM-7B main integration
|
| 101 |
+
│ │ ├── fastvlm_optimized.py # Memory optimization strategies
|
| 102 |
+
│ │ ├── fastvlm_extreme.py # Extreme optimization (4-bit)
|
| 103 |
+
│ │ └── use_fastvlm_small.py # Alternative 1.5B model
|
| 104 |
+
│ ├── utils/
|
| 105 |
+
│ │ ├── screen_capture.py # Screen capture utilities
|
| 106 |
+
│ │ ├── automation.py # Browser automation
|
| 107 |
+
│ │ └── logger.py # NDJSON logging
|
| 108 |
+
│ └── requirements.txt
|
| 109 |
+
├── frontend/
|
| 110 |
+
│ ├── src/
|
| 111 |
+
│ │ ├── App.jsx # React main component (with error handling)
|
| 112 |
+
│ │ ├── ScreenCapture.jsx # WebRTC screen capture
|
| 113 |
+
│ │ └── App.css # Styling
|
| 114 |
+
│ ├── package.json
|
| 115 |
+
│ └── vite.config.js
|
| 116 |
+
├── logs/ # Generated logs and frames
|
| 117 |
+
├── start.sh # Startup script
|
| 118 |
+
└── README.md
|
| 119 |
+
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
## Model Notes
|
| 123 |
+
|
| 124 |
+
The application uses Apple's FastVLM-7B model with the following specifications:
|
| 125 |
+
- **Model ID**: `apple/FastVLM-7B` from HuggingFace
|
| 126 |
+
- **Tokenizer**: Qwen2Tokenizer (requires `transformers>=4.40.0`)
|
| 127 |
+
- **IMAGE_TOKEN_INDEX**: -200 (special token for image placeholders)
|
| 128 |
+
- **trust_remote_code**: True (required for model loading)
|
| 129 |
+
|
| 130 |
+
### Memory Requirements:
|
| 131 |
+
- **Minimum**: 14GB RAM for model weights
|
| 132 |
+
- **Recommended**: 16GB+ RAM for smooth operation
|
| 133 |
+
- The model will download automatically on first run (~14GB)
|
| 134 |
+
|
| 135 |
+
### Current Implementation:
|
| 136 |
+
The system includes multiple optimization strategies:
|
| 137 |
+
1. **Standard Mode**: Full precision (float16) - requires 14GB+ RAM
|
| 138 |
+
2. **Optimized Mode**: 8-bit quantization - requires 8-10GB RAM
|
| 139 |
+
3. **Extreme Mode**: 4-bit quantization with disk offloading - requires 6-8GB RAM
|
| 140 |
+
|
| 141 |
+
If the model fails to load due to memory constraints, the application will:
|
| 142 |
+
- Display a user-friendly error message
|
| 143 |
+
- Continue operating with graceful error handling
|
| 144 |
+
- NOT show "ANALYSIS_ERROR" in risk flags
|
| 145 |
+
|
| 146 |
+
## Acceptance Criteria
|
| 147 |
+
|
| 148 |
+
✅ Local web app running on localhost:5173
|
| 149 |
+
✅ FastAPI backend on localhost:8000
|
| 150 |
+
✅ FastVLM-7B integration with trust_remote_code=True
|
| 151 |
+
✅ IMAGE_TOKEN_INDEX = -200 configured
|
| 152 |
+
✅ JSON output format with required fields
|
| 153 |
+
✅ Demo automation functionality
|
| 154 |
+
✅ NDJSON logging with timestamps
|
| 155 |
+
✅ ZIP export with logs and frames
|
| 156 |
+
✅ Project structure matches specifications
|
| 157 |
+
|
| 158 |
+
## Troubleshooting
|
| 159 |
+
|
| 160 |
+
- **Model Loading Issues**: Check GPU memory and CUDA installation
|
| 161 |
+
- **Screen Capture Errors**: Ensure proper display permissions
|
| 162 |
+
- **Browser Automation**: Install Chrome/Chromium and check WebDriver
|
| 163 |
+
- **Port Conflicts**: Ensure ports 5173 and 8000 are available
|
| 164 |
+
|
| 165 |
+
## License
|
| 166 |
+
|
| 167 |
+
MIT
|