crosse712 committed
Commit 0bfacdd · 1 parent: 08511e7

Configure for Hugging Face Spaces deployment

Files changed (4):
  1. Dockerfile +64 -37
  2. Dockerfile.original +61 -0
  3. README.md +50 -161
  4. README_ORIGINAL.md +167 -0
Dockerfile CHANGED
@@ -1,61 +1,88 @@
-# Multi-stage build for optimized image size
-FROM python:3.9-slim as builder
-
-WORKDIR /app
-
-# Install build dependencies
-RUN apt-get update && apt-get install -y \
-    gcc \
-    g++ \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-
-# Copy and install Python dependencies
-COPY backend/requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Production stage
 FROM python:3.9-slim

 WORKDIR /app

-# Install runtime dependencies
 RUN apt-get update && apt-get install -y \
     libgomp1 \
     libglib2.0-0 \
     libsm6 \
     libxext6 \
     libxrender1 \
-    libgomp1 \
-    wget \
     && rm -rf /var/lib/apt/lists/*

-# Copy Python packages from builder
-COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
-COPY --from=builder /usr/local/bin /usr/local/bin
-
-# Copy application code
 COPY backend/ ./backend/

-# Set environment variables for memory optimization
-ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
-ENV OMP_NUM_THREADS=4
-ENV MKL_NUM_THREADS=4
-ENV NUMEXPR_NUM_THREADS=4
-ENV TOKENIZERS_PARALLELISM=false
-
-# Enable extreme memory optimization
-ENV USE_EXTREME_OPTIMIZATION=true
-ENV MAX_MEMORY_GB=3
-
-WORKDIR /app/backend
-
-# Expose port
-EXPOSE 8000

 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
-    CMD curl -f http://localhost:8000/ || exit 1
-
-# Start the application with memory-limited configuration
-CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]

+# Hugging Face Spaces Dockerfile - Frontend + Backend
+FROM node:18-slim as frontend-builder
+
+WORKDIR /app/frontend
+COPY frontend/package*.json ./
+RUN npm ci
+COPY frontend/ ./
+RUN npm run build
+
+# Python backend stage
 FROM python:3.9-slim

 WORKDIR /app

+# Install system dependencies
 RUN apt-get update && apt-get install -y \
+    nginx \
+    supervisor \
     libgomp1 \
     libglib2.0-0 \
     libsm6 \
     libxext6 \
     libxrender1 \
+    curl \
     && rm -rf /var/lib/apt/lists/*

+# Install Python dependencies
+COPY backend/requirements.txt ./backend/
+RUN pip install --no-cache-dir -r backend/requirements.txt
+
+# Copy backend code
 COPY backend/ ./backend/

+# Copy frontend build from builder stage
+COPY --from=frontend-builder /app/frontend/dist /usr/share/nginx/html
+
+# Configure nginx to serve frontend and proxy to backend
+RUN echo 'server { \n\
+    listen 7860; \n\
+    root /usr/share/nginx/html; \n\
+    index index.html; \n\
+    \n\
+    location / { \n\
+        try_files $uri $uri/ /index.html; \n\
+    } \n\
+    \n\
+    location /api/ { \n\
+        proxy_pass http://127.0.0.1:8000/; \n\
+        proxy_http_version 1.1; \n\
+        proxy_set_header Upgrade $http_upgrade; \n\
+        proxy_set_header Connection "upgrade"; \n\
+        proxy_set_header Host $host; \n\
+        proxy_set_header X-Real-IP $remote_addr; \n\
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; \n\
+        proxy_set_header X-Forwarded-Proto $scheme; \n\
+        proxy_buffering off; \n\
+    } \n\
+}' > /etc/nginx/sites-available/default
+
+# Create supervisor config
+RUN echo '[supervisord] \n\
+nodaemon=true \n\
+\n\
+[program:nginx] \n\
+command=nginx -g "daemon off;" \n\
+autostart=true \n\
+autorestart=true \n\
+stderr_logfile=/var/log/nginx.err.log \n\
+stdout_logfile=/var/log/nginx.out.log \n\
+\n\
+[program:backend] \n\
+command=python -m uvicorn backend.app.main:app --host 127.0.0.1 --port 8000 \n\
+directory=/app \n\
+autostart=true \n\
+autorestart=true \n\
+stderr_logfile=/var/log/backend.err.log \n\
+stdout_logfile=/var/log/backend.out.log \n\
+environment=USE_EXTREME_OPTIMIZATION="true",MAX_MEMORY_GB="3"' > /etc/supervisor/conf.d/supervisord.conf
+
+# Expose Hugging Face Spaces default port
+EXPOSE 7860

 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:7860/ || exit 1
+
+# Start supervisor
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
Dockerfile.original ADDED
@@ -0,0 +1,61 @@
+# Multi-stage build for optimized image size
+FROM python:3.9-slim as builder
+
+WORKDIR /app
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+    gcc \
+    g++ \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy and install Python dependencies
+COPY backend/requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Production stage
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y \
+    libgomp1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    libgomp1 \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy Python packages from builder
+COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Copy application code
+COPY backend/ ./backend/
+
+# Set environment variables for memory optimization
+ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
+ENV OMP_NUM_THREADS=4
+ENV MKL_NUM_THREADS=4
+ENV NUMEXPR_NUM_THREADS=4
+ENV TOKENIZERS_PARALLELISM=false
+
+# Enable extreme memory optimization
+ENV USE_EXTREME_OPTIMIZATION=true
+ENV MAX_MEMORY_GB=3
+
+WORKDIR /app/backend
+
+# Expose port
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8000/ || exit 1
+
+# Start the application with memory-limited configuration
+CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
README.md CHANGED
@@ -1,167 +1,56 @@
-# FastVLM-7B Screen Observer
-
-A local web application for real-time screen observation and analysis using Apple's FastVLM-7B model via HuggingFace.
-
 ## Features
-
-- **Real-time Screen Capture**: Capture and analyze screen content on-demand or automatically
-- **FastVLM-7B Integration**: Uses Apple's vision-language model for intelligent screen analysis
-- **UI Element Detection**: Identifies buttons, links, forms, and other interface elements
-- **Text Extraction**: Captures text snippets from the screen
-- **Risk Detection**: Flags potential security or privacy concerns
-- **Automation Demo**: Demonstrates browser automation capabilities
-- **NDJSON Logging**: Comprehensive logging in NDJSON format with timestamps
-- **Export Functionality**: Download logs and captured frames as ZIP archive
-
-## Specifications
-
-- **Frontend**: React + Vite on `http://localhost:5173`
-- **Backend**: FastAPI on `http://localhost:8000`
-- **Model**: Apple FastVLM-7B with `trust_remote_code=True`
-- **Image Token**: `IMAGE_TOKEN_INDEX = -200`
-- **Output Format**: JSON with summary, ui_elements, text_snippets, risk_flags
-
-## Prerequisites
-
-- Python 3.8+
-- Node.js 16+
-- Chrome/Chromium browser (for automation demo)
-- 14GB+ RAM (required for FastVLM-7B model weights)
-- CUDA-capable GPU or Apple Silicon (recommended for FastVLM-7B)
-
-## Installation
-
-1. Clone this repository:
-```bash
-cd fastvlm-screen-observer
-```
-
-2. Install Python dependencies:
-```bash
-cd backend
-python3 -m venv venv
-source venv/bin/activate  # On Windows: venv\Scripts\activate
-pip install -r requirements.txt
-```
-
-3. Install Node.js dependencies:
-```bash
-cd ../frontend
-npm install
-```
-
-## Running the Application
-
-### Option 1: Using the start script (Recommended)
-```bash
-./start.sh
-```
-
-### Option 2: Manual start
-
-Terminal 1 - Backend:
-```bash
-cd backend
-source venv/bin/activate
-uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
-```
-
-Terminal 2 - Frontend:
-```bash
-cd frontend
-npm run dev
-```
-
-## Usage
-
-1. Open your browser and navigate to `http://localhost:5173`
-2. Click "Capture Screen" to analyze the current screen
-3. Enable "Auto Capture" for continuous monitoring
-4. Use "Run Demo" to see browser automation in action
-5. Click "Export Logs" to download analysis data

 ## API Endpoints
-
-- `GET /` - API status check
-- `POST /analyze` - Capture and analyze screen
-- `POST /demo` - Run automation demo
-- `GET /export` - Export logs as ZIP
-- `GET /logs/stream` - Stream logs via SSE
-- `GET /docs` - Interactive API documentation
-
-## Project Structure
-
-```
-fastvlm-screen-observer/
-├── backend/
-│   ├── app/
-│   │   └── main.py                # FastAPI application
-│   ├── models/
-│   │   ├── fastvlm_model.py       # FastVLM-7B main integration
-│   │   ├── fastvlm_optimized.py   # Memory optimization strategies
-│   │   ├── fastvlm_extreme.py     # Extreme optimization (4-bit)
-│   │   └── use_fastvlm_small.py   # Alternative 1.5B model
-│   ├── utils/
-│   │   ├── screen_capture.py      # Screen capture utilities
-│   │   ├── automation.py          # Browser automation
-│   │   └── logger.py              # NDJSON logging
-│   └── requirements.txt
-├── frontend/
-│   ├── src/
-│   │   ├── App.jsx                # React main component (with error handling)
-│   │   ├── ScreenCapture.jsx      # WebRTC screen capture
-│   │   └── App.css                # Styling
-│   ├── package.json
-│   └── vite.config.js
-├── logs/                          # Generated logs and frames
-├── start.sh                       # Startup script
-└── README.md
-
-```
-
-## Model Notes
-
-The application uses Apple's FastVLM-7B model with the following specifications:
-- **Model ID**: `apple/FastVLM-7B` from HuggingFace
-- **Tokenizer**: Qwen2Tokenizer (requires `transformers>=4.40.0`)
-- **IMAGE_TOKEN_INDEX**: -200 (special token for image placeholders)
-- **trust_remote_code**: True (required for model loading)
-
-### Memory Requirements:
-- **Minimum**: 14GB RAM for model weights
-- **Recommended**: 16GB+ RAM for smooth operation
-- The model will download automatically on first run (~14GB)
-
-### Current Implementation:
-The system includes multiple optimization strategies:
-1. **Standard Mode**: Full precision (float16) - requires 14GB+ RAM
-2. **Optimized Mode**: 8-bit quantization - requires 8-10GB RAM
-3. **Extreme Mode**: 4-bit quantization with disk offloading - requires 6-8GB RAM
-
-If the model fails to load due to memory constraints, the application will:
-- Display a user-friendly error message
-- Continue operating with graceful error handling
-- NOT show "ANALYSIS_ERROR" in risk flags
-
-## Acceptance Criteria
-
-✅ Local web app running on localhost:5173
-✅ FastAPI backend on localhost:8000
-✅ FastVLM-7B integration with trust_remote_code=True
-✅ IMAGE_TOKEN_INDEX = -200 configured
-✅ JSON output format with required fields
-✅ Demo automation functionality
-✅ NDJSON logging with timestamps
-✅ ZIP export with logs and frames
-✅ Project structure matches specifications
-
-## Troubleshooting
-
-- **Model Loading Issues**: Check GPU memory and CUDA installation
-- **Screen Capture Errors**: Ensure proper display permissions
-- **Browser Automation**: Install Chrome/Chromium and check WebDriver
-- **Port Conflicts**: Ensure ports 5173 and 8000 are available
-
-## License
-
-MIT

+---
+title: FastVLM Screen Observer
+emoji: 🖥️👁️
+colorFrom: blue
+colorTo: purple
+sdk: docker
+sdk_version: "3.9"
+app_port: 7860
+pinned: false
+license: mit
+models:
+  - apple/FastVLM-7B
+suggested_hardware: t4-small
+custom_headers:
+  cross-origin-embedder-policy: require-corp
+  cross-origin-opener-policy: same-origin
+---
+
+# FastVLM Screen Observer 🖥️👁️
+
+Real-time screen observation and analysis using Apple's FastVLM-7B model, optimized for low-RAM systems (3-8GB).

 ## Features
+- 🎯 Real-time screen capture and analysis
+- 🤖 FastVLM-7B vision-language model integration
+- 🔍 UI element detection
+- 📝 Text extraction from screenshots
+- ⚠️ Risk detection for security concerns
+- 🎮 Browser automation demo
+- 💾 Export logs and captured frames
+- 🚀 Optimized for 3-8GB RAM with 4-bit quantization
+
+## How to Use
+1. Click "Capture Screen" to analyze your current screen
+2. Enable "Auto Capture" for continuous monitoring
+3. Use "Run Demo" to see browser automation
+4. Export logs as ZIP archive
+
+## Model Information
+- **Model**: Apple FastVLM-7B
+- **Optimization**: Extreme memory optimization with 4-bit quantization
+- **Memory**: Runs on 3-8GB RAM systems
+- **Device**: Supports CPU, CUDA, and MPS (Apple Silicon)

 ## API Endpoints
+- `GET /api/` - Status check
+- `POST /api/analyze` - Screen analysis
+- `POST /api/demo` - Automation demo
+- `GET /api/export` - Export logs
+- `GET /api/logs/stream` - Stream logs via SSE

+## GitHub Repository
+https://github.com/crosse712/fastvlm-screen-observer

+---
+Built with ❤️ using FastAPI, React, and FastVLM-7B
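
For reference, a minimal Python client against the `/api/` routes listed in the new README. It assumes the Space is reachable at `BASE` and that `POST /api/analyze` accepts an empty JSON body; the exact request schema is not part of this commit, and the response is expected to carry the documented `summary`, `ui_elements`, `text_snippets`, and `risk_flags` fields.

```python
# Client sketch for the proxied API. Assumptions: the Space is served on
# port 7860 (or replace BASE with the deployed Space URL) and an empty
# JSON body is acceptable for /api/analyze.
import json
import urllib.request

BASE = "http://localhost:7860"  # replace with the deployed Space URL


def analyze() -> dict:
    """Trigger a screen analysis and return the parsed JSON result."""
    req = urllib.request.Request(
        f"{BASE}/api/analyze",
        data=json.dumps({}).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.load(resp)


def export_logs(path: str = "logs_export.zip") -> None:
    """Download the NDJSON logs + frames archive from GET /api/export."""
    with urllib.request.urlopen(f"{BASE}/api/export", timeout=60) as resp:
        with open(path, "wb") as fh:
            fh.write(resp.read())


if __name__ == "__main__":
    print(analyze())
    export_logs()
```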
README_ORIGINAL.md ADDED
@@ -0,0 +1,167 @@
+# FastVLM-7B Screen Observer
+
+A local web application for real-time screen observation and analysis using Apple's FastVLM-7B model via HuggingFace.
+
+## Features
+
+- **Real-time Screen Capture**: Capture and analyze screen content on-demand or automatically
+- **FastVLM-7B Integration**: Uses Apple's vision-language model for intelligent screen analysis
+- **UI Element Detection**: Identifies buttons, links, forms, and other interface elements
+- **Text Extraction**: Captures text snippets from the screen
+- **Risk Detection**: Flags potential security or privacy concerns
+- **Automation Demo**: Demonstrates browser automation capabilities
+- **NDJSON Logging**: Comprehensive logging in NDJSON format with timestamps
+- **Export Functionality**: Download logs and captured frames as ZIP archive
+
+## Specifications
+
+- **Frontend**: React + Vite on `http://localhost:5173`
+- **Backend**: FastAPI on `http://localhost:8000`
+- **Model**: Apple FastVLM-7B with `trust_remote_code=True`
+- **Image Token**: `IMAGE_TOKEN_INDEX = -200`
+- **Output Format**: JSON with summary, ui_elements, text_snippets, risk_flags
+
+## Prerequisites
+
+- Python 3.8+
+- Node.js 16+
+- Chrome/Chromium browser (for automation demo)
+- 14GB+ RAM (required for FastVLM-7B model weights)
+- CUDA-capable GPU or Apple Silicon (recommended for FastVLM-7B)
+
+## Installation
+
+1. Clone this repository:
+```bash
+cd fastvlm-screen-observer
+```
+
+2. Install Python dependencies:
+```bash
+cd backend
+python3 -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+pip install -r requirements.txt
+```
+
+3. Install Node.js dependencies:
+```bash
+cd ../frontend
+npm install
+```
+
+## Running the Application
+
+### Option 1: Using the start script (Recommended)
+```bash
+./start.sh
+```
+
+### Option 2: Manual start
+
+Terminal 1 - Backend:
+```bash
+cd backend
+source venv/bin/activate
+uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
+```
+
+Terminal 2 - Frontend:
+```bash
+cd frontend
+npm run dev
+```
+
+## Usage
+
+1. Open your browser and navigate to `http://localhost:5173`
+2. Click "Capture Screen" to analyze the current screen
+3. Enable "Auto Capture" for continuous monitoring
+4. Use "Run Demo" to see browser automation in action
+5. Click "Export Logs" to download analysis data
+
+## API Endpoints
+
+- `GET /` - API status check
+- `POST /analyze` - Capture and analyze screen
+- `POST /demo` - Run automation demo
+- `GET /export` - Export logs as ZIP
+- `GET /logs/stream` - Stream logs via SSE
+- `GET /docs` - Interactive API documentation
+
+## Project Structure
+
+```
+fastvlm-screen-observer/
+├── backend/
+│   ├── app/
+│   │   └── main.py                # FastAPI application
+│   ├── models/
+│   │   ├── fastvlm_model.py       # FastVLM-7B main integration
+│   │   ├── fastvlm_optimized.py   # Memory optimization strategies
+│   │   ├── fastvlm_extreme.py     # Extreme optimization (4-bit)
+│   │   └── use_fastvlm_small.py   # Alternative 1.5B model
+│   ├── utils/
+│   │   ├── screen_capture.py      # Screen capture utilities
+│   │   ├── automation.py          # Browser automation
+│   │   └── logger.py              # NDJSON logging
+│   └── requirements.txt
+├── frontend/
+│   ├── src/
+│   │   ├── App.jsx                # React main component (with error handling)
+│   │   ├── ScreenCapture.jsx      # WebRTC screen capture
+│   │   └── App.css                # Styling
+│   ├── package.json
+│   └── vite.config.js
+├── logs/                          # Generated logs and frames
+├── start.sh                       # Startup script
+└── README.md
+
+```
+
+## Model Notes
+
+The application uses Apple's FastVLM-7B model with the following specifications:
+- **Model ID**: `apple/FastVLM-7B` from HuggingFace
+- **Tokenizer**: Qwen2Tokenizer (requires `transformers>=4.40.0`)
+- **IMAGE_TOKEN_INDEX**: -200 (special token for image placeholders)
+- **trust_remote_code**: True (required for model loading)
+
+### Memory Requirements:
+- **Minimum**: 14GB RAM for model weights
+- **Recommended**: 16GB+ RAM for smooth operation
+- The model will download automatically on first run (~14GB)
+
+### Current Implementation:
+The system includes multiple optimization strategies:
+1. **Standard Mode**: Full precision (float16) - requires 14GB+ RAM
+2. **Optimized Mode**: 8-bit quantization - requires 8-10GB RAM
+3. **Extreme Mode**: 4-bit quantization with disk offloading - requires 6-8GB RAM
+
+If the model fails to load due to memory constraints, the application will:
+- Display a user-friendly error message
+- Continue operating with graceful error handling
+- NOT show "ANALYSIS_ERROR" in risk flags
+
+## Acceptance Criteria
+
+✅ Local web app running on localhost:5173
+✅ FastAPI backend on localhost:8000
+✅ FastVLM-7B integration with trust_remote_code=True
+✅ IMAGE_TOKEN_INDEX = -200 configured
+✅ JSON output format with required fields
+✅ Demo automation functionality
+✅ NDJSON logging with timestamps
+✅ ZIP export with logs and frames
+✅ Project structure matches specifications
+
+## Troubleshooting
+
+- **Model Loading Issues**: Check GPU memory and CUDA installation
+- **Screen Capture Errors**: Ensure proper display permissions
+- **Browser Automation**: Install Chrome/Chromium and check WebDriver
+- **Port Conflicts**: Ensure ports 5173 and 8000 are available
+
+## License
+
+MIT
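
The Model Notes translate roughly into the loading sketch below. This is a hedged approximation, not the repository's code (the actual logic lives in `backend/models/fastvlm_model.py` and `fastvlm_extreme.py` and may differ): image preprocessing and generation are model-specific and omitted, and the 4-bit path additionally requires the `bitsandbytes` package.

```python
# Approximate loading path per the Model Notes: apple/FastVLM-7B with
# trust_remote_code=True, IMAGE_TOKEN_INDEX = -200, optional 4-bit
# quantization for the "Extreme Mode" case. Illustrative only.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "apple/FastVLM-7B"
IMAGE_TOKEN_INDEX = -200  # special placeholder token id for image inputs


def load_model(extreme: bool = False):
    """Load tokenizer and model; `extreme=True` approximates the 4-bit mode."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    quant = (
        BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
        if extreme
        else None
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,   # required: model code ships with the checkpoint
        torch_dtype=torch.float16,
        device_map="auto",
        quantization_config=quant,
    )
    return tokenizer, model
```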