mihailik committed
Commit
57f49ec
·
1 Parent(s): 45a40b2

Adding WebLLM backend as an option.

package-lock.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "name": "localm",
3
- "version": "1.1.35",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
  "name": "localm",
9
- "version": "1.1.35",
10
  "license": "ISC",
11
  "dependencies": {
12
  "@huggingface/transformers": "^3.7.2",
13
  "@milkdown/crepe": "^7.15.3",
 
14
  "esbuild": "^0.25.9"
15
  }
16
  },
@@ -1901,6 +1902,15 @@
1901
  "tslib": "^2.8.1"
1902
  }
1903
  },
1904
  "node_modules/@protobufjs/aspromise": {
1905
  "version": "1.1.2",
1906
  "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
@@ -2599,6 +2609,19 @@
2599
  "integrity": "sha512-mKnC+QJ9pWVzv+C4/U3rRsHapFfHvQFoFB92e52xeyGMcX6/OlIl78je1u8vePzYZSkkogMPJ2yjxxsb89cxyw==",
2600
  "license": "MIT"
2601
  },
2602
  "node_modules/long": {
2603
  "version": "5.3.2",
2604
  "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
 
1
  {
2
  "name": "localm",
3
+ "version": "1.1.38",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
  "name": "localm",
9
+ "version": "1.1.38",
10
  "license": "ISC",
11
  "dependencies": {
12
  "@huggingface/transformers": "^3.7.2",
13
  "@milkdown/crepe": "^7.15.3",
14
+ "@mlc-ai/web-llm": "^0.2.79",
15
  "esbuild": "^0.25.9"
16
  }
17
  },
 
1902
  "tslib": "^2.8.1"
1903
  }
1904
  },
1905
+ "node_modules/@mlc-ai/web-llm": {
1906
+ "version": "0.2.79",
1907
+ "resolved": "https://registry.npmjs.org/@mlc-ai/web-llm/-/web-llm-0.2.79.tgz",
1908
+ "integrity": "sha512-Hy1ZHQ0o2bZGZoVnGK48+fts/ZSKwLe96xjvqL/6C59Mem9HoHTcFE07NC2E23mRmhd01tL655N6CPeYmwWgwQ==",
1909
+ "license": "Apache-2.0",
1910
+ "dependencies": {
1911
+ "loglevel": "^1.9.1"
1912
+ }
1913
+ },
1914
  "node_modules/@protobufjs/aspromise": {
1915
  "version": "1.1.2",
1916
  "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
 
2609
  "integrity": "sha512-mKnC+QJ9pWVzv+C4/U3rRsHapFfHvQFoFB92e52xeyGMcX6/OlIl78je1u8vePzYZSkkogMPJ2yjxxsb89cxyw==",
2610
  "license": "MIT"
2611
  },
2612
+ "node_modules/loglevel": {
2613
+ "version": "1.9.2",
2614
+ "resolved": "https://registry.npmjs.org/loglevel/-/loglevel-1.9.2.tgz",
2615
+ "integrity": "sha512-HgMmCqIJSAKqo68l0rS2AanEWfkxaZ5wNiEFb5ggm08lDs9Xl2KxBlX3PTcaD2chBM1gXAYf491/M2Rv8Jwayg==",
2616
+ "license": "MIT",
2617
+ "engines": {
2618
+ "node": ">= 0.6.0"
2619
+ },
2620
+ "funding": {
2621
+ "type": "tidelift",
2622
+ "url": "https://tidelift.com/funding/github/npm/loglevel"
2623
+ }
2624
+ },
2625
  "node_modules/long": {
2626
  "version": "5.3.2",
2627
  "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
package.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "localm",
3
- "version": "1.1.38",
4
  "description": "Chat application",
5
  "scripts": {
6
  "build": "esbuild src/index.js --target=es6 --bundle --sourcemap --outfile=./index.js --format=iife --external:fs --external:path --external:child_process --external:ws --external:katex/dist/katex.min.css",
@@ -22,6 +22,7 @@
22
  "dependencies": {
23
  "@huggingface/transformers": "^3.7.2",
24
  "@milkdown/crepe": "^7.15.3",
 
25
  "esbuild": "^0.25.9"
26
  }
27
  }
 
1
  {
2
  "name": "localm",
3
+ "version": "1.2.0",
4
  "description": "Chat application",
5
  "scripts": {
6
  "build": "esbuild src/index.js --target=es6 --bundle --sourcemap --outfile=./index.js --format=iife --external:fs --external:path --external:child_process --external:ws --external:katex/dist/katex.min.css",
 
22
  "dependencies": {
23
  "@huggingface/transformers": "^3.7.2",
24
  "@milkdown/crepe": "^7.15.3",
25
+ "@mlc-ai/web-llm": "^0.2.79",
26
  "esbuild": "^0.25.9"
27
  }
28
  }
plans/2025-08-23-webllm-integration/1-assessment.md ADDED
@@ -0,0 +1,326 @@
1
+ # WebLLM Integration Plans: Three Alternative Approaches
2
+
3
+ ## Executive Summary
4
+
5
+ This document outlines three comprehensive approaches for integrating WebLLM alongside the existing Transformers.js backend in the LocalM application. Each plan addresses the dual-backend strategy where WebLLM is attempted first for supported models, with Transformers.js as a fallback.
6
+
7
+ ## Current Architecture Analysis
8
+
9
+ **Current Boot Worker Flow:**
10
+
11
+ 1. `boot-worker.js` handles message routing (`loadModel`, `runPrompt`)
12
+ 2. `model-cache.js` manages Transformers.js model loading with backend detection
13
+ 3. `load-model-core.js` creates pipelines using `@huggingface/transformers`
14
+ 4. Curated model list provides stable model set for consistent loading
15
+
16
+ **Key Integration Requirements:**
17
+
18
+ * WebLLM first, Transformers.js fallback loading strategy
19
+ * Dual inference API handling based on loaded model type
20
+ * Unified model management and caching
21
+ * Consistent progress reporting and error handling
22
+
23
+ ***
24
+
25
+ ## Plan 1: Unified Backend Manager Architecture
26
+
27
+ ### Philosophy
28
+
29
+ Create a sophisticated backend abstraction layer that treats WebLLM and Transformers.js as interchangeable engines, with intelligent model routing and unified API surface.
30
+
31
+ ### Implementation Steps
32
+
33
+ #### Step 1: Create Backend Registry System
34
+
35
+ **Description:** Implement a registry pattern for backend engines with capability detection
36
+ **Deliverable:** `BackendRegistry` class with engine registration and capability query methods
37
+
38
+ ```JavaScript
39
+ class BackendRegistry {
40
+ registerBackend(name, engine, capabilities)
41
+ getCompatibleBackends(modelId)
42
+ createEngine(backendName, config)
43
+ }
44
+ ```
45
+
46
+ **Risks:** Complex abstraction may obscure debugging; requires deep understanding of both APIs
47
+ **Mitigation:** Extensive logging and backend-specific error passthrough
48
+
49
+ #### Step 2: Develop Backend Engine Adapters
50
+
51
+ **Description:** Create adapter classes that normalize WebLLM and Transformers.js APIs
52
+ **Deliverable:** `WebLLMAdapter` and `TransformersAdapter` implementing common `IBackendEngine` interface
53
+
54
+ ```JavaScript
55
+ interface IBackendEngine {
56
+ async loadModel(modelId, progressCallback)
57
+ async generateText(prompt, options)
58
+ getModelInfo()
59
+ dispose()
60
+ }
61
+ ```
62
+
63
+ **Risks:** API impedance mismatch between backends; feature parity challenges
64
+ **Mitigation:** Adapter pattern with clear feature capability flags
65
+
66
+ #### Step 3: Implement Model Compatibility Matrix
67
+
68
+ **Description:** Build comprehensive model support matrix mapping models to compatible backends
69
+ **Deliverable:** Enhanced curated model list with backend compatibility metadata
70
+
71
+ ```JSON
72
+ {
73
+ "id": "Llama-3.1-8B-Instruct",
74
+ "backends": {
75
+ "webllm": { "supported": true, "priority": 1, "format": "MLC" },
76
+ "transformers": { "supported": true, "priority": 2, "format": "ONNX" }
77
+ }
78
+ }
79
+ ```
80
+
81
+ **Risks:** Maintenance overhead for compatibility matrix; model format inconsistencies
82
+ **Mitigation:** Automated testing pipeline for model compatibility validation
83
+
84
+ #### Step 4: Create Unified Model Cache
85
+
86
+ **Description:** Replace current ModelCache with multi-backend aware cache
87
+ **Deliverable:** `UnifiedModelCache` with backend-aware storage and retrieval
88
+
89
+ ```JavaScript
90
+ class UnifiedModelCache {
91
+ async getModel(modelId, preferredBackend)
92
+ async loadWithFallback(modelId, backendPriority)
93
+ cacheModel(modelId, backend, modelInstance)
94
+ }
95
+ ```
96
+
97
+ **Risks:** Cache invalidation complexity; memory management across different backend types
98
+ **Mitigation:** Clear cache lifecycle management and backend-specific disposal patterns
99
+
100
+ #### Step 5: Implement Smart Backend Selection
101
+
102
+ **Description:** Create intelligent backend selection based on device capabilities and model compatibility
103
+ **Deliverable:** `BackendSelector` with device detection and optimal backend recommendation
104
+ **Risks:** WebGPU detection inconsistencies; backend preference conflicts
105
+ **Mitigation:** Fallback chains with user preference override capabilities
106
+
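One possible shape for this step is sketched below. `BackendSelector`, its method names, and the capability checks are assumptions drawn from the deliverable above; the `backends` metadata follows the compatibility-matrix example from Step 3.

```JavaScript
// Illustrative sketch only: pick a backend from the compatibility-matrix
// metadata shown in Step 3, filtered by simple device capability checks.
class BackendSelector {
  detectCapabilities() {
    return {
      webgpu: typeof navigator !== 'undefined' && !!navigator.gpu,
      wasm: typeof WebAssembly !== 'undefined'
    };
  }

  /** Returns the highest-priority supported backend the device can run. */
  selectBackend(modelEntry, userPreference) {
    const caps = this.detectCapabilities();
    const candidates = Object.entries(modelEntry.backends)
      .filter(([, info]) => info.supported)
      .filter(([name]) => name !== 'webllm' || caps.webgpu) // WebLLM needs WebGPU
      .sort(([, a], [, b]) => a.priority - b.priority)
      .map(([name]) => name);
    // A user preference wins only when it is actually available (fallback chain otherwise).
    if (userPreference && candidates.includes(userPreference)) return userPreference;
    return candidates[0] ?? 'transformers';
  }
}
```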
107
+ #### Step 6: Update Boot Worker Integration
108
+
109
+ **Description:** Modify boot-worker to use unified backend system
110
+ **Deliverable:** Updated `boot-worker.js` with unified model loading and inference
111
+ **Risks:** Breaking existing functionality; complex error handling
112
+ **Mitigation:** Feature flags for gradual rollout; comprehensive testing suite
113
+
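A rough sketch of how the worker handler could delegate to the unified layer. `registry`, `selector`, and `modelList` stand for the hypothetical objects from the earlier steps and are injected rather than imported, so this is a shape suggestion, not the final handler.

```JavaScript
// Sketch only: the worker's loadModel handler delegating to the unified layer.
// `registry`, `selector`, and `modelList` are the hypothetical pieces from
// Steps 1, 3, and 5, passed in so the handler stays testable.
export function createLoadModelHandler({ registry, selector, modelList }) {
  return async function handleLoadModel({ modelName, id }) {
    const entry = modelList.find((m) => m.id === modelName);
    if (!entry) throw new Error(`Unknown model: ${modelName}`);
    const backend = selector.selectBackend(entry);
    const engine = await registry.createEngine(backend, { modelName });
    self.postMessage({ id, type: 'status', status: 'model-loaded', model: modelName, backend });
    return engine;
  };
}
```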
114
+ ### Plan Summary
115
+
116
+ **Why this plan is good:** Provides maximum flexibility and maintainability through clean abstractions. Enables easy addition of future backends. Offers sophisticated model routing and optimization.
117
+
118
+ **How it makes the app better:** Creates a scalable foundation for multiple ML backends, optimizes performance through intelligent backend selection, and provides unified developer experience while maintaining backend-specific optimizations.
119
+
120
+ ***
121
+
122
+ ## Plan 2: Progressive Enhancement Strategy
123
+
124
+ ### Philosophy
125
+
126
+ Implement WebLLM as an enhanced capability layer that progressively enhances the existing Transformers.js foundation, maintaining backward compatibility while adding advanced features.
127
+
128
+ ### Implementation Steps
129
+
130
+ #### Step 1: Create WebLLM Detection and Initialization
131
+
132
+ **Description:** Add WebLLM capability detection and optional initialization
133
+ **Deliverable:** `WebLLMCapabilities` module with environment detection
134
+
135
+ ```JavaScript
136
+ class WebLLMCapabilities {
137
+ static async isSupported()
138
+ static async initialize()
139
+ static getAvailableModels()
140
+ }
141
+ ```
142
+
143
+ **Risks:** WebGPU availability detection false positives; initialization timing issues
144
+ **Mitigation:** Robust feature detection with graceful degradation to the fallback backend
145
+
146
+ #### Step 2: Extend Model Metadata with WebLLM Support Flags
147
+
148
+ **Description:** Enhance existing curated model list with WebLLM compatibility flags
149
+ **Deliverable:** Updated `curated-model-list.json` with progressive enhancement metadata
150
+
151
+ ```JSON
152
+ {
153
+ "id": "existing-model",
154
+ "webllm": {
155
+ "supported": true,
156
+ "model_lib": "url-to-wasm",
157
+ "performance_tier": "high"
158
+ }
159
+ }
160
+ ```
161
+
162
+ **Risks:** Data schema versioning; metadata synchronization challenges
163
+ **Mitigation:** Schema validation and backward compatibility layers
164
+
165
+ #### Step 3: Implement Hybrid Model Loader
166
+
167
+ **Description:** Extend existing ModelCache with WebLLM loading capabilities
168
+ **Deliverable:** Enhanced `model-cache.js` with dual-loading strategy
169
+
170
+ ```JavaScript
171
+ class EnhancedModelCache extends ModelCache {
172
+ async loadWithWebLLM(modelName)
173
+ async loadWithTransformers(modelName) // existing
174
+ async getModelWithPreference(modelName, preferWebLLM = true)
175
+ }
176
+ ```
177
+
178
+ **Risks:** Code complexity in existing critical path; regression potential
179
+ **Mitigation:** Incremental enhancement with feature flags and A/B testing
180
+
181
+ #### Step 4: Create Unified Inference Interface
182
+
183
+ **Description:** Build adapter layer for consistent inference API across backends
184
+ **Deliverable:** `InferenceAdapter` that normalizes WebLLM and Transformers.js calls
185
+ **Risks:** API abstraction leakage; performance overhead from adaptation layer
186
+ **Mitigation:** Minimal abstraction with direct passthrough where possible
187
+
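One way such an adapter could look, assuming a WebLLM-style engine exposes the OpenAI-compatible chat API and a Transformers.js engine is a callable text-generation pipeline; the option mapping is deliberately conservative.

```JavaScript
// Sketch of a thin inference adapter: detect the engine flavour and pass
// through with minimal option mapping. Shapes are assumptions, not a spec.
export async function runInference(engine, prompt, options = {}) {
  if (engine?.chat?.completions?.create) {
    // WebLLM-style engine (OpenAI-compatible chat API)
    const res = await engine.chat.completions.create({
      messages: [{ role: 'user', content: prompt }],
      max_tokens: options.max_new_tokens ?? 250,
      temperature: options.temperature ?? 0.7
    });
    return res.choices[0]?.message?.content ?? '';
  }
  if (typeof engine === 'function') {
    // Transformers.js text-generation pipeline
    const out = await engine(prompt, { max_new_tokens: 250, return_full_text: false, ...options });
    return Array.isArray(out) ? out[0]?.generated_text ?? '' : String(out);
  }
  throw new Error('Unknown engine type');
}
```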
188
+ #### Step 5: Implement Progressive Model Loading
189
+
190
+ **Description:** Create graceful fallback system from WebLLM to Transformers.js
191
+ **Deliverable:** Enhanced `loadModel` handler with progressive loading strategy
192
+ **Risks:** Complex error handling; user experience during fallback scenarios
193
+ **Mitigation:** Clear progress indication and transparent fallback communication
194
+
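Sketched below under the assumption that the loader methods from Step 3 exist; the `onProgress` callback is illustrative and only serves the "clear progress indication" mitigation.

```JavaScript
// Sketch: optimistic WebLLM load with a transparent, reported fallback.
// Assumes the loader methods from Step 3; `onProgress` is illustrative.
async function loadModelProgressively(cache, modelName, onProgress = () => {}) {
  try {
    onProgress({ stage: 'webllm-attempt', modelName });
    return await cache.loadWithWebLLM(modelName);
  } catch (err) {
    // Report the fallback instead of hiding it from the UI.
    onProgress({ stage: 'fallback-to-transformers', modelName, reason: String(err) });
    return cache.loadWithTransformers(modelName);
  }
}
```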
195
+ #### Step 6: Add Advanced WebLLM Features
196
+
197
+ **Description:** Expose WebLLM-specific features like streaming and JSON mode
198
+ **Deliverable:** Enhanced inference options and streaming capabilities
199
+ **Risks:** Feature parity maintenance; increased API surface area
200
+ **Mitigation:** Feature capability detection and graceful degradation
201
+
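A small sketch of the streaming case, assuming the engine exposes WebLLM's OpenAI-compatible `chat.completions.create` with `stream: true`; non-streaming backends are rejected here rather than emulated.

```JavaScript
// Sketch: token streaming through the OpenAI-compatible chat API, guarded by
// a capability check so callers can degrade to a single response elsewhere.
async function* streamCompletion(engine, prompt) {
  if (!engine?.chat?.completions?.create) {
    throw new Error('Streaming requires a WebLLM-style engine');
  }
  const chunks = await engine.chat.completions.create({
    messages: [{ role: 'user', content: prompt }],
    stream: true
  });
  for await (const chunk of chunks) {
    const delta = chunk.choices?.[0]?.delta?.content;
    if (delta) yield delta;
  }
}
```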
202
+ ### Plan Summary
203
+
204
+ **Why this plan is good:** Minimizes risk by building on existing foundation. Maintains full backward compatibility. Allows gradual migration and testing. Preserves investment in current Transformers.js integration.
205
+
206
+ **How it makes the app better:** Provides immediate performance benefits for supported models while maintaining reliability. Enables advanced features like better streaming without breaking existing functionality. Creates clear upgrade path for users.
207
+
208
+ ***
209
+
210
+ ## Plan 3: Microservice Backend Architecture
211
+
212
+ ### Philosophy
213
+
214
+ Implement WebLLM and Transformers.js as independent microservice-style modules with a central orchestrator, enabling maximum isolation and specialized optimization for each backend.
215
+
216
+ ### Implementation Steps
217
+
218
+ #### Step 1: Create Backend Service Abstractions
219
+
220
+ **Description:** Design service interfaces for independent backend implementations
221
+ **Deliverable:** `IBackendService` interface and base service framework
222
+
223
+ ```JavaScript
224
+ interface IBackendService {
225
+ async initialize(config)
226
+ async loadModel(modelSpec)
227
+ async inference(request)
228
+ async dispose()
229
+ getCapabilities()
230
+ }
231
+ ```
232
+
233
+ **Risks:** Over-engineering; increased complexity for simple use cases
234
+ **Mitigation:** Keep interfaces minimal and focused on essential operations
235
+
236
+ #### Step 2: Implement WebLLM Service Module
237
+
238
+ **Description:** Create dedicated WebLLM service with full feature implementation
239
+ **Deliverable:** `WebLLMService` with complete WebLLM integration
240
+
241
+ ```JavaScript
242
+ class WebLLMService implements IBackendService {
243
+ async loadModel(modelSpec) { /* WebLLM-specific loading */ }
244
+ async inference(request) { /* OpenAI-compatible API */ }
245
+ async streamInference(request) { /* Streaming support */ }
246
+ }
247
+ ```
248
+
249
+ **Risks:** WebLLM-specific quirks and edge cases; model format compatibility
250
+ **Mitigation:** Comprehensive testing with various model types and sizes
251
+
252
+ #### Step 3: Refactor Transformers.js as Service
253
+
254
+ **Description:** Encapsulate existing Transformers.js logic into service module
255
+ **Deliverable:** `TransformersService` extracted from current implementation
256
+ **Risks:** Breaking existing functionality during refactor; regression introduction
257
+ **Mitigation:** Comprehensive test coverage before refactoring; gradual migration
258
+
259
+ #### Step 4: Create Service Orchestrator
260
+
261
+ **Description:** Build central orchestrator for service selection and lifecycle management
262
+ **Deliverable:** `BackendOrchestrator` with service discovery and routing
263
+
264
+ ```JavaScript
265
+ class BackendOrchestrator {
266
+ async selectService(modelId, requirements)
267
+ async routeRequest(request, servicePreference)
268
+ manageServiceLifecycle()
269
+ }
270
+ ```
271
+
272
+ **Risks:** Central point of failure; orchestration complexity
273
+ **Mitigation:** Robust error handling and service isolation patterns
274
+
275
+ #### Step 5: Implement Service Communication Layer
276
+
277
+ **Description:** Create communication protocol between orchestrator and services
278
+ **Deliverable:** Message-based communication with type-safe protocols
279
+ **Risks:** Communication overhead; debugging complexity across service boundaries
280
+ **Mitigation:** Clear logging and service health monitoring
281
+
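A minimal sketch of what the message protocol could look like; the envelope fields and the use of JSDoc typedefs in place of a formal schema are assumptions.

```JavaScript
// Sketch: a typed request/response envelope between orchestrator and services.
// Field names are illustrative; JSDoc stands in for a type-safe protocol.

/**
 * @typedef {{ id: string, service: 'webllm' | 'transformers',
 *             kind: 'loadModel' | 'inference' | 'dispose',
 *             payload: unknown }} ServiceRequest
 * @typedef {{ id: string, ok: boolean, result?: unknown, error?: string }} ServiceResponse
 */

/** Send one request to a service worker and await the matching response. */
export function sendToService(worker, /** @type {ServiceRequest} */ request) {
  return new Promise((resolve, reject) => {
    const onMessage = (ev) => {
      const res = /** @type {ServiceResponse} */ (ev.data);
      if (res.id !== request.id) return; // response for a different request
      worker.removeEventListener('message', onMessage);
      res.ok ? resolve(res.result) : reject(new Error(res.error ?? 'service error'));
    };
    worker.addEventListener('message', onMessage);
    worker.postMessage(request);
  });
}
```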
282
+ #### Step 6: Build Service Discovery and Health Monitoring
283
+
284
+ **Description:** Implement service capability detection and health monitoring
285
+ **Deliverable:** Service registry with capability announcement and health checks
286
+ **Risks:** Health check false positives; service state synchronization
287
+ **Mitigation:** Conservative health checks with manual override capabilities
288
+
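A conservative health check could be as small as the sketch below; the `service.inference` call and the tiny probe request are assumptions following the `IBackendService` interface from Step 1.

```JavaScript
// Sketch: a conservative health check that only reports "unhealthy" after
// repeated failures, to limit false positives.
async function checkServiceHealth(service, state = { failures: 0 }) {
  try {
    await service.inference({ prompt: 'ping', max_new_tokens: 1 }); // tiny probe request
    state.failures = 0;
    return { healthy: true };
  } catch (err) {
    state.failures += 1;
    return { healthy: state.failures < 3, lastError: String(err) };
  }
}
```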
289
+ #### Step 7: Create Worker Thread Integration
290
+
291
+ **Description:** Integrate services with worker thread architecture for performance isolation
292
+ **Deliverable:** Enhanced worker integration with service-specific worker threads
293
+ **Risks:** Worker communication complexity; resource management across threads
294
+ **Mitigation:** Clear worker lifecycle management and resource cleanup
295
+
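One way to isolate each service in its own worker is sketched here; the worker file names are hypothetical.

```JavaScript
// Sketch: one dedicated worker per backend service, created lazily and
// terminated on release. The worker file names are hypothetical.
const serviceWorkers = new Map();

export function getServiceWorker(service) {
  if (!serviceWorkers.has(service)) {
    const url = service === 'webllm' ? './webllm-service.js' : './transformers-service.js';
    serviceWorkers.set(service, new Worker(url, { type: 'module' }));
  }
  return serviceWorkers.get(service);
}

export function releaseServiceWorker(service) {
  serviceWorkers.get(service)?.terminate(); // frees that service's resources in isolation
  serviceWorkers.delete(service);
}
```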
296
+ ### Plan Summary
297
+
298
+ **Why this plan is good:** Provides maximum isolation and specialization for each backend. Enables independent development and testing of backend implementations. Creates clear separation of concerns with single responsibility services. Facilitates future backend additions with minimal existing code changes.
299
+
300
+ **How it makes the app better:** Enables optimal performance tuning for each backend independently. Provides robust fault isolation where one backend failure doesn't affect others. Creates modular architecture that supports independent scaling and optimization. Enables A/B testing of different backends for same models.
301
+
302
+ ***
303
+
304
+ ## Comparative Analysis
305
+
306
+ | Aspect | Plan 1: Unified Backend | Plan 2: Progressive Enhancement | Plan 3: Microservice Architecture |
307
+ | -------------------------- | ----------------------- | ------------------------------- | --------------------------------- |
308
+ | **Implementation Risk** | Medium-High | Low-Medium | High |
309
+ | **Development Time** | 3-4 weeks | 2-3 weeks | 4-6 weeks |
310
+ | **Maintainability** | High | Medium | Very High |
311
+ | **Performance** | Good | Good | Excellent |
312
+ | **Future Extensibility** | Very Good | Good | Excellent |
313
+ | **Backward Compatibility** | Medium | Excellent | Good |
314
+ | **Testing Complexity** | Medium | Low | High |
315
+ | **User Experience Impact** | Medium | Low | Low |
316
+
317
+ ## Recommended Approach
318
+
319
+ Based on the analysis, **Plan 2: Progressive Enhancement Strategy** is recommended for initial implementation due to:
320
+
321
+ 1. **Lower Risk**: Builds on existing working foundation
322
+ 2. **Faster Time to Value**: Can deliver WebLLM benefits in 2-3 weeks
323
+ 3. **Minimal Disruption**: Maintains existing functionality during transition
324
+ 4. **Clear Migration Path**: Enables future adoption of more sophisticated architectures
325
+
326
+ The progressive enhancement approach allows immediate benefits while preserving the option to evolve toward Plan 1 or Plan 3 architectures as requirements mature and the codebase stabilizes with dual-backend support.
plans/2025-08-23-webllm-integration/2-assessment-rejection.md ADDED
@@ -0,0 +1,48 @@
1
+
2
+ # Review: critique of WebLLM integration plans (short)
3
+
4
+ This document records a concise, one‑page critique of the three proposals in
5
+ `webllm-integration-plans.md`. The focus is strictly on unnecessary complexity,
6
+ maintenance risk, and how each plan deviates from the original, tightly scoped
7
+ requirement: "optimistic WebLLM load with Transformers.js fallback, and simple
8
+ runtime routing at inference time."
9
+
10
+ ## Checklist (requirements extracted)
11
+ - Attempt WebLLM first when loading a model in `boot-worker` — Required
12
+ - If WebLLM load fails, fallback to Transformers.js — Required
13
+ - At inference time, route to WebLLM or Transformers.js based on the loaded model — Required
14
+
15
+ Status: The three plans nominally address these rules, but each layers
16
+ additional architecture that is not required by the stated behavior.
17
+
18
+ ## Plan 1 — Unified Backend Manager
19
+ - What it proposes: registry, adapters, compatibility matrix, unified cache, smart selector.
20
+ - Why it’s over‑engineered: converts a simple dual‑backend decision into a general multi‑backend platform.
21
+ - Specific harms:
22
+ - Large maintenance surface: many new modules to design, document, and keep in sync.
23
+ - Harder debugging: faults are displaced into adapter/registry layers.
24
+ - Test explosion: compatibility matrix and routing logic require extensive tests.
25
+ - Delayed delivery: substantial upfront work with little immediate value.
26
+ - Salvageable idea: a very small, local adapter contract or a backend marker can be useful — but only if kept intentionally minimal.
27
+
28
+ ## Plan 2 — Progressive Enhancement
29
+ - What it proposes: capability detection, curated metadata changes, hybrid loader, small inference adapter.
30
+ - Why it still feels heavy: it expands metadata and loader paths despite the requirement being a single optimistic attempt + fallback.
31
+ - Specific harms:
32
+ - Metadata maintenance and schema versioning overhead.
33
+ - Increased regression risk by touching the hot path (`model-cache`).
34
+ - API leakage: adapters can hide backend differences and cause subtle runtime mismatches.
35
+ - Merit: conceptually the safest approach; its incremental philosophy is appropriate — but the plan should avoid broad metadata and API surface growth at this stage.
36
+
37
+ ## Plan 3 — Microservice Backend Architecture
38
+ - What it proposes: independent backend services, orchestrator, IPC/protocols, health checks, worker isolation.
39
+ - Why it’s inappropriate now: it’s a heavy structural shift that doesn’t fit an in‑browser, worker‑based app nor the simple requirement.
40
+ - Specific harms:
41
+ - Severe implementation and operational overhead.
42
+ - Debugging and runtime complexity across service boundaries.
43
+ - Overfitting server patterns to client‑side code.
44
+
45
+ ## Summary
46
+ All three plans contain useful long‑term ideas, but they escalate architecture well beyond the immediate need. Plan 2’s incremental mindset is the closest fit, yet even it introduces schema and loader surface growth that is not required today. Plans 1 and 3 add costly abstractions that will negatively affect maintainability, testing, and delivery speed if implemented now.
47
+
48
+ Recommendation (for reviewers): preserve the useful concepts (capability detection, explicit backend marker) but avoid registry/orchestrator layers and wide metadata changes at this stage. Keep the initial implementation small and focused on the two behaviors the project must guarantee.
plans/2025-08-23-webllm-integration/3-assessment-revised.md ADDED
@@ -0,0 +1,250 @@
1
+ # WebLLM Integration: Revised Simple Plans
2
+
3
+ ## Executive Summary
4
+
5
+ After reviewing the critique of our initial over-engineered plans, this document presents three focused, minimal approaches for integrating WebLLM with Transformers.js fallback. Each plan prioritizes simple if-statements over complex abstractions, minimal code footprint, and easy maintenance.
6
+
7
+ **Core Requirement Recap:**
8
+
9
+ * Attempt WebLLM first when loading a model
10
+ * If WebLLM load fails, fallback to Transformers.js
11
+ * At inference time, route to appropriate backend based on loaded model type
12
+
13
+ ***
14
+
15
+ ## Plan A: Inline WebLLM Integration (Simplest)
16
+
17
+ ### Goal
18
+
19
+ Integrate WebLLM into the existing `ModelCache` class with absolute minimum code changes, preserving all existing functionality while adding optimistic WebLLM loading with Transformers.js fallback.
20
+
21
+ ### Philosophy
22
+
23
+ Add WebLLM directly into the existing `ModelCache` class with minimal changes. No new abstractions, no registries - just straightforward if-else logic in the existing loading flow.
24
+
25
+ ### Design Restrictions & Limitations
26
+
27
+ * **No new files**: All changes must be within existing files
28
+ * **No interface changes**: External API must remain identical
29
+ * **No breaking changes**: Existing Transformers.js behavior must be preserved exactly
30
+ * **Minimal dependencies**: Only add WebLLM import, no additional libraries
31
+ * **No configuration**: Keep WebLLM model selection automatic based on model name
32
+
33
+ ### Intentional Design Choices
34
+
35
+ 1. **Inline over modular**: Accept some code duplication to avoid abstraction complexity
36
+ 2. **Cache WebLLM availability check**: Prevent repeated import attempts
37
+ 3. **Identical inference interface**: Wrap WebLLM to match Transformers.js pipeline signature
38
+ 4. **Silent fallback**: Log WebLLM failures but don't surface them to UI
39
+ 5. **No WebLLM-specific features**: Stick to basic text generation only
40
+
41
+ ### Risk Assessment
42
+
43
+ #### High Risks
44
+
45
+ * **WebLLM import failures**: Dynamic imports may fail unpredictably
46
+ * *Mitigation*: Robust try-catch with cached failure state
47
+ * **Memory leaks**: WebLLM engines may not dispose properly
48
+ * *Mitigation*: Store engine reference for explicit cleanup
49
+ * **Interface mismatch**: WebLLM API differs significantly from Transformers.js
50
+ * *Mitigation*: Careful wrapper function with identical signatures
51
+
52
+ #### Medium Risks
53
+
54
+ * **Performance regression**: Additional async checks may slow loading
55
+ * *Mitigation*: Cache availability check result
56
+ * **Error message confusion**: Users may see Transformers.js errors when WebLLM was attempted
57
+ * *Mitigation*: Clear logging distinguishing between attempts
58
+
59
+ #### Low Risks
60
+
61
+ * **Code maintainability**: Inline logic may become hard to follow
62
+ * *Mitigation*: Comprehensive comments and clear variable naming
63
+
64
+ ### Potential Pitfalls & Avoidance
65
+
66
+ 1. **Pitfall**: WebLLM models have different naming conventions
67
+ * *Avoidance*: Start with exact model name matching, document differences
68
+
69
+ 2. **Pitfall**: WebLLM may load but fail during inference
70
+ * *Avoidance*: Include inference test during model loading phase
71
+
72
+ 3. **Pitfall**: Mixed backend state confusion in cache
73
+ * *Avoidance*: Clear backend type marking on cached models
74
+
75
+ 4. **Pitfall**: WebLLM engine disposal not called
76
+ * *Avoidance*: Store engine reference and implement cleanup in cache eviction
77
+
78
+ ### Implementation Details
79
+
80
+ #### Step 1: Add WebLLM availability detection (no code)
81
+
82
+ Goal: cheaply detect whether WebLLM is present in the runtime and cache that result.
83
+
84
+ Details: perform a single availability probe once (at worker startup or first model load). Cache the boolean result and last error text for diagnostics. The probe must be cheap and non-blocking for the UI thread; if the probe indicates WebLLM is unavailable, skip WebLLM attempts for the remainder of the session.
85
+
86
+ Risks and safeguards: the probe can succeed but engine creation still fail — store the probe result as advisory only, and always run a short, bounded health check when creating an actual engine. Log errors for debugging; do not surface probe failures to the user.
87
+
88
+ #### Step 2: Optimistic WebLLM attempt with bounded validation (no code)
89
+
90
+ Goal: when loading a model, try WebLLM first if the probe passed. If WebLLM load or a short validation inference fails, fall back to Transformers.js immediately.
91
+
92
+ Details: implement a two-phase load: (1) an optimistic WebLLM engine creation attempt with a short, fixed timeout and a lightweight validation (one small inference). If both complete successfully, mark the cached model as WebLLM-backed. If either times out or fails, swallow the error, log it, and run the existing Transformers.js loader unchanged.
93
+
94
+ Risks and safeguards: protect against long hangs by enforcing a timeout; validate the engine with a minimal inference to catch silent failures; keep errors in logs only and preserve original Transformers.js errors for UI-facing messages.
95
+
96
+ #### Step 3: Keep a simple backend marker and unified inference contract (no code)
97
+
98
+ Goal: mark each cached model with a tiny backend flag ("webllm" or "transformers") so runtime code can choose the correct inference path with one if-statement.
99
+
100
+ Details: ensure the wrapper used for WebLLM returns data in the same shape the rest of the code expects (a small set of fields). At inference time, do a single if-check on the backend marker and call the model accordingly. Map or normalize options conservatively.
101
+
102
+ Risks and safeguards: mapping may omit uncommon options — limit supported options initially and document them; validate return shape after inference and fall back to a meaningful error if malformed.
103
+
104
+ #### Step 4: Minimal disposal lifecycle and cleanup (no code)
105
+
106
+ Goal: provide an explicit, per-model disposal pathway to avoid leaking native resources.
107
+
108
+ Details: when a model is evicted or the worker shuts down, if the cached item exposes a disposal API, call it within try/catch and continue. Log disposal outcomes for visibility. Avoid automatic aggressive disposal during active inference.
109
+
110
+ Risks and safeguards: some engines may not implement disposal correctly — wrap calls in try/catch; avoid disposing in the middle of an active request; provide a single manual cleanup call for diagnostic use.
111
+
112
+ #### Testing, rollout and rollback (no code)
113
+
114
+ Testing: verify the following scenarios: WebLLM absent, WebLLM present but engine creation fails, WebLLM present and operates correctly, mixed model usage. Create deterministic mocks for each path for CI.
115
+
116
+ Rollout: feature-flag the WebLLM probe or gate the code behind a simple config toggle. For rollback, disable the probe; behavior returns to the previous Transformers.js-only flow.
117
+
118
+ Monitoring and diagnostics: log backend selection, load durations, timeouts, and validation failures to the console (or a dev-only telemetry sink). Provide a developer-only command to view cached model backends and last probe errors.
119
+
120
+ ### Implementation Time: 1-2 days
121
+
122
+ ### Risk Level: Very Low
123
+
124
+ ### Maintenance Overhead: Minimal
125
+
126
+ **Why this works:** It describes the same minimal inline integration without prescribing code up front, and it keeps the runtime simple and easily debuggable.
127
+
128
+ ***
129
+
130
+ ## Plan B: Dual-Path ModelCache (Balanced)
131
+
132
+ ### Goal
133
+
134
+ Create a deliberately clear but still small separation between the WebLLM and Transformers.js loading paths to make debugging and testing easier, while keeping a single public interface for callers.
135
+
136
+ ### Philosophy
137
+
138
+ Prefer explicit separate loader functions for each backend, but keep them private to the `ModelCache`. Use a single public `getModel` API and a tiny `modelBackends` registry (mapping modelName -> backend) for diagnostics.
139
+
140
+ ### Design Restrictions & Limitations
141
+
142
+ * The public `ModelCache` interface must not change.
143
+ * No large new frameworks, no registry/adapter abstractions beyond the single Map that records backend per model.
144
+ * WebLLM attempts remain optimistic and short-lived; Transformers.js remains the reliable fallback.
145
+
146
+ ### Intentional Design Choices
147
+
148
+ 1. Separate loader functions for clarity and testability.
149
+ 2. A small map to track which backend served each model for diagnostics only.
150
+ 3. Structured error objects so we can decide what to surface to the UI vs. what to log for debugging.
151
+
152
+ ### Risks and mitigations
153
+
154
+ * State synchronization: update cache and backend map in an atomic sequence so they cannot diverge. If a cached entry is a pending Promise, ensure the map only records the backend after the Promise resolves successfully.
155
+ * Complexity in tests: provide mocks for each loader and test all four combinations (webllm success/fail x transformers success/fail).
156
+
157
+ ### Step-by-step rollout (no code)
158
+
159
+ 1. Add a `modelBackends` map to the cache implementation.
160
+ 2. Implement two private loader routines: one for WebLLM and one for Transformers.js. Keep the WebLLM loader conservative: timeout, one validation call, wrap engine in a normalized interface.
161
+ 3. In the public loader, call the WebLLM loader first; on success update cache and `modelBackends` to "webllm". On failure, call Transformers.js loader and update `modelBackends` to "transformers".
162
+ 4. Ensure the public `getModel` returns the same shape regardless of backend.
163
+ 5. Add lightweight diagnostics: expose a developer method to list cached models with their backends and last load durations.
164
+
165
+ ### Testing, rollout and rollback
166
+
167
+ Testing: add unit tests for both loader functions with mocks; add integration tests that exercise the public `getModel` in all backend success/failure permutations.
168
+ Rollout: can be enabled behind a config flag or staged to a small percentage of users (developer-only first). Rollback is simply disabling WebLLM attempts or reverting the map updates.
169
+
170
+ ### Implementation Time: 2-3 days
171
+
172
+ ### Risk Level: Low
173
+
174
+ ### Maintenance Overhead: Low
175
+
176
+ **Why this works:** Slightly more structure than Plan A simplifies debugging and testing while still avoiding large abstractions.
177
+
178
+ ***
179
+
180
+ ## Plan C: Minimal WebLLM Module (Most Structured)
181
+
182
+ ### Goal
183
+
184
+ Extract WebLLM integration into one small, well-tested module that mirrors the existing Transformers.js contract. Keep the rest of the codebase unchanged and use the module from `ModelCache` when appropriate.
185
+
186
+ ### Philosophy
187
+
188
+ Encapsulate WebLLM specifics (probe, engine creation, validation, disposal) in a single file. That file exposes a tiny API: availability probe, loadModel(modelName) returning a normalized pipeline, and optional dispose methods.
189
+
190
+ ### Design Restrictions & Limitations
191
+
192
+ * Add exactly one new file/module; do not add registries or dispatch systems.
193
+ * The module must be lightweight, with no complex state beyond a cached availability flag and per-engine handles.
194
+ * The module must normalize outputs to the existing pipeline shape used by the rest of the app.
195
+
196
+ ### Intentional Design Choices
197
+
198
+ 1. Single responsibility: only WebLLM concerns go into the module.
199
+ 2. Identical interface: consumers should not need to know whether they call WebLLM or Transformers.js.
200
+ 3. Easier testing: the module can be mocked in unit tests without touching Transformers.js code.
201
+
202
+ ### Step-by-step rollout (no code)
203
+
204
+ 1. Create the WebLLM module with three exported functions: probeAvailability(), loadModel(modelName), disposeEngine(handle).
205
+ 2. `probeAvailability` runs a single cheap probe and caches the result for the session.
206
+ 3. `loadModel` attempts engine creation with a bounded timeout and runs a minimal validation inference, returning a normalized pipeline-like object on success or throwing on failure.
207
+ 4. Import the module into `ModelCache` and attempt to use it first; when it throws or times out, fall back to the existing Transformers.js loader.
208
+
209
+ ### Risks and mitigations
210
+
211
+ * Module drift: keep the module intentionally tiny so API changes are rare.
212
+ * Duplicate normalization logic: ensure the normalization contract is documented and shared between module and cache tests.
213
+
214
+ ### Testing, rollout and rollback
215
+
216
+ Testing: unit test the module aggressively (mocks for engine creation and inference). Integration test that ModelCache interacts with the module correctly and still falls back.
217
+ Rollout: feature flag the module usage. For rollback, remove the module import or disable the probe call.
218
+
219
+ ### Implementation Time: 2-3 days
220
+
221
+ ### Risk Level: Low
222
+
223
+ ### Maintenance Overhead: Low
224
+
225
+ **Why this works:** Clean separation for future growth while keeping runtime and surface area small.
226
+
227
+ ***
228
+
229
+ ## Comparison and Recommendation
230
+
231
+ | Aspect | Plan A: Inline | Plan B: Dual-Path | Plan C: Module |
232
+ | ----------------------------- | ------------------- | ------------------- | ---------------------- |
233
+ | **Code Lines Added** | \~50 (no new files) | \~80 (no new files) | \~60 (+1 small module) |
234
+ | **New Files** | 0 | 0 | 1 |
235
+ | **Debugging Ease** | Excellent | Good | Good |
236
+ | **Testing Isolation** | Hard | Medium | Easy |
237
+ | **Future Extensibility** | Limited | Medium | Good |
238
+ | **Risk of Breaking Existing** | Very Low | Low | Very Low |
239
+
240
+ ### Recommended Approach: Plan A (Inline WebLLM Integration)
241
+
242
+ Reasoning:
243
+
244
+ * Fastest to ship with minimal risk.
245
+ * Keeps logic local and obvious (one if-statement to route inference).
246
+ * Easy rollback and minimal maintenance burden.
247
+
248
+ Operational advice: implement Plan A first, run the tests and collect diagnostics. If WebLLM proves stable and valuable, refactor to Plan C for better testability and maintenance.
249
+
250
+ All three plans keep to the original requirement: optimistic WebLLM load, Transformers.js fallback, and simple runtime routing at inference time, while avoiding large registries, adapters, or orchestration layers.
plans/2025-08-23-webllm-integration/4-chosen-plan-simple.md ADDED
@@ -0,0 +1,79 @@
1
+ # Chosen Plan: Inline WebLLM Integration — Tight Execution Roadmap
2
+
3
+ ## Reinforcement (start)
4
+
5
+ Keep changes tiny, readable, and modern: concise async/await, arrow functions, optional chaining, nullish coalescing. Do not swallow exceptions — when a catch occurs, act: log a concise diagnostic and take the fallback path or rethrow if necessary.
6
+
7
+ ## Core intent
8
+
9
+ Optimistically use WebLLM when available; when it fails use the existing Transformers.js pipeline. Use a single cache (Promise or resolved engine) as the source of truth. No persistent flags, no timeouts on engine creation, no extra shutdown disposal, minimal logging.
10
+
11
+ ***
12
+
13
+ ## Essentials checklist
14
+
15
+ * Probe the runtime once (advisory only) to avoid pointless attempts on unsupported platforms.
16
+ * On model load, try WebLLM first when probe suggests possible; if it fails, immediately and deterministically fall back to Transformers.js.
17
+ * Cache the in-progress Promise and then the resolved engine object in the same cache used today for Transformers.js pipelines.
18
+ * Decide backend at runtime by inspecting the resolved cached object (duck-typing), not by reading separate per-model flags.
19
+ * Keep logging minimal and actionable (one-line load start/success/fail, inference fail).
20
+ * Do not impose timeouts on engine creation; allow large models to finish loading.
21
+ * Do not add shutdown dispose hooks; worker shutdown will clean up resources.
22
+
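A compact, illustrative sketch of the single-cache rule above (not the final code; the loader arguments are stand-ins). The duck-typed routing itself appears in the `boot-worker.js` change later in this commit.

```JavaScript
// Illustrative only: one cache entry per model, holding first the in-progress
// Promise and then the resolved engine, so concurrent requests dedupe.
async function getEngine(cache, modelName, loadWebLLM, loadTransformers) {
  if (!cache.has(modelName)) {
    const loader = loadWebLLM(modelName).catch((err) => {
      console.log(`WebLLM load failed for ${modelName}: ${err.message}`); // concise diagnostic
      return loadTransformers(modelName); // deterministic fallback
    });
    cache.set(modelName, loader); // concurrent callers reuse the same Promise
    loader
      .then((engine) => cache.set(modelName, engine)) // replace Promise with engine
      .catch(() => cache.delete(modelName)); // allow a retry after total failure
  }
  return cache.get(modelName);
}
```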
23
+ ***
24
+
25
+ ## Steps (order of implementation) with success criteria
26
+
27
+ 1. Add a cheap advisory probe (in-memory)
28
+
29
+ * What: perform a single, lightweight probe at first load attempt to detect presence of WebLLM APIs; cache boolean and last error in-memory.
30
+ * Why: skip obviously impossible attempts on unsupported platforms without preventing valid loads elsewhere.
31
+ * Success: probe returns quickly and avoids repeated futile attempts.
32
+
33
+ 2. Implement WebLLM-first load path into `ModelCache` (single-cache logic)
34
+
35
+ * What: on getModel, store an in-progress Promise into the existing cache; if probe suggests WebLLM is possible, attempt engine creation first (no timeout). If WebLLM creation or a short validation check fails, log a concise diagnostic and proceed to the existing Transformers.js loader. When the Promise resolves, replace it with the engine object in the same cache.
36
+ * Why: ensure concurrent requests dedupe and the cache remains the single source of truth.
37
+ * Success: when WebLLM loads successfully the cached engine is used for inference; when it fails, Transformers.js is used with no UI change.
38
+
39
+ 3. Runtime routing by object shape (duck-typing)
40
+
41
+ * What: at runPrompt, await the cached model, inspect the resolved object for a small, documented signature that identifies WebLLM vs Transformers.js, and dispatch via one conditional.
42
+ * Error handling: if WebLLM inference throws, log a concise diagnostic and attempt Transformers.js for that run where sensible. Do not silently swallow errors.
43
+ * Success: a single conditional routes inference correctly; fallback happens within the same request when possible.
44
+
45
+ 4. Minimal logging
46
+
47
+ * What to log (brief): model load start (backend attempted), model load success, model load failure (one-line error + model id), inference failure (one-line).
48
+ * Why: keep logs actionable and small; avoid verbose progress dumps.
49
+
50
+ 5. No explicit disposal at shutdown
51
+
52
+ * Decision: do not add dispose hooks for worker shutdown; rely on worker termination to clean up. Avoid extra lifecycle complexity.
53
+
54
+ ***
55
+
56
+ ## Developer checklist (compact, ready for PR body)
57
+
58
+ * Add in-memory `probe()` used only at first load attempt; cache result and last error for session.
59
+ * In `ModelCache.getModel`: store in-progress Promise in cache; if probe indicates WebLLM may be present, attempt WebLLM engine creation first (no timeout). On success, cache engine object. On failure, log concise diagnostic and run existing Transformers.js loader.
60
+ * In inference handler (`runPrompt`): await cached model, detect backend by object shape (duck-typing), call with minimal options (max\_new\_tokens, temperature). If WebLLM inference fails, log and attempt Transformers.js fallback for that request.
61
+ * Keep logs concise and developer-focused.
62
+ * Use modern, compact JS idioms and keep code short.
63
+
64
+ ***
65
+
66
+ ## Success criteria (project-level)
67
+
68
+ * WebLLM used when available; Transformers.js fallback always available.
69
+ * No feature flags or persisted per-model success flags introduced.
70
+ * Exceptions are not swallowed; catches produce concise diagnostics and a clear fallback or propagate.
71
+ * Changes are minimal, readable, and easy to revert.
72
+
73
+ ***
74
+
75
+ ## Final reinforcement (end)
76
+
77
+ Start small: one compact change to `ModelCache` and a single conditional in inference. Keep the implementation elegant and short. If an exception is caught, do not hide it — act: log minimal diagnostic and fallback or rethrow.
78
+
src/worker/boot-worker.js CHANGED
@@ -59,19 +59,42 @@ export function bootWorker() {
59
 
60
  async function handleRunPrompt({ prompt, modelName = selectedModel, id, options }) {
61
  try {
62
- const pipe = await modelCache.getModel({ modelName });
63
- // run the pipeline
64
- if (!pipe) throw new Error('pipeline not available');
65
  self.postMessage({ id, type: 'status', status: 'inference-start', model: modelName });
66
- const out = await pipe(prompt, {
67
- max_new_tokens: 250, // Increase from default
68
- temperature: 0.7,
69
- do_sample: true,
70
- pad_token_id: pipe.tokenizer.eos_token_id,
71
- return_full_text: false, // Only return the generated text
72
- ...options
73
- });
74
- const text = extractText(out);
75
  self.postMessage({ id, type: 'status', status: 'inference-done', model: modelName });
76
  self.postMessage({ id, type: 'response', result: text });
77
  } catch (err) {
 
59
 
60
  async function handleRunPrompt({ prompt, modelName = selectedModel, id, options }) {
61
  try {
62
+ const engine = await modelCache.getModel({ modelName });
63
+ if (!engine) throw new Error('engine not available');
64
+
65
  self.postMessage({ id, type: 'status', status: 'inference-start', model: modelName });
66
+
67
+ // Duck-typing to detect engine type and route accordingly
68
+ let text;
69
+ if (/** @type {any} */(engine).chat?.completions?.create) {
70
+ // WebLLM engine detected
71
+ try {
72
+ const webllmEngine = /** @type {any} */(engine);
73
+ const response = await webllmEngine.chat.completions.create({
74
+ messages: [{ role: "user", content: prompt }],
75
+ max_tokens: options?.max_new_tokens ?? 250,
76
+ temperature: options?.temperature ?? 0.7
77
+ });
78
+ text = response.choices[0]?.message?.content ?? '';
79
+ } catch (err) {
80
+ console.log(`WebLLM inference failed for ${modelName}: ${err.message}`);
81
+ throw err; // Re-throw since we can't easily fallback mid-inference
82
+ }
83
+ } else if (typeof engine === 'function') {
84
+ // Transformers.js pipeline detected
85
+ const out = await engine(prompt, {
86
+ max_new_tokens: 250,
87
+ temperature: 0.7,
88
+ do_sample: true,
89
+ pad_token_id: engine.tokenizer?.eos_token_id,
90
+ return_full_text: false,
91
+ ...options
92
+ });
93
+ text = extractText(out);
94
+ } else {
95
+ throw new Error('Unknown engine type');
96
+ }
97
+
98
  self.postMessage({ id, type: 'status', status: 'inference-done', model: modelName });
99
  self.postMessage({ id, type: 'response', result: text });
100
  } catch (err) {
src/worker/curated-model-list.json CHANGED
@@ -1,4 +1,30 @@
1
  [
2
  {
3
  "id": "Xenova/llama2.c-stories15M",
4
  "name": "Llama2.c Stories 15M",
 
1
  [
2
+ {
3
+ "id": "mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC",
4
+ "name": "Llama 3.1 8B Instruct q4f32",
5
+ "model_type": "llama",
6
+ "architectures": ["llama"],
7
+ "classification": "gen",
8
+ "confidence": "high",
9
+ "size_hint": "58Mb",
10
+ "fetchStatus": "ok",
11
+ "hasTokenizer": true,
12
+ "hasOnnxModel": true,
13
+ "isTransformersJsReady": true
14
+ },
15
+ {
16
+ "id": "mlc-ai/gemma-2-9b-it-q4f16_1-MLC",
17
+ "name": "Gemma 2 9B IT q4f16",
18
+ "model_type": "gemma",
19
+ "architectures": ["gemma"],
20
+ "classification": "gen",
21
+ "confidence": "high",
22
+ "size_hint": "5Gb",
23
+ "fetchStatus": "ok",
24
+ "hasTokenizer": true,
25
+ "hasOnnxModel": true,
26
+ "isTransformersJsReady": true
27
+ },
28
  {
29
  "id": "Xenova/llama2.c-stories15M",
30
  "name": "Llama2.c Stories 15M",
src/worker/model-cache.js CHANGED
@@ -1,6 +1,7 @@
1
  // @ts-check
2
 
3
  import { pipeline, env } from '@huggingface/transformers';
 
4
 
5
  import { loadModelCore } from './load-model-core';
6
 
@@ -8,6 +9,8 @@ export class ModelCache {
8
  cache = new Map();
9
  /** @type {import('@huggingface/transformers').DeviceType | undefined} */
10
  backend = undefined;
11
 
12
  env = env;
13
 
@@ -32,6 +35,24 @@ export class ModelCache {
32
  return this.cache.get(modelName) || this._loadModelAndStore({ modelName });
33
  }
34
 
35
  /**
36
  * @param {{
37
  * modelName: string
@@ -40,38 +61,7 @@ export class ModelCache {
40
  _loadModelAndStore({ modelName }) {
41
  if (!this.backend) this.backend = detectTransformersBackend();
42
  // Create a loader promise that will try multiple backends in order.
43
- const loader = (async () => {
44
- // candidate order: detected backend first, then common fallbacks
45
- let candidates = ['webgpu', 'gpu', 'wasm'];
46
- // candidates = ['gpu', 'wasm'];
47
- candidates = candidates.slice(candidates.indexOf(this.backend || 'wasm'));
48
- candidates = ['auto'];// , 'wasm'];
49
-
50
- let errs = [];
51
- console.log('Trying candidates ', candidates);
52
- for (const device of candidates) {
53
- try {
54
- const model = await loadModelCore({
55
- modelName,
56
- device: /** @type {import('@huggingface/transformers').DeviceType} */ (device)
57
- });
58
- // on success, update backend to the working device and store model
59
- this.backend = /** @type {import('@huggingface/transformers').DeviceType} */ (device);
60
- this.cache.set(modelName, model);
61
- return model;
62
- } catch (err) {
63
- console.log('Failed ', device, ' ', err);
64
- errs.push(device + ': ' + err.stack);
65
- // continue to next candidate
66
- }
67
- }
68
-
69
- // none succeeded
70
- const err = new Error(
71
- 'Backends failed: ' + JSON.stringify(candidates) + ', errors:\n\n' +
72
- errs.join('\n\n'));
73
- throw err;
74
- })();
75
 
76
  // store the in-progress promise so concurrent requests reuse it
77
  this.cache.set(modelName, loader);
@@ -88,6 +78,81 @@ export class ModelCache {
88
  return loader;
89
  }
90
 
91
  }
92
 
93
  export function detectTransformersBackend() {
 
1
  // @ts-check
2
 
3
  import { pipeline, env } from '@huggingface/transformers';
4
+ import * as webllm from '@mlc-ai/web-llm';
5
 
6
  import { loadModelCore } from './load-model-core';
7
 
 
9
  cache = new Map();
10
  /** @type {import('@huggingface/transformers').DeviceType | undefined} */
11
  backend = undefined;
12
+ /** @type {{ possible: boolean, lastError?: string } | undefined} */
13
+ webllmProbe = undefined;
14
 
15
  env = env;
16
 
 
35
  return this.cache.get(modelName) || this._loadModelAndStore({ modelName });
36
  }
37
 
38
+ /**
39
+ * Lightweight probe to detect WebLLM API availability (advisory only)
40
+ */
41
+ probeWebLLM() {
42
+ if (this.webllmProbe) return this.webllmProbe;
43
+
44
+ try {
45
+ // Check if basic WebLLM APIs are available
46
+ const hasWebLLM = typeof webllm?.CreateMLCEngine === 'function' &&
47
+ typeof webllm?.prebuiltAppConfig !== 'undefined';
48
+ this.webllmProbe = { possible: hasWebLLM };
49
+ } catch (err) {
50
+ this.webllmProbe = { possible: false, lastError: String(err) };
51
+ }
52
+
53
+ return this.webllmProbe;
54
+ }
55
+
56
  /**
57
  * @param {{
58
  * modelName: string
 
61
  _loadModelAndStore({ modelName }) {
62
  if (!this.backend) this.backend = detectTransformersBackend();
63
  // Create a loader promise that will try multiple backends in order.
64
+ const loader = this._loadWebLLMOrFallbackToTransformersModelNow({ modelName });
65
 
66
  // store the in-progress promise so concurrent requests reuse it
67
  this.cache.set(modelName, loader);
 
78
  return loader;
79
  }
80
 
81
+ async _loadWebLLMOrFallbackToTransformersModelNow({ modelName }) {
82
+ const probe = this.probeWebLLM();
83
+
84
+ // Try WebLLM first if probe suggests it's possible
85
+ if (probe.possible) {
86
+ try {
87
+ const webLLMId = modelName.split('/').pop() || modelName;
88
+ console.log(`Loading ${webLLMId} via WebLLM...`);
89
+ const engine = await webllm.CreateMLCEngine(webLLMId, {
90
+ appConfig: webllm.prebuiltAppConfig
91
+ });
92
+
93
+ // Quick end-to-end validation: run a very small prompt to ensure the
94
+ // engine responds correctly before caching it. If this fails we
95
+ // throw so the outer catch falls back to Transformers.js.
96
+ try {
97
+ const webllmEngine = engine;
98
+ const testResp = await webllmEngine.chat.completions.create({
99
+ messages: [{ role: 'user', content: 'Hello' }],
100
+ max_tokens: 8,
101
+ temperature: 0.2
102
+ });
103
+ const testText = testResp?.choices?.[0]?.message?.content ?? '';
104
+ if (!testText || String(testText).trim() === '') {
105
+ throw new Error('WebLLM test prompt returned empty response');
106
+ }
107
+ } catch (e) {
108
+ throw new Error('WebLLM validation failed: ' + String(e));
109
+ }
110
+
111
+ console.log(`WebLLM loaded: ${webLLMId}`);
112
+ return engine;
113
+ } catch (err) {
114
+ console.log(`WebLLM failed for ${modelName}: ${err.message}`);
115
+ // Fall through to Transformers.js
116
+ }
117
+ }
118
+
119
+ // Fallback to Transformers.js
120
+ return this._loadTransformersModelNow({ modelName });
121
+ }
122
+
123
+ async _loadTransformersModelNow({ modelName }) {
124
+ // candidate order: detected backend first, then common fallbacks
125
+ let candidates = ['webgpu', 'gpu', 'wasm'];
126
+ // candidates = ['gpu', 'wasm'];
127
+ candidates = candidates.slice(candidates.indexOf(this.backend || 'wasm'));
128
+ candidates = ['auto'];// , 'wasm'];
129
+
130
+ let errs = [];
131
+ console.log('Trying candidates ', candidates);
132
+ for (const device of candidates) {
133
+ try {
134
+ const model = await loadModelCore({
135
+ modelName,
136
+ device: /** @type {import('@huggingface/transformers').DeviceType} */ (device)
137
+ });
138
+ // on success, update backend to the working device and store model
139
+ this.backend = /** @type {import('@huggingface/transformers').DeviceType} */ (device);
140
+ this.cache.set(modelName, model);
141
+ return model;
142
+ } catch (err) {
143
+ console.log('Failed ', device, ' ', err);
144
+ errs.push(device + ': ' + err.stack);
145
+ // continue to next candidate
146
+ }
147
+ }
148
+
149
+ // none succeeded
150
+ const err = new Error(
151
+ 'Backends failed: ' + JSON.stringify(candidates) + ', errors:\n\n' +
152
+ errs.join('\n\n'));
153
+ throw err;
154
+ }
155
+
156
  }
157
 
158
  export function detectTransformersBackend() {