mihailik committed
Commit
57f49ec
·
1 Parent(s): 45a40b2

Adding WebLLM backend as an option.

package-lock.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "name": "localm",
3
- "version": "1.1.35",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
  "name": "localm",
9
- "version": "1.1.35",
10
  "license": "ISC",
11
  "dependencies": {
12
  "@huggingface/transformers": "^3.7.2",
13
  "@milkdown/crepe": "^7.15.3",
 
14
  "esbuild": "^0.25.9"
15
  }
16
  },
@@ -1901,6 +1902,15 @@
1901
  "tslib": "^2.8.1"
1902
  }
1903
  },
1904
  "node_modules/@protobufjs/aspromise": {
1905
  "version": "1.1.2",
1906
  "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
@@ -2599,6 +2609,19 @@
2599
  "integrity": "sha512-mKnC+QJ9pWVzv+C4/U3rRsHapFfHvQFoFB92e52xeyGMcX6/OlIl78je1u8vePzYZSkkogMPJ2yjxxsb89cxyw==",
2600
  "license": "MIT"
2601
  },
2602
  "node_modules/long": {
2603
  "version": "5.3.2",
2604
  "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
 
1
  {
2
  "name": "localm",
3
+ "version": "1.1.38",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
  "name": "localm",
9
+ "version": "1.1.38",
10
  "license": "ISC",
11
  "dependencies": {
12
  "@huggingface/transformers": "^3.7.2",
13
  "@milkdown/crepe": "^7.15.3",
14
+ "@mlc-ai/web-llm": "^0.2.79",
15
  "esbuild": "^0.25.9"
16
  }
17
  },
 
1902
  "tslib": "^2.8.1"
1903
  }
1904
  },
1905
+ "node_modules/@mlc-ai/web-llm": {
1906
+ "version": "0.2.79",
1907
+ "resolved": "https://registry.npmjs.org/@mlc-ai/web-llm/-/web-llm-0.2.79.tgz",
1908
+ "integrity": "sha512-Hy1ZHQ0o2bZGZoVnGK48+fts/ZSKwLe96xjvqL/6C59Mem9HoHTcFE07NC2E23mRmhd01tL655N6CPeYmwWgwQ==",
1909
+ "license": "Apache-2.0",
1910
+ "dependencies": {
1911
+ "loglevel": "^1.9.1"
1912
+ }
1913
+ },
1914
  "node_modules/@protobufjs/aspromise": {
1915
  "version": "1.1.2",
1916
  "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
 
2609
  "integrity": "sha512-mKnC+QJ9pWVzv+C4/U3rRsHapFfHvQFoFB92e52xeyGMcX6/OlIl78je1u8vePzYZSkkogMPJ2yjxxsb89cxyw==",
2610
  "license": "MIT"
2611
  },
2612
+ "node_modules/loglevel": {
2613
+ "version": "1.9.2",
2614
+ "resolved": "https://registry.npmjs.org/loglevel/-/loglevel-1.9.2.tgz",
2615
+ "integrity": "sha512-HgMmCqIJSAKqo68l0rS2AanEWfkxaZ5wNiEFb5ggm08lDs9Xl2KxBlX3PTcaD2chBM1gXAYf491/M2Rv8Jwayg==",
2616
+ "license": "MIT",
2617
+ "engines": {
2618
+ "node": ">= 0.6.0"
2619
+ },
2620
+ "funding": {
2621
+ "type": "tidelift",
2622
+ "url": "https://tidelift.com/funding/github/npm/loglevel"
2623
+ }
2624
+ },
2625
  "node_modules/long": {
2626
  "version": "5.3.2",
2627
  "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
package.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "localm",
3
- "version": "1.1.38",
4
  "description": "Chat application",
5
  "scripts": {
6
  "build": "esbuild src/index.js --target=es6 --bundle --sourcemap --outfile=./index.js --format=iife --external:fs --external:path --external:child_process --external:ws --external:katex/dist/katex.min.css",
@@ -22,6 +22,7 @@
22
  "dependencies": {
23
  "@huggingface/transformers": "^3.7.2",
24
  "@milkdown/crepe": "^7.15.3",
 
25
  "esbuild": "^0.25.9"
26
  }
27
  }
 
1
  {
2
  "name": "localm",
3
+ "version": "1.2.0",
4
  "description": "Chat application",
5
  "scripts": {
6
  "build": "esbuild src/index.js --target=es6 --bundle --sourcemap --outfile=./index.js --format=iife --external:fs --external:path --external:child_process --external:ws --external:katex/dist/katex.min.css",
 
22
  "dependencies": {
23
  "@huggingface/transformers": "^3.7.2",
24
  "@milkdown/crepe": "^7.15.3",
25
+ "@mlc-ai/web-llm": "^0.2.79",
26
  "esbuild": "^0.25.9"
27
  }
28
  }
plans/2025-08-23-webllm-integration/1-assessment.md ADDED
@@ -0,0 +1,326 @@
1
+ # WebLLM Integration Plans: Three Alternative Approaches
2
+
3
+ ## Executive Summary
4
+
5
+ This document outlines three comprehensive approaches for integrating WebLLM alongside the existing Transformers.js backend in the LocalM application. Each plan addresses the dual-backend strategy where WebLLM is attempted first for supported models, with Transformers.js as a fallback.
6
+
7
+ ## Current Architecture Analysis
8
+
9
+ **Current Boot Worker Flow:**
10
+
11
+ 1. `boot-worker.js` handles message routing (`loadModel`, `runPrompt`)
12
+ 2. `model-cache.js` manages Transformers.js model loading with backend detection
13
+ 3. `load-model-core.js` creates pipelines using `@huggingface/transformers`
14
+ 4. Curated model list provides stable model set for consistent loading
15
+
16
+ **Key Integration Requirements:**
17
+
18
+ * WebLLM first, Transformers.js fallback loading strategy
19
+ * Dual inference API handling based on loaded model type
20
+ * Unified model management and caching
21
+ * Consistent progress reporting and error handling
22
+
23
+ ***
24
+
25
+ ## Plan 1: Unified Backend Manager Architecture
26
+
27
+ ### Philosophy
28
+
29
+ Create a sophisticated backend abstraction layer that treats WebLLM and Transformers.js as interchangeable engines, with intelligent model routing and unified API surface.
30
+
31
+ ### Implementation Steps
32
+
33
+ #### Step 1: Create Backend Registry System
34
+
35
+ **Description:** Implement a registry pattern for backend engines with capability detection
36
+ **Deliverable:** `BackendRegistry` class with engine registration and capability query methods
37
+
38
+ ```JavaScript
39
+ class BackendRegistry {
40
+ registerBackend(name, engine, capabilities)
41
+ getCompatibleBackends(modelId)
42
+ createEngine(backendName, config)
43
+ }
44
+ ```
45
+
46
+ **Risks:** Complex abstraction may obscure debugging; requires deep understanding of both APIs
47
+ **Mitigation:** Extensive logging and backend-specific error passthrough
48
+
49
+ #### Step 2: Develop Backend Engine Adapters
50
+
51
+ **Description:** Create adapter classes that normalize WebLLM and Transformers.js APIs
52
+ **Deliverable:** `WebLLMAdapter` and `TransformersAdapter` implementing common `IBackendEngine` interface
53
+
54
+ ```JavaScript
55
+ interface IBackendEngine {
56
+ async loadModel(modelId, progressCallback)
57
+ async generateText(prompt, options)
58
+ getModelInfo()
59
+ dispose()
60
+ }
61
+ ```
62
+
63
+ **Risks:** API impedance mismatch between backends; feature parity challenges
64
+ **Mitigation:** Adapter pattern with clear feature capability flags
65
+
66
+ #### Step 3: Implement Model Compatibility Matrix
67
+
68
+ **Description:** Build comprehensive model support matrix mapping models to compatible backends
69
+ **Deliverable:** Enhanced curated model list with backend compatibility metadata
70
+
71
+ ```JSON
72
+ {
73
+ "id": "Llama-3.1-8B-Instruct",
74
+ "backends": {
75
+ "webllm": { "supported": true, "priority": 1, "format": "MLC" },
76
+ "transformers": { "supported": true, "priority": 2, "format": "ONNX" }
77
+ }
78
+ }
79
+ ```
80
+
81
+ **Risks:** Maintenance overhead for compatibility matrix; model format inconsistencies
82
+ **Mitigation:** Automated testing pipeline for model compatibility validation
83
+
84
+ #### Step 4: Create Unified Model Cache
85
+
86
+ **Description:** Replace current ModelCache with multi-backend aware cache
87
+ **Deliverable:** `UnifiedModelCache` with backend-aware storage and retrieval
88
+
89
+ ```JavaScript
90
+ class UnifiedModelCache {
91
+ async getModel(modelId, preferredBackend)
92
+ async loadWithFallback(modelId, backendPriority)
93
+ cacheModel(modelId, backend, modelInstance)
94
+ }
95
+ ```
96
+
97
+ **Risks:** Cache invalidation complexity; memory management across different backend types
98
+ **Mitigation:** Clear cache lifecycle management and backend-specific disposal patterns
99
+
100
+ #### Step 5: Implement Smart Backend Selection
101
+
102
+ **Description:** Create intelligent backend selection based on device capabilities and model compatibility
103
+ **Deliverable:** `BackendSelector` with device detection and optimal backend recommendation
104
+ **Risks:** WebGPU detection inconsistencies; backend preference conflicts
105
+ **Mitigation:** Fallback chains with user preference override capabilities
106
+
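One possible shape for this step is sketched below. `BackendSelector`, its method names, and the capability checks are assumptions drawn from the deliverable above; the `backends` metadata follows the compatibility-matrix example from Step 3.

```JavaScript
// Illustrative sketch only: pick a backend from the compatibility-matrix
// metadata shown in Step 3, filtered by simple device capability checks.
class BackendSelector {
  detectCapabilities() {
    return {
      webgpu: typeof navigator !== 'undefined' && !!navigator.gpu,
      wasm: typeof WebAssembly !== 'undefined'
    };
  }

  /** Returns the highest-priority supported backend the device can run. */
  selectBackend(modelEntry, userPreference) {
    const caps = this.detectCapabilities();
    const candidates = Object.entries(modelEntry.backends)
      .filter(([, info]) => info.supported)
      .filter(([name]) => name !== 'webllm' || caps.webgpu) // WebLLM needs WebGPU
      .sort(([, a], [, b]) => a.priority - b.priority)
      .map(([name]) => name);
    // A user preference wins only when it is actually available (fallback chain otherwise).
    if (userPreference && candidates.includes(userPreference)) return userPreference;
    return candidates[0] ?? 'transformers';
  }
}
```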
107
+ #### Step 6: Update Boot Worker Integration
108
+
109
+ **Description:** Modify boot-worker to use unified backend system
110
+ **Deliverable:** Updated `boot-worker.js` with unified model loading and inference
111
+ **Risks:** Breaking existing functionality; complex error handling
112
+ **Mitigation:** Feature flags for gradual rollout; comprehensive testing suite
113
+
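A rough sketch of how the worker handler could delegate to the unified layer. `registry`, `selector`, and `modelList` stand for the hypothetical objects from the earlier steps and are injected rather than imported, so this is a shape suggestion, not the final handler.

```JavaScript
// Sketch only: the worker's loadModel handler delegating to the unified layer.
// `registry`, `selector`, and `modelList` are the hypothetical pieces from
// Steps 1, 3, and 5, passed in so the handler stays testable.
export function createLoadModelHandler({ registry, selector, modelList }) {
  return async function handleLoadModel({ modelName, id }) {
    const entry = modelList.find((m) => m.id === modelName);
    if (!entry) throw new Error(`Unknown model: ${modelName}`);
    const backend = selector.selectBackend(entry);
    const engine = await registry.createEngine(backend, { modelName });
    self.postMessage({ id, type: 'status', status: 'model-loaded', model: modelName, backend });
    return engine;
  };
}
```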
114
+ ### Plan Summary
115
+
116
+ **Why this plan is good:** Provides maximum flexibility and maintainability through clean abstractions. Enables easy addition of future backends. Offers sophisticated model routing and optimization.
117
+
118
+ **How it makes the app better:** Creates a scalable foundation for multiple ML backends, optimizes performance through intelligent backend selection, and provides unified developer experience while maintaining backend-specific optimizations.
119
+
120
+ ***
121
+
122
+ ## Plan 2: Progressive Enhancement Strategy
123
+
124
+ ### Philosophy
125
+
126
+ Implement WebLLM as an enhanced capability layer that progressively enhances the existing Transformers.js foundation, maintaining backward compatibility while adding advanced features.
127
+
128
+ ### Implementation Steps
129
+
130
+ #### Step 1: Create WebLLM Detection and Initialization
131
+
132
+ **Description:** Add WebLLM capability detection and optional initialization
133
+ **Deliverable:** `WebLLMCapabilities` module with environment detection
134
+
135
+ ```JavaScript
136
+ class WebLLMCapabilities {
137
+ static async isSupported()
138
+ static async initialize()
139
+ static getAvailableModels()
140
+ }
141
+ ```
142
+
143
+ **Risks:** WebGPU availability detection false positives; initialization timing issues
144
+ **Mitigation:** Robust feature detection with graceful degradation to the fallback backend
145
+
146
+ #### Step 2: Extend Model Metadata with WebLLM Support Flags
147
+
148
+ **Description:** Enhance existing curated model list with WebLLM compatibility flags
149
+ **Deliverable:** Updated `curated-model-list.json` with progressive enhancement metadata
150
+
151
+ ```JSON
152
+ {
153
+ "id": "existing-model",
154
+ "webllm": {
155
+ "supported": true,
156
+ "model_lib": "url-to-wasm",
157
+ "performance_tier": "high"
158
+ }
159
+ }
160
+ ```
161
+
162
+ **Risks:** Data schema versioning; metadata synchronization challenges
163
+ **Mitigation:** Schema validation and backward compatibility layers
164
+
165
+ #### Step 3: Implement Hybrid Model Loader
166
+
167
+ **Description:** Extend existing ModelCache with WebLLM loading capabilities
168
+ **Deliverable:** Enhanced `model-cache.js` with dual-loading strategy
169
+
170
+ ```JavaScript
171
+ class EnhancedModelCache extends ModelCache {
172
+ async loadWithWebLLM(modelName)
173
+ async loadWithTransformers(modelName) // existing
174
+ async getModelWithPreference(modelName, preferWebLLM = true)
175
+ }
176
+ ```
177
+
178
+ **Risks:** Code complexity in existing critical path; regression potential
179
+ **Mitigation:** Incremental enhancement with feature flags and A/B testing
180
+
181
+ #### Step 4: Create Unified Inference Interface
182
+
183
+ **Description:** Build adapter layer for consistent inference API across backends
184
+ **Deliverable:** `InferenceAdapter` that normalizes WebLLM and Transformers.js calls
185
+ **Risks:** API abstraction leakage; performance overhead from adaptation layer
186
+ **Mitigation:** Minimal abstraction with direct passthrough where possible
187
+
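One way such an adapter could look, assuming a WebLLM-style engine exposes the OpenAI-compatible chat API and a Transformers.js engine is a callable text-generation pipeline; the option mapping is deliberately conservative.

```JavaScript
// Sketch of a thin inference adapter: detect the engine flavour and pass
// through with minimal option mapping. Shapes are assumptions, not a spec.
export async function runInference(engine, prompt, options = {}) {
  if (engine?.chat?.completions?.create) {
    // WebLLM-style engine (OpenAI-compatible chat API)
    const res = await engine.chat.completions.create({
      messages: [{ role: 'user', content: prompt }],
      max_tokens: options.max_new_tokens ?? 250,
      temperature: options.temperature ?? 0.7
    });
    return res.choices[0]?.message?.content ?? '';
  }
  if (typeof engine === 'function') {
    // Transformers.js text-generation pipeline
    const out = await engine(prompt, { max_new_tokens: 250, return_full_text: false, ...options });
    return Array.isArray(out) ? out[0]?.generated_text ?? '' : String(out);
  }
  throw new Error('Unknown engine type');
}
```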
188
+ #### Step 5: Implement Progressive Model Loading
189
+
190
+ **Description:** Create graceful fallback system from WebLLM to Transformers.js
191
+ **Deliverable:** Enhanced `loadModel` handler with progressive loading strategy
192
+ **Risks:** Complex error handling; user experience during fallback scenarios
193
+ **Mitigation:** Clear progress indication and transparent fallback communication
194
+
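Sketched below under the assumption that the loader methods from Step 3 exist; the `onProgress` callback is illustrative and only serves the "clear progress indication" mitigation.

```JavaScript
// Sketch: optimistic WebLLM load with a transparent, reported fallback.
// Assumes the loader methods from Step 3; `onProgress` is illustrative.
async function loadModelProgressively(cache, modelName, onProgress = () => {}) {
  try {
    onProgress({ stage: 'webllm-attempt', modelName });
    return await cache.loadWithWebLLM(modelName);
  } catch (err) {
    // Report the fallback instead of hiding it from the UI.
    onProgress({ stage: 'fallback-to-transformers', modelName, reason: String(err) });
    return cache.loadWithTransformers(modelName);
  }
}
```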
195
+ #### Step 6: Add Advanced WebLLM Features
196
+
197
+ **Description:** Expose WebLLM-specific features like streaming and JSON mode
198
+ **Deliverable:** Enhanced inference options and streaming capabilities
199
+ **Risks:** Feature parity maintenance; increased API surface area
200
+ **Mitigation:** Feature capability detection and graceful degradation
201
+
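A small sketch of the streaming case, assuming the engine exposes WebLLM's OpenAI-compatible `chat.completions.create` with `stream: true`; non-streaming backends are rejected here rather than emulated.

```JavaScript
// Sketch: token streaming through the OpenAI-compatible chat API, guarded by
// a capability check so callers can degrade to a single response elsewhere.
async function* streamCompletion(engine, prompt) {
  if (!engine?.chat?.completions?.create) {
    throw new Error('Streaming requires a WebLLM-style engine');
  }
  const chunks = await engine.chat.completions.create({
    messages: [{ role: 'user', content: prompt }],
    stream: true
  });
  for await (const chunk of chunks) {
    const delta = chunk.choices?.[0]?.delta?.content;
    if (delta) yield delta;
  }
}
```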
202
+ ### Plan Summary
203
+
204
+ **Why this plan is good:** Minimizes risk by building on existing foundation. Maintains full backward compatibility. Allows gradual migration and testing. Preserves investment in current Transformers.js integration.
205
+
206
+ **How it makes the app better:** Provides immediate performance benefits for supported models while maintaining reliability. Enables advanced features like better streaming without breaking existing functionality. Creates clear upgrade path for users.
207
+
208
+ ***
209
+
210
+ ## Plan 3: Microservice Backend Architecture
211
+
212
+ ### Philosophy
213
+
214
+ Implement WebLLM and Transformers.js as independent microservice-style modules with a central orchestrator, enabling maximum isolation and specialized optimization for each backend.
215
+
216
+ ### Implementation Steps
217
+
218
+ #### Step 1: Create Backend Service Abstractions
219
+
220
+ **Description:** Design service interfaces for independent backend implementations
221
+ **Deliverable:** `IBackendService` interface and base service framework
222
+
223
+ ```JavaScript
224
+ interface IBackendService {
225
+ async initialize(config)
226
+ async loadModel(modelSpec)
227
+ async inference(request)
228
+ async dispose()
229
+ getCapabilities()
230
+ }
231
+ ```
232
+
233
+ **Risks:** Over-engineering; increased complexity for simple use cases
234
+ **Mitigation:** Keep interfaces minimal and focused on essential operations
235
+
236
+ #### Step 2: Implement WebLLM Service Module
237
+
238
+ **Description:** Create dedicated WebLLM service with full feature implementation
239
+ **Deliverable:** `WebLLMService` with complete WebLLM integration
240
+
241
+ ```JavaScript
242
+ class WebLLMService implements IBackendService {
243
+ async loadModel(modelSpec) { /* WebLLM-specific loading */ }
244
+ async inference(request) { /* OpenAI-compatible API */ }
245
+ async streamInference(request) { /* Streaming support */ }
246
+ }
247
+ ```
248
+
249
+ **Risks:** WebLLM-specific quirks and edge cases; model format compatibility
250
+ **Mitigation:** Comprehensive testing with various model types and sizes
251
+
252
+ #### Step 3: Refactor Transformers.js as Service
253
+
254
+ **Description:** Encapsulate existing Transformers.js logic into service module
255
+ **Deliverable:** `TransformersService` extracted from current implementation
256
+ **Risks:** Breaking existing functionality during refactor; regression introduction
257
+ **Mitigation:** Comprehensive test coverage before refactoring; gradual migration
258
+
259
+ #### Step 4: Create Service Orchestrator
260
+
261
+ **Description:** Build central orchestrator for service selection and lifecycle management
262
+ **Deliverable:** `BackendOrchestrator` with service discovery and routing
263
+
264
+ ```JavaScript
265
+ class BackendOrchestrator {
266
+ async selectService(modelId, requirements)
267
+ async routeRequest(request, servicePreference)
268
+ manageServiceLifecycle()
269
+ }
270
+ ```
271
+
272
+ **Risks:** Central point of failure; orchestration complexity
273
+ **Mitigation:** Robust error handling and service isolation patterns
274
+
275
+ #### Step 5: Implement Service Communication Layer
276
+
277
+ **Description:** Create communication protocol between orchestrator and services
278
+ **Deliverable:** Message-based communication with type-safe protocols
279
+ **Risks:** Communication overhead; debugging complexity across service boundaries
280
+ **Mitigation:** Clear logging and service health monitoring
281
+
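A minimal sketch of what the message protocol could look like; the envelope fields and the use of JSDoc typedefs in place of a formal schema are assumptions.

```JavaScript
// Sketch: a typed request/response envelope between orchestrator and services.
// Field names are illustrative; JSDoc stands in for a type-safe protocol.

/**
 * @typedef {{ id: string, service: 'webllm' | 'transformers',
 *             kind: 'loadModel' | 'inference' | 'dispose',
 *             payload: unknown }} ServiceRequest
 * @typedef {{ id: string, ok: boolean, result?: unknown, error?: string }} ServiceResponse
 */

/** Send one request to a service worker and await the matching response. */
export function sendToService(worker, /** @type {ServiceRequest} */ request) {
  return new Promise((resolve, reject) => {
    const onMessage = (ev) => {
      const res = /** @type {ServiceResponse} */ (ev.data);
      if (res.id !== request.id) return; // response for a different request
      worker.removeEventListener('message', onMessage);
      res.ok ? resolve(res.result) : reject(new Error(res.error ?? 'service error'));
    };
    worker.addEventListener('message', onMessage);
    worker.postMessage(request);
  });
}
```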
282
+ #### Step 6: Build Service Discovery and Health Monitoring
283
+
284
+ **Description:** Implement service capability detection and health monitoring
285
+ **Deliverable:** Service registry with capability announcement and health checks
286
+ **Risks:** Health check false positives; service state synchronization
287
+ **Mitigation:** Conservative health checks with manual override capabilities
288
+
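A conservative health check could be as small as the sketch below; the `service.inference` call and the tiny probe request are assumptions following the `IBackendService` interface from Step 1.

```JavaScript
// Sketch: a conservative health check that only reports "unhealthy" after
// repeated failures, to limit false positives.
async function checkServiceHealth(service, state = { failures: 0 }) {
  try {
    await service.inference({ prompt: 'ping', max_new_tokens: 1 }); // tiny probe request
    state.failures = 0;
    return { healthy: true };
  } catch (err) {
    state.failures += 1;
    return { healthy: state.failures < 3, lastError: String(err) };
  }
}
```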
289
+ #### Step 7: Create Worker Thread Integration
290
+
291
+ **Description:** Integrate services with worker thread architecture for performance isolation
292
+ **Deliverable:** Enhanced worker integration with service-specific worker threads
293
+ **Risks:** Worker communication complexity; resource management across threads
294
+ **Mitigation:** Clear worker lifecycle management and resource cleanup
295
+
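One way to isolate each service in its own worker is sketched here; the worker file names are hypothetical.

```JavaScript
// Sketch: one dedicated worker per backend service, created lazily and
// terminated on release. The worker file names are hypothetical.
const serviceWorkers = new Map();

export function getServiceWorker(service) {
  if (!serviceWorkers.has(service)) {
    const url = service === 'webllm' ? './webllm-service.js' : './transformers-service.js';
    serviceWorkers.set(service, new Worker(url, { type: 'module' }));
  }
  return serviceWorkers.get(service);
}

export function releaseServiceWorker(service) {
  serviceWorkers.get(service)?.terminate(); // frees that service's resources in isolation
  serviceWorkers.delete(service);
}
```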
296
+ ### Plan Summary
297
+
298
+ **Why this plan is good:** Provides maximum isolation and specialization for each backend. Enables independent development and testing of backend implementations. Creates clear separation of concerns with single responsibility services. Facilitates future backend additions with minimal existing code changes.
299
+
300
+ **How it makes the app better:** Enables optimal performance tuning for each backend independently. Provides robust fault isolation where one backend failure doesn't affect others. Creates modular architecture that supports independent scaling and optimization. Enables A/B testing of different backends for same models.
301
+
302
+ ***
303
+
304
+ ## Comparative Analysis
305
+
306
+ | Aspect | Plan 1: Unified Backend | Plan 2: Progressive Enhancement | Plan 3: Microservice Architecture |
307
+ | -------------------------- | ----------------------- | ------------------------------- | --------------------------------- |
308
+ | **Implementation Risk** | Medium-High | Low-Medium | High |
309
+ | **Development Time** | 3-4 weeks | 2-3 weeks | 4-6 weeks |
310
+ | **Maintainability** | High | Medium | Very High |
311
+ | **Performance** | Good | Good | Excellent |
312
+ | **Future Extensibility** | Very Good | Good | Excellent |
313
+ | **Backward Compatibility** | Medium | Excellent | Good |
314
+ | **Testing Complexity** | Medium | Low | High |
315
+ | **User Experience Impact** | Medium | Low | Low |
316
+
317
+ ## Recommended Approach
318
+
319
+ Based on the analysis, **Plan 2: Progressive Enhancement Strategy** is recommended for initial implementation due to:
320
+
321
+ 1. **Lower Risk**: Builds on existing working foundation
322
+ 2. **Faster Time to Value**: Can deliver WebLLM benefits in 2-3 weeks
323
+ 3. **Minimal Disruption**: Maintains existing functionality during transition
324
+ 4. **Clear Migration Path**: Enables future adoption of more sophisticated architectures
325
+
326
+ The progressive enhancement approach allows immediate benefits while preserving the option to evolve toward Plan 1 or Plan 3 architectures as requirements mature and the codebase stabilizes with dual-backend support.
plans/2025-08-23-webllm-integration/2-assessment-rejection.md ADDED
@@ -0,0 +1,48 @@
1
+
2
+ # Review: critique of WebLLM integration plans (short)
3
+
4
+ This document records a concise, one‑page critique of the three proposals in
5
+ `webllm-integration-plans.md`. The focus is strictly on unnecessary complexity,
6
+ maintenance risk, and how each plan deviates from the original, tightly scoped
7
+ requirement: "optimistic WebLLM load with Transformers.js fallback, and simple
8
+ runtime routing at inference time."
9
+
10
+ ## Checklist (requirements extracted)
11
+ - Attempt WebLLM first when loading a model in `boot-worker` — Required
12
+ - If WebLLM load fails, fallback to Transformers.js — Required
13
+ - At inference time, route to WebLLM or Transformers.js based on the loaded model — Required
14
+
15
+ Status: The three plans nominally address these rules, but each layers
16
+ additional architecture that is not required by the stated behavior.
17
+
18
+ ## Plan 1 — Unified Backend Manager
19
+ - What it proposes: registry, adapters, compatibility matrix, unified cache, smart selector.
20
+ - Why it’s over‑engineered: converts a simple dual‑backend decision into a general multi‑backend platform.
21
+ - Specific harms:
22
+ - Large maintenance surface: many new modules to design, document, and keep in sync.
23
+ - Harder debugging: faults are displaced into adapter/registry layers.
24
+ - Test explosion: compatibility matrix and routing logic require extensive tests.
25
+ - Delayed delivery: substantial upfront work with little immediate value.
26
+ - Salvageable idea: a very small, local adapter contract or a backend marker can be useful — but only if kept intentionally minimal.
27
+
28
+ ## Plan 2 — Progressive Enhancement
29
+ - What it proposes: capability detection, curated metadata changes, hybrid loader, small inference adapter.
30
+ - Why it still feels heavy: it expands metadata and loader paths despite the requirement being a single optimistic attempt + fallback.
31
+ - Specific harms:
32
+ - Metadata maintenance and schema versioning overhead.
33
+ - Increased regression risk by touching the hot path (`model-cache`).
34
+ - API leakage: adapters can hide backend differences and cause subtle runtime mismatches.
35
+ - Merit: conceptually the safest approach; its incremental philosophy is appropriate — but the plan should avoid broad metadata and API surface growth at this stage.
36
+
37
+ ## Plan 3 — Microservice Backend Architecture
38
+ - What it proposes: independent backend services, orchestrator, IPC/protocols, health checks, worker isolation.
39
+ - Why it’s inappropriate now: it’s a heavy structural shift that doesn’t fit an in‑browser, worker‑based app nor the simple requirement.
40
+ - Specific harms:
41
+ - Severe implementation and operational overhead.
42
+ - Debugging and runtime complexity across service boundaries.
43
+ - Overfitting server patterns to client‑side code.
44
+
45
+ ## Summary
46
+ All three plans contain useful long‑term ideas, but they escalate architecture well beyond the immediate need. Plan 2’s incremental mindset is the closest fit, yet even it introduces schema and loader surface growth that is not required today. Plans 1 and 3 add costly abstractions that will negatively affect maintainability, testing, and delivery speed if implemented now.
47
+
48
+ Recommendation (for reviewers): preserve the useful concepts (capability detection, explicit backend marker) but avoid registry/orchestrator layers and wide metadata changes at this stage. Keep the initial implementation small and focused on the two behaviors the project must guarantee.
plans/2025-08-23-webllm-integration/3-assessment-revised.md ADDED
@@ -0,0 +1,250 @@
1
+ # WebLLM Integration: Revised Simple Plans
2
+
3
+ ## Executive Summary
4
+
5
+ After reviewing the critique of our initial over-engineered plans, this document presents three focused, minimal approaches for integrating WebLLM with Transformers.js fallback. Each plan prioritizes simple if-statements over complex abstractions, minimal code footprint, and easy maintenance.
6
+
7
+ **Core Requirement Recap:**
8
+
9
+ * Attempt WebLLM first when loading a model
10
+ * If WebLLM load fails, fallback to Transformers.js
11
+ * At inference time, route to appropriate backend based on loaded model type
12
+
13
+ ***
14
+
15
+ ## Plan A: Inline WebLLM Integration (Simplest)
16
+
17
+ ### Goal
18
+
19
+ Integrate WebLLM into the existing `ModelCache` class with absolute minimum code changes, preserving all existing functionality while adding optimistic WebLLM loading with Transformers.js fallback.
20
+
21
+ ### Philosophy
22
+
23
+ Add WebLLM directly into the existing `ModelCache` class with minimal changes. No new abstractions, no registries - just straightforward if-else logic in the existing loading flow.
24
+
25
+ ### Design Restrictions & Limitations
26
+
27
+ * **No new files**: All changes must be within existing files
28
+ * **No interface changes**: External API must remain identical
29
+ * **No breaking changes**: Existing Transformers.js behavior must be preserved exactly
30
+ * **Minimal dependencies**: Only add WebLLM import, no additional libraries
31
+ * **No configuration**: Keep WebLLM model selection automatic based on model name
32
+
33
+ ### Intentional Design Choices
34
+
35
+ 1. **Inline over modular**: Accept some code duplication to avoid abstraction complexity
36
+ 2. **Cache WebLLM availability check**: Prevent repeated import attempts
37
+ 3. **Identical inference interface**: Wrap WebLLM to match Transformers.js pipeline signature
38
+ 4. **Silent fallback**: Log WebLLM failures but don't surface them to UI
39
+ 5. **No WebLLM-specific features**: Stick to basic text generation only
40
+
41
+ ### Risk Assessment
42
+
43
+ #### High Risks
44
+
45
+ * **WebLLM import failures**: Dynamic imports may fail unpredictably
46
+ * *Mitigation*: Robust try-catch with cached failure state
47
+ * **Memory leaks**: WebLLM engines may not dispose properly
48
+ * *Mitigation*: Store engine reference for explicit cleanup
49
+ * **Interface mismatch**: WebLLM API differs significantly from Transformers.js
50
+ * *Mitigation*: Careful wrapper function with identical signatures
51
+
52
+ #### Medium Risks
53
+
54
+ * **Performance regression**: Additional async checks may slow loading
55
+ * *Mitigation*: Cache availability check result
56
+ * **Error message confusion**: Users may see Transformers.js errors when WebLLM was attempted
57
+ * *Mitigation*: Clear logging distinguishing between attempts
58
+
59
+ #### Low Risks
60
+
61
+ * **Code maintainability**: Inline logic may become hard to follow
62
+ * *Mitigation*: Comprehensive comments and clear variable naming
63
+
64
+ ### Potential Pitfalls & Avoidance
65
+
66
+ 1. **Pitfall**: WebLLM models have different naming conventions
67
+ * *Avoidance*: Start with exact model name matching, document differences
68
+
69
+ 2. **Pitfall**: WebLLM may load but fail during inference
70
+ * *Avoidance*: Include inference test during model loading phase
71
+
72
+ 3. **Pitfall**: Mixed backend state confusion in cache
73
+ * *Avoidance*: Clear backend type marking on cached models
74
+
75
+ 4. **Pitfall**: WebLLM engine disposal not called
76
+ * *Avoidance*: Store engine reference and implement cleanup in cache eviction
77
+
78
+ ### Implementation Details
79
+
80
+ #### Step 1: Add WebLLM availability detection (no code)
81
+
82
+ Goal: cheaply detect whether WebLLM is present in the runtime and cache that result.
83
+
84
+ Details: perform a single availability probe once (at worker startup or first model load). Cache the boolean result and last error text for diagnostics. The probe must be cheap and non-blocking for the UI thread; if the probe indicates WebLLM is unavailable, skip WebLLM attempts for the remainder of the session.
85
+
86
+ Risks and safeguards: the probe can succeed but engine creation still fail — store the probe result as advisory only, and always run a short, bounded health check when creating an actual engine. Log errors for debugging; do not surface probe failures to the user.
87
+
88
+ #### Step 2: Optimistic WebLLM attempt with bounded validation (no code)
89
+
90
+ Goal: when loading a model, try WebLLM first if the probe passed. If WebLLM load or a short validation inference fails, fall back to Transformers.js immediately.
91
+
92
+ Details: implement a two-phase load: (1) an optimistic WebLLM engine creation attempt with a short, fixed timeout and a lightweight validation (one small inference). If both complete successfully, mark the cached model as WebLLM-backed. If either times out or fails, swallow the error, log it, and run the existing Transformers.js loader unchanged.
93
+
94
+ Risks and safeguards: protect against long hangs by enforcing a timeout; validate the engine with a minimal inference to catch silent failures; keep errors in logs only and preserve original Transformers.js errors for UI-facing messages.
95
+
96
+ #### Step 3: Keep a simple backend marker and unified inference contract (no code)
97
+
98
+ Goal: mark each cached model with a tiny backend flag ("webllm" or "transformers") so runtime code can choose the correct inference path with one if-statement.
99
+
100
+ Details: ensure the wrapper used for WebLLM returns data in the same shape the rest of the code expects (a small set of fields). At inference time, do a single if-check on the backend marker and call the model accordingly. Map or normalize options conservatively.
101
+
102
+ Risks and safeguards: mapping may omit uncommon options — limit supported options initially and document them; validate return shape after inference and fall back to a meaningful error if malformed.
103
+
104
+ #### Step 4: Minimal disposal lifecycle and cleanup (no code)
105
+
106
+ Goal: provide an explicit, per-model disposal pathway to avoid leaking native resources.
107
+
108
+ Details: when a model is evicted or the worker shuts down, if the cached item exposes a disposal API, call it within try/catch and continue. Log disposal outcomes for visibility. Avoid automatic aggressive disposal during active inference.
109
+
110
+ Risks and safeguards: some engines may not implement disposal correctly — wrap calls in try/catch; avoid disposing in the middle of an active request; provide a single manual cleanup call for diagnostic use.
111
+
112
+ #### Testing, rollout and rollback (no code)
113
+
114
+ Testing: verify the following scenarios: WebLLM absent, WebLLM present but engine creation fails, WebLLM present and operates correctly, mixed model usage. Create deterministic mocks for each path for CI.
115
+
116
+ Rollout: feature-flag the WebLLM probe or gate the code behind a simple config toggle. For rollback, disable the probe; behavior returns to the previous Transformers.js-only flow.
117
+
118
+ Monitoring and diagnostics: log backend selection, load durations, timeouts, and validation failures to the console (or a dev-only telemetry sink). Provide a developer-only command to view cached model backends and last probe errors.
119
+
120
+ ### Implementation Time: 1-2 days
121
+
122
+ ### Risk Level: Very Low
123
+
124
+ ### Maintenance Overhead: Minimal
125
+
126
+ **Why this works:** It describes the same minimal inline integration without prescribing code up front, and it keeps the runtime simple and easily debuggable.
127
+
128
+ ***
129
+
130
+ ## Plan B: Dual-Path ModelCache (Balanced)
131
+
132
+ ### Goal
133
+
134
+ Create a deliberately clear but still small separation between the WebLLM and Transformers.js loading paths to make debugging and testing easier, while keeping a single public interface for callers.
135
+
136
+ ### Philosophy
137
+
138
+ Prefer explicit separate loader functions for each backend, but keep them private to the `ModelCache`. Use a single public `getModel` API and a tiny `modelBackends` registry (mapping modelName -> backend) for diagnostics.
139
+
140
+ ### Design Restrictions & Limitations
141
+
142
+ * The public `ModelCache` interface must not change.
143
+ * No large new frameworks, no registry/adapter abstractions beyond the single Map that records backend per model.
144
+ * WebLLM attempts remain optimistic and short-lived; Transformers.js remains the reliable fallback.
145
+
146
+ ### Intentional Design Choices
147
+
148
+ 1. Separate loader functions for clarity and testability.
149
+ 2. A small map to track which backend served each model for diagnostics only.
150
+ 3. Structured error objects so we can decide what to surface to the UI vs. what to log for debugging.
151
+
152
+ ### Risks and mitigations
153
+
154
+ * State synchronization: update cache and backend map in an atomic sequence so they cannot diverge. If a cached entry is a pending Promise, ensure the map only records the backend after the Promise resolves successfully.
155
+ * Complexity in tests: provide mocks for each loader and test all four combinations (webllm success/fail x transformers success/fail).
156
+
157
+ ### Step-by-step rollout (no code)
158
+
159
+ 1. Add a `modelBackends` map to the cache implementation.
160
+ 2. Implement two private loader routines: one for WebLLM and one for Transformers.js. Keep the WebLLM loader conservative: timeout, one validation call, wrap engine in a normalized interface.
161
+ 3. In the public loader, call the WebLLM loader first; on success update cache and `modelBackends` to "webllm". On failure, call Transformers.js loader and update `modelBackends` to "transformers".
162
+ 4. Ensure the public `getModel` returns the same shape regardless of backend.
163
+ 5. Add lightweight diagnostics: expose a developer method to list cached models with their backends and last load durations.
164
+
165
+ ### Testing, rollout and rollback
166
+
167
+ Testing: add unit tests for both loader functions with mocks; add integration tests that exercise the public `getModel` in all backend success/failure permutations.
168
+ Rollout: can be enabled behind a config flag or staged to a small percentage of users (developer-only first). Rollback is simply disabling WebLLM attempts or reverting the map updates.
169
+
170
+ ### Implementation Time: 2-3 days
171
+
172
+ ### Risk Level: Low
173
+
174
+ ### Maintenance Overhead: Low
175
+
176
+ **Why this works:** Slightly more structure than Plan A simplifies debugging and testing while still avoiding large abstractions.
177
+
178
+ ***
179
+
180
+ ## Plan C: Minimal WebLLM Module (Most Structured)
181
+
182
+ ### Goal
183
+
184
+ Extract WebLLM integration into one small, well-tested module that mirrors the existing Transformers.js contract. Keep the rest of the codebase unchanged and use the module from `ModelCache` when appropriate.
185
+
186
+ ### Philosophy
187
+
188
+ Encapsulate WebLLM specifics (probe, engine creation, validation, disposal) in a single file. That file exposes a tiny API: availability probe, loadModel(modelName) returning a normalized pipeline, and optional dispose methods.
189
+
190
+ ### Design Restrictions & Limitations
191
+
192
+ * Add exactly one new file/module; do not add registries or dispatch systems.
193
+ * The module must be lightweight, with no complex state beyond a cached availability flag and per-engine handles.
194
+ * The module must normalize outputs to the existing pipeline shape used by the rest of the app.
195
+
196
+ ### Intentional Design Choices
197
+
198
+ 1. Single responsibility: only WebLLM concerns go into the module.
199
+ 2. Identical interface: consumers should not need to know whether they call WebLLM or Transformers.js.
200
+ 3. Easier testing: the module can be mocked in unit tests without touching Transformers.js code.
201
+
202
+ ### Step-by-step rollout (no code)
203
+
204
+ 1. Create the WebLLM module with three exported functions: probeAvailability(), loadModel(modelName), disposeEngine(handle).
205
+ 2. `probeAvailability` runs a single cheap probe and caches the result for the session.
206
+ 3. `loadModel` attempts engine creation with a bounded timeout and runs a minimal validation inference, returning a normalized pipeline-like object on success or throwing on failure.
207
+ 4. Import the module into `ModelCache` and attempt to use it first; when it throws or times out, fall back to the existing Transformers.js loader.
208
+
209
+ ### Risks and mitigations
210
+
211
+ * Module drift: keep the module intentionally tiny so API changes are rare.
212
+ * Duplicate normalization logic: ensure the normalization contract is documented and shared between module and cache tests.
213
+
214
+ ### Testing, rollout and rollback
215
+
216
+ Testing: unit test the module aggressively (mocks for engine creation and inference). Integration test that ModelCache interacts with the module correctly and still falls back.
217
+ Rollout: feature flag the module usage. For rollback, remove the module import or disable the probe call.
218
+
219
+ ### Implementation Time: 2-3 days
220
+
221
+ ### Risk Level: Low
222
+
223
+ ### Maintenance Overhead: Low
224
+
225
+ **Why this works:** Clean separation for future growth while keeping runtime and surface area small.
226
+
227
+ ***
228
+
229
+ ## Comparison and Recommendation
230
+
231
+ | Aspect | Plan A: Inline | Plan B: Dual-Path | Plan C: Module |
232
+ | ----------------------------- | ------------------- | ------------------- | ---------------------- |
233
+ | **Code Lines Added** | \~50 (no new files) | \~80 (no new files) | \~60 (+1 small module) |
234
+ | **New Files** | 0 | 0 | 1 |
235
+ | **Debugging Ease** | Excellent | Good | Good |
236
+ | **Testing Isolation** | Hard | Medium | Easy |
237
+ | **Future Extensibility** | Limited | Medium | Good |
238
+ | **Risk of Breaking Existing** | Very Low | Low | Very Low |
239
+
240
+ ### Recommended Approach: Plan A (Inline WebLLM Integration)
241
+
242
+ Reasoning:
243
+
244
+ * Fastest to ship with minimal risk.
245
+ * Keeps logic local and obvious (one if-statement to route inference).
246
+ * Easy rollback and minimal maintenance burden.
247
+
248
+ Operational advice: implement Plan A first, run the tests and collect diagnostics. If WebLLM proves stable and valuable, refactor to Plan C for better testability and maintenance.
249
+
250
+ All three plans keep to the original requirement: optimistic WebLLM load, Transformers.js fallback, and simple runtime routing at inference time, while avoiding large registries, adapters, or orchestration layers.
plans/2025-08-23-webllm-integration/4-chosen-plan-simple.md ADDED
@@ -0,0 +1,79 @@
1
+ # Chosen Plan: Inline WebLLM Integration — Tight Execution Roadmap
2
+
3
+ ## Reinforcement (start)
4
+
5
+ Keep changes tiny, readable, and modern: concise async/await, arrow functions, optional chaining, nullish coalescing. Do not swallow exceptions — when a catch occurs, act: log a concise diagnostic and take the fallback path or rethrow if necessary.
6
+
7
+ ## Core intent
8
+
9
+ Optimistically use WebLLM when available; when it fails use the existing Transformers.js pipeline. Use a single cache (Promise or resolved engine) as the source of truth. No persistent flags, no timeouts on engine creation, no extra shutdown disposal, minimal logging.
10
+
11
+ ***
12
+
13
+ ## Essentials checklist
14
+
15
+ * Probe the runtime once (advisory only) to avoid pointless attempts on unsupported platforms.
16
+ * On model load, try WebLLM first when probe suggests possible; if it fails, immediately and deterministically fall back to Transformers.js.
17
+ * Cache the in-progress Promise and then the resolved engine object in the same cache used today for Transformers.js pipelines.
18
+ * Decide backend at runtime by inspecting the resolved cached object (duck-typing), not by reading separate per-model flags.
19
+ * Keep logging minimal and actionable (one-line load start/success/fail, inference fail).
20
+ * Do not impose timeouts on engine creation; allow large models to finish loading.
21
+ * Do not add shutdown dispose hooks; worker shutdown will clean up resources.
22
+
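A compact, illustrative sketch of the single-cache rule above (not the final code; the loader arguments are stand-ins). The duck-typed routing itself appears in the `boot-worker.js` change later in this commit.

```JavaScript
// Illustrative only: one cache entry per model, holding first the in-progress
// Promise and then the resolved engine, so concurrent requests dedupe.
async function getEngine(cache, modelName, loadWebLLM, loadTransformers) {
  if (!cache.has(modelName)) {
    const loader = loadWebLLM(modelName).catch((err) => {
      console.log(`WebLLM load failed for ${modelName}: ${err.message}`); // concise diagnostic
      return loadTransformers(modelName); // deterministic fallback
    });
    cache.set(modelName, loader); // concurrent callers reuse the same Promise
    loader
      .then((engine) => cache.set(modelName, engine)) // replace Promise with engine
      .catch(() => cache.delete(modelName)); // allow a retry after total failure
  }
  return cache.get(modelName);
}
```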
23
+ ***
24
+
25
+ ## Steps (order of implementation) with success criteria
26
+
27
+ 1. Add a cheap advisory probe (in-memory)
28
+
29
+ * What: perform a single, lightweight probe at first load attempt to detect presence of WebLLM APIs; cache boolean and last error in-memory.
30
+ * Why: skip obviously impossible attempts on unsupported platforms without preventing valid loads elsewhere.
31
+ * Success: probe returns quickly and avoids repeated futile attempts.
32
+
33
+ 2. Implement WebLLM-first load path into `ModelCache` (single-cache logic)
34
+
35
+ * What: on getModel, store an in-progress Promise into the existing cache; if probe suggests WebLLM is possible, attempt engine creation first (no timeout). If WebLLM creation or a short validation check fails, log a concise diagnostic and proceed to the existing Transformers.js loader. When the Promise resolves, replace it with the engine object in the same cache.
36
+ * Why: ensure concurrent requests dedupe and the cache remains the single source of truth.
37
+ * Success: when WebLLM loads successfully the cached engine is used for inference; when it fails, Transformers.js is used with no UI change.
38
+
39
+ 3. Runtime routing by object shape (duck-typing)
40
+
41
+ * What: at runPrompt, await the cached model, inspect the resolved object for a small, documented signature that identifies WebLLM vs Transformers.js, and dispatch via one conditional.
42
+ * Error handling: if WebLLM inference throws, log a concise diagnostic and attempt Transformers.js for that run where sensible. Do not silently swallow errors.
43
+ * Success: a single conditional routes inference correctly; fallback happens within the same request when possible.
44
+
45
+ 4. Minimal logging
46
+
47
+ * What to log (brief): model load start (backend attempted), model load success, model load failure (one-line error + model id), inference failure (one-line).
48
+ * Why: keep logs actionable and small; avoid verbose progress dumps.
49
+
50
+ 5. No explicit disposal at shutdown
51
+
52
+ * Decision: do not add dispose hooks for worker shutdown; rely on worker termination to clean up. Avoid extra lifecycle complexity.
53
+
54
+ ***
55
+
56
+ ## Developer checklist (compact, ready for PR body)
57
+
58
+ * Add in-memory `probe()` used only at first load attempt; cache result and last error for session.
59
+ * In `ModelCache.getModel`: store in-progress Promise in cache; if probe indicates WebLLM may be present, attempt WebLLM engine creation first (no timeout). On success, cache engine object. On failure, log concise diagnostic and run existing Transformers.js loader.
60
+ * In inference handler (`runPrompt`): await cached model, detect backend by object shape (duck-typing), call with minimal options (max\_new\_tokens, temperature). If WebLLM inference fails, log and attempt Transformers.js fallback for that request.
61
+ * Keep logs concise and developer-focused.
62
+ * Use modern, compact JS idioms and keep code short.
63
+
64
+ ***
65
+
66
+ ## Success criteria (project-level)
67
+
68
+ * WebLLM used when available; Transformers.js fallback always available.
69
+ * No feature flags or persisted per-model success flags introduced.
70
+ * Exceptions are not swallowed; catches produce concise diagnostics and a clear fallback or propagate.
71
+ * Changes are minimal, readable, and easy to revert.
72
+
73
+ ***
74
+
75
+ ## Final reinforcement (end)
76
+
77
+ Start small: one compact change to `ModelCache` and a single conditional in inference. Keep the implementation elegant and short. If an exception is caught, do not hide it — act: log minimal diagnostic and fallback or rethrow.
78
+
src/worker/boot-worker.js CHANGED
@@ -59,19 +59,42 @@ export function bootWorker() {
59
 
60
  async function handleRunPrompt({ prompt, modelName = selectedModel, id, options }) {
61
  try {
62
- const pipe = await modelCache.getModel({ modelName });
63
- // run the pipeline
64
- if (!pipe) throw new Error('pipeline not available');
65
  self.postMessage({ id, type: 'status', status: 'inference-start', model: modelName });
66
- const out = await pipe(prompt, {
67
- max_new_tokens: 250, // Increase from default
68
- temperature: 0.7,
69
- do_sample: true,
70
- pad_token_id: pipe.tokenizer.eos_token_id,
71
- return_full_text: false, // Only return the generated text
72
- ...options
73
- });
74
- const text = extractText(out);
75
  self.postMessage({ id, type: 'status', status: 'inference-done', model: modelName });
76
  self.postMessage({ id, type: 'response', result: text });
77
  } catch (err) {
 
59
 
60
  async function handleRunPrompt({ prompt, modelName = selectedModel, id, options }) {
61
  try {
62
+ const engine = await modelCache.getModel({ modelName });
63
+ if (!engine) throw new Error('engine not available');
64
+
65
  self.postMessage({ id, type: 'status', status: 'inference-start', model: modelName });
66
+
67
+ // Duck-typing to detect engine type and route accordingly
68
+ let text;
69
+ if (/** @type {any} */(engine).chat?.completions?.create) {
70
+ // WebLLM engine detected
71
+ try {
72
+ const webllmEngine = /** @type {any} */(engine);
73
+ const response = await webllmEngine.chat.completions.create({
74
+ messages: [{ role: "user", content: prompt }],
75
+ max_tokens: options?.max_new_tokens ?? 250,
76
+ temperature: options?.temperature ?? 0.7
77
+ });
78
+ text = response.choices[0]?.message?.content ?? '';
79
+ } catch (err) {
80
+ console.log(`WebLLM inference failed for ${modelName}: ${err.message}`);
81
+ throw err; // Re-throw since we can't easily fallback mid-inference
82
+ }
83
+ } else if (typeof engine === 'function') {
84
+ // Transformers.js pipeline detected
85
+ const out = await engine(prompt, {
86
+ max_new_tokens: 250,
87
+ temperature: 0.7,
88
+ do_sample: true,
89
+ pad_token_id: engine.tokenizer?.eos_token_id,
90
+ return_full_text: false,
91
+ ...options
92
+ });
93
+ text = extractText(out);
94
+ } else {
95
+ throw new Error('Unknown engine type');
96
+ }
97
+
98
  self.postMessage({ id, type: 'status', status: 'inference-done', model: modelName });
99
  self.postMessage({ id, type: 'response', result: text });
100
  } catch (err) {
src/worker/curated-model-list.json CHANGED
@@ -1,4 +1,30 @@
1
  [
2
  {
3
  "id": "Xenova/llama2.c-stories15M",
4
  "name": "Llama2.c Stories 15M",
 
1
  [
2
+ {
3
+ "id": "mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC",
4
+ "name": "Llama 3.1 8B Instruct q4f32",
5
+ "model_type": "llama",
6
+ "architectures": ["llama"],
7
+ "classification": "gen",
8
+ "confidence": "high",
9
+ "size_hint": "58Mb",
10
+ "fetchStatus": "ok",
11
+ "hasTokenizer": true,
12
+ "hasOnnxModel": true,
13
+ "isTransformersJsReady": true
14
+ },
15
+ {
16
+ "id": "mlc-ai/gemma-2-9b-it-q4f16_1-MLC",
17
+ "name": "Gemma 2 9B IT q4f16",
18
+ "model_type": "gemma",
19
+ "architectures": ["gemma"],
20
+ "classification": "gen",
21
+ "confidence": "high",
22
+ "size_hint": "5Gb",
23
+ "fetchStatus": "ok",
24
+ "hasTokenizer": true,
25
+ "hasOnnxModel": true,
26
+ "isTransformersJsReady": true
27
+ },
28
  {
29
  "id": "Xenova/llama2.c-stories15M",
30
  "name": "Llama2.c Stories 15M",
src/worker/model-cache.js CHANGED
@@ -1,6 +1,7 @@
1
  // @ts-check
2
 
3
  import { pipeline, env } from '@huggingface/transformers';
 
4
 
5
  import { loadModelCore } from './load-model-core';
6
 
@@ -8,6 +9,8 @@ export class ModelCache {
8
  cache = new Map();
9
  /** @type {import('@huggingface/transformers').DeviceType | undefined} */
10
  backend = undefined;
11
 
12
  env = env;
13
 
@@ -32,6 +35,24 @@ export class ModelCache {
32
  return this.cache.get(modelName) || this._loadModelAndStore({ modelName });
33
  }
34
 
35
  /**
36
  * @param {{
37
  * modelName: string
@@ -40,38 +61,7 @@ export class ModelCache {
40
  _loadModelAndStore({ modelName }) {
41
  if (!this.backend) this.backend = detectTransformersBackend();
42
  // Create a loader promise that will try multiple backends in order.
43
- const loader = (async () => {
44
- // candidate order: detected backend first, then common fallbacks
45
- let candidates = ['webgpu', 'gpu', 'wasm'];
46
- // candidates = ['gpu', 'wasm'];
47
- candidates = candidates.slice(candidates.indexOf(this.backend || 'wasm'));
48
- candidates = ['auto'];// , 'wasm'];
49
-
50
- let errs = [];
51
- console.log('Trying candidates ', candidates);
52
- for (const device of candidates) {
53
- try {
54
- const model = await loadModelCore({
55
- modelName,
56
- device: /** @type {import('@huggingface/transformers').DeviceType} */ (device)
57
- });
58
- // on success, update backend to the working device and store model
59
- this.backend = /** @type {import('@huggingface/transformers').DeviceType} */ (device);
60
- this.cache.set(modelName, model);
61
- return model;
62
- } catch (err) {
63
- console.log('Failed ', device, ' ', err);
64
- errs.push(device + ': ' + err.stack);
65
- // continue to next candidate
66
- }
67
- }
68
-
69
- // none succeeded
70
- const err = new Error(
71
- 'Backends failed: ' + JSON.stringify(candidates) + ', errors:\n\n' +
72
- errs.join('\n\n'));
73
- throw err;
74
- })();
75
 
76
  // store the in-progress promise so concurrent requests reuse it
77
  this.cache.set(modelName, loader);
@@ -88,6 +78,81 @@ export class ModelCache {
88
  return loader;
89
  }
90
 
91
  }
92
 
93
  export function detectTransformersBackend() {
 
1
  // @ts-check
2
 
3
  import { pipeline, env } from '@huggingface/transformers';
4
+ import * as webllm from '@mlc-ai/web-llm';
5
 
6
  import { loadModelCore } from './load-model-core';
7
 
 
9
  cache = new Map();
10
  /** @type {import('@huggingface/transformers').DeviceType | undefined} */
11
  backend = undefined;
12
+ /** @type {{ possible: boolean, lastError?: string } | undefined} */
13
+ webllmProbe = undefined;
14
 
15
  env = env;
16
 
 
35
  return this.cache.get(modelName) || this._loadModelAndStore({ modelName });
36
  }
37
 
38
+ /**
39
+ * Lightweight probe to detect WebLLM API availability (advisory only)
40
+ */
41
+ probeWebLLM() {
42
+ if (this.webllmProbe) return this.webllmProbe;
43
+
44
+ try {
45
+ // Check if basic WebLLM APIs are available
46
+ const hasWebLLM = typeof webllm?.CreateMLCEngine === 'function' &&
47
+ typeof webllm?.prebuiltAppConfig !== 'undefined';
48
+ this.webllmProbe = { possible: hasWebLLM };
49
+ } catch (err) {
50
+ this.webllmProbe = { possible: false, lastError: String(err) };
51
+ }
52
+
53
+ return this.webllmProbe;
54
+ }
55
+
56
  /**
57
  * @param {{
58
  * modelName: string
 
61
  _loadModelAndStore({ modelName }) {
62
  if (!this.backend) this.backend = detectTransformersBackend();
63
  // Create a loader promise that will try multiple backends in order.
64
+ const loader = this._loadWebLLMOrFallbackToTransformersModelNow({ modelName });
65
 
66
  // store the in-progress promise so concurrent requests reuse it
67
  this.cache.set(modelName, loader);
 
78
  return loader;
79
  }
80
 
81
+ async _loadWebLLMOrFallbackToTransformersModelNow({ modelName }) {
82
+ const probe = this.probeWebLLM();
83
+
84
+ // Try WebLLM first if probe suggests it's possible
85
+ if (probe.possible) {
86
+ try {
87
+ const webLLMId = modelName.split('/').pop() || modelName;
88
+ console.log(`Loading ${webLLMId} via WebLLM...`);
89
+ const engine = await webllm.CreateMLCEngine(webLLMId, {
90
+ appConfig: webllm.prebuiltAppConfig
91
+ });
92
+
93
+ // Quick end-to-end validation: run a very small prompt to ensure the
94
+ // engine responds correctly before caching it. If this fails we
95
+ // throw so the outer catch falls back to Transformers.js.
96
+ try {
97
+ const webllmEngine = engine;
98
+ const testResp = await webllmEngine.chat.completions.create({
99
+ messages: [{ role: 'user', content: 'Hello' }],
100
+ max_tokens: 8,
101
+ temperature: 0.2
102
+ });
103
+ const testText = testResp?.choices?.[0]?.message?.content ?? '';
104
+ if (!testText || String(testText).trim() === '') {
105
+ throw new Error('WebLLM test prompt returned empty response');
106
+ }
107
+ } catch (e) {
108
+ throw new Error('WebLLM validation failed: ' + String(e));
109
+ }
110
+
111
+ console.log(`WebLLM loaded: ${webLLMId}`);
112
+ return engine;
113
+ } catch (err) {
114
+ console.log(`WebLLM failed for ${modelName}: ${err.message}`);
115
+ // Fall through to Transformers.js
116
+ }
117
+ }
118
+
119
+ // Fallback to Transformers.js
120
+ return this._loadTransformersModelNow({ modelName });
121
+ }
122
+
123
+ async _loadTransformersModelNow({ modelName }) {
124
+ // candidate order: detected backend first, then common fallbacks
125
+ let candidates = ['webgpu', 'gpu', 'wasm'];
126
+ // candidates = ['gpu', 'wasm'];
127
+ candidates = candidates.slice(candidates.indexOf(this.backend || 'wasm'));
128
+ candidates = ['auto'];// , 'wasm'];
129
+
130
+ let errs = [];
131
+ console.log('Trying candidates ', candidates);
132
+ for (const device of candidates) {
133
+ try {
134
+ const model = await loadModelCore({
135
+ modelName,
136
+ device: /** @type {import('@huggingface/transformers').DeviceType} */ (device)
137
+ });
138
+ // on success, update backend to the working device and store model
139
+ this.backend = /** @type {import('@huggingface/transformers').DeviceType} */ (device);
140
+ this.cache.set(modelName, model);
141
+ return model;
142
+ } catch (err) {
143
+ console.log('Failed ', device, ' ', err);
144
+ errs.push(device + ': ' + err.stack);
145
+ // continue to next candidate
146
+ }
147
+ }
148
+
149
+ // none succeeded
150
+ const err = new Error(
151
+ 'Backends failed: ' + JSON.stringify(candidates) + ', errors:\n\n' +
152
+ errs.join('\n\n'));
153
+ throw err;
154
+ }
155
+
156
  }
157
 
158
  export function detectTransformersBackend() {