From b26f9d5ee3d9ed250c9f3b927cdd69a65cce3a2d Mon Sep 17 00:00:00 2001 From: Jhin Lee Date: Sun, 31 May 2026 10:59:56 -0400 Subject: [PATCH 1/3] Expose LiteRT-LM speculative decoding --- CHANGELOG.md | 9 +++++ README.md | 4 +- .../chat_app/lib/litert_lm_benchmark_app.dart | 40 +++++++++++++++---- .../litert_lm/litert_lm_backend_web.dart | 3 ++ .../backends/litert_lm/litert_lm_service.dart | 22 ++++++++-- .../backends/llama_cpp/llama_cpp_service.dart | 8 ++++ lib/src/backends/webgpu/webgpu_backend.dart | 6 +++ .../models/inference/generation_params.dart | 10 +++++ .../litert_lm/litert_lm_backend_web_test.dart | 33 +++++++++++++++ .../litert_lm/litert_lm_service_test.dart | 32 +++++++++++++++ .../llama_cpp/llama_cpp_service_test.dart | 20 ++++++++++ .../backends/webgpu/webgpu_backend_test.dart | 17 ++++++++ .../inference/generation_params_test.dart | 3 ++ tool/macos_fair_litert_vs_llamadart.sh | 3 +- .../docs/configuration/runtime-parameters.md | 4 ++ website/docs/guides/backend-benchmarks.md | 32 +++++++++++++++ website/docs/guides/backend-selection.md | 5 +++ website/docs/guides/performance-tuning.md | 14 +++++++ website/docs/platforms/support-matrix.md | 7 +++- 19 files changed, 256 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb33540e..7905c14e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ ## Unreleased +* **LiteRT-LM speculative decoding opt-in**: + * Added `GenerationParams.speculativeDecoding` and wired it through the + native LiteRT-LM backend to + `litert_lm_engine_settings_set_enable_speculative_decoding`. The + `LlamaEngine` default remains disabled for stable/parity behavior; + llama.cpp, WebGPU, and LiteRT-LM web reject the option until their + speculative paths are implemented. + * Updated the LiteRT-LM benchmark app so its speculative toggle now affects + native LiteRT-LM generation and is recorded in per-run/final metrics. 
* **LiteRT-LM Gemma 4 function calling + thinking fix**: * Fixed Gemma 4 `.litertlm` models not calling tools and producing unreliable thinking. The backend supplied a hand-written stub chat template that diff --git a/README.md b/README.md index b30dad52..e996157f 100644 --- a/README.md +++ b/README.md @@ -356,7 +356,9 @@ thread counts, LoRA load configs, and rope overrides are rejected instead of being silently ignored. `.litertlm` generation honors `GenerationParams` `maxTokens`, `temp`, `topK`, `topP`, and `seed` on native and web, with `stopSequences` enforced by llamadart. Native LiteRT-LM also honors stream -batching thresholds. llama.cpp-only sampling and constrained-decoding controls +batching thresholds and the opt-in `speculativeDecoding` flag; Web LiteRT-LM +rejects speculative decoding until the browser runtime exposes an equivalent +control. llama.cpp-only sampling and constrained-decoding controls such as Min-P, repeat penalty overrides, grammar/lazy grammar triggers, preserved tokens, custom grammar roots, and web stream batching thresholds are rejected until LiteRT-LM exposes equivalent runtime controls. diff --git a/example/chat_app/lib/litert_lm_benchmark_app.dart b/example/chat_app/lib/litert_lm_benchmark_app.dart index 7ac24631..800198dc 100644 --- a/example/chat_app/lib/litert_lm_benchmark_app.dart +++ b/example/chat_app/lib/litert_lm_benchmark_app.dart @@ -236,7 +236,7 @@ class _LiteRtLmBenchmarkAppState extends State { _append('Initializing LiteRT-LM:'); _append(' model: $modelPath'); _append(' backend: $_backend'); - _append(' speculative: ignored by backend API'); + _append(' speculative: $_speculative'); if (_cacheDir.isNotEmpty) { await Directory(_cacheDir).create(recursive: true); _append(' cache override ignored by backend API: $_cacheDir'); @@ -247,11 +247,8 @@ class _LiteRtLmBenchmarkAppState extends State { modelPath, modelParams: ModelParams( contextSize: _maxTokens, - preferredBackend: _backend == 'cpu' - ? 
GpuBackend.cpu - : Platform.isMacOS - ? GpuBackend.metal - : GpuBackend.vulkan, + preferredBackend: _preferredGpuBackendForLiteRt(_backend), + liteRtLmBackend: _liteRtLmBackendPreference(_backend), ), ); loadSw.stop(); @@ -263,7 +260,11 @@ class _LiteRtLmBenchmarkAppState extends State { await engine .generate( _promptController.text, - params: GenerationParams(maxTokens: _outputTokens, seed: 1), + params: GenerationParams( + maxTokens: _outputTokens, + seed: 1, + speculativeDecoding: _speculative, + ), ) .drain(); } @@ -277,7 +278,11 @@ class _LiteRtLmBenchmarkAppState extends State { final sw = Stopwatch()..start(); await for (final chunk in engine.generate( _promptController.text, - params: GenerationParams(maxTokens: _outputTokens, seed: 1), + params: GenerationParams( + maxTokens: _outputTokens, + seed: 1, + speculativeDecoding: _speculative, + ), )) { buffer.write(chunk); } @@ -288,6 +293,7 @@ class _LiteRtLmBenchmarkAppState extends State { final runMetrics = { 'index': i, 'wallMilliseconds': wallMs, + 'speculativeDecoding': _speculative, 'promptEvalTokens': perf?.promptEvalTokens, 'evalTokens': perf?.evalTokens, 'hitEosBeforeTarget': perf == null @@ -319,6 +325,7 @@ class _LiteRtLmBenchmarkAppState extends State { 'wallMilliseconds': wallMs, 'backendName': await engine.getBackendName(), 'targetDecodeTokens': _outputTokens, + 'speculativeDecoding': _speculative, 'backendInitMilliseconds': perf?.loadMs, 'promptEvalTokens': perf?.promptEvalTokens, 'evalTokens': perf?.evalTokens, @@ -356,6 +363,23 @@ class _LiteRtLmBenchmarkAppState extends State { } } + GpuBackend _preferredGpuBackendForLiteRt(String backend) { + return switch (backend) { + 'cpu' => GpuBackend.cpu, + 'gpu' => Platform.isMacOS ? 
GpuBackend.metal : GpuBackend.vulkan, + _ => GpuBackend.auto, + }; + } + + LiteRtLmBackendPreference _liteRtLmBackendPreference(String backend) { + return switch (backend) { + 'cpu' => LiteRtLmBackendPreference.cpu, + 'gpu' => LiteRtLmBackendPreference.gpu, + 'npu' => LiteRtLmBackendPreference.npu, + _ => LiteRtLmBackendPreference.auto, + }; + } + Future _runLlamaDartBenchmark(String modelPath) async { final engine = LlamaEngine(LlamaBackend()); try { diff --git a/lib/src/backends/litert_lm/litert_lm_backend_web.dart b/lib/src/backends/litert_lm/litert_lm_backend_web.dart index 7b1f47ad..cb82b2f6 100644 --- a/lib/src/backends/litert_lm/litert_lm_backend_web.dart +++ b/lib/src/backends/litert_lm/litert_lm_backend_web.dart @@ -945,6 +945,9 @@ class LiteRtLmBackend if (params.grammarRoot != defaults.grammarRoot) { unsupported.add('grammarRoot'); } + if (params.speculativeDecoding) { + unsupported.add('speculativeDecoding'); + } if (params.streamBatchTokenThreshold != defaults.streamBatchTokenThreshold) { unsupported.add('streamBatchTokenThreshold'); diff --git a/lib/src/backends/litert_lm/litert_lm_service.dart b/lib/src/backends/litert_lm/litert_lm_service.dart index 8dfa9562..e7aa34d6 100644 --- a/lib/src/backends/litert_lm/litert_lm_service.dart +++ b/lib/src/backends/litert_lm/litert_lm_service.dart @@ -34,6 +34,7 @@ class LiteRtLmService { String? _modelPath; String? _activeBackend; int? _activeOutputTokens; + bool? _activeSpeculativeDecoding; int _nextModelHandle = 1; int _nextContextHandle = 1; int? 
_modelHandle; @@ -77,6 +78,7 @@ class LiteRtLmService { _modelParams = params; _activeBackend = resolvedBackend; _activeOutputTokens = null; + _activeSpeculativeDecoding = null; _modelHandle = _nextModelHandle++; _contextHandle = null; _lastMetrics = null; @@ -95,6 +97,7 @@ class LiteRtLmService { _modelParams = null; _activeBackend = null; _activeOutputTokens = null; + _activeSpeculativeDecoding = null; _modelHandle = null; _contextHandle = null; _lastMetrics = null; @@ -398,6 +401,7 @@ class LiteRtLmService { _client?.dispose(); _client = null; _activeOutputTokens = null; + _activeSpeculativeDecoding = null; _lastMetrics = null; _cancelRequested = false; } @@ -405,11 +409,15 @@ class LiteRtLmService { Future _ensureClientForGeneration( GenerationParams params, ) { - return _ensureClientForRuntime(outputTokens: params.maxTokens); + return _ensureClientForRuntime( + outputTokens: params.maxTokens, + speculativeDecoding: params.speculativeDecoding, + ); } Future _ensureClientForRuntime({ int? outputTokens, + bool? speculativeDecoding, }) async { final modelPath = _modelPath; final modelParams = _modelParams; @@ -419,10 +427,14 @@ class LiteRtLmService { final resolvedOutputTokens = outputTokens ?? _activeOutputTokens ?? GenerationParams().maxTokens; + final resolvedSpeculativeDecoding = + speculativeDecoding ?? _activeSpeculativeDecoding ?? false; final backend = _activeBackend ?? 
_backendNameFor(modelParams); final existing = _client; if (existing != null && (outputTokens == null || _activeOutputTokens == resolvedOutputTokens) && + (speculativeDecoding == null || + _activeSpeculativeDecoding == resolvedSpeculativeDecoding) && _activeBackend == backend) { return existing; } @@ -430,6 +442,7 @@ class LiteRtLmService { existing?.dispose(); _client = null; _activeOutputTokens = null; + _activeSpeculativeDecoding = null; final client = _clientFactory(); final responseThinkingTags = _responseThinkingTagsForModel(modelPath); client.configureResponseThinkingTags( @@ -443,7 +456,7 @@ class LiteRtLmService { maxTokens: modelParams.contextSize, outputTokens: resolvedOutputTokens, cacheDir: _defaultCacheDir(), - speculativeDecoding: false, + speculativeDecoding: resolvedSpeculativeDecoding, minLogLevel: _liteRtLmMinLogLevel(_logLevel), ); } catch (_) { @@ -456,6 +469,7 @@ class LiteRtLmService { } _client = client; _activeOutputTokens = resolvedOutputTokens; + _activeSpeculativeDecoding = resolvedSpeculativeDecoding; _activeBackend = backend; return client; } @@ -714,8 +728,8 @@ class LiteRtLmService { throw UnsupportedError( 'LiteRtLmBackend does not support llama.cpp-specific GenerationParams: ' '${unsupported.join(', ')}. Supported LiteRT-LM generation options are ' - 'maxTokens, temp, topK, topP, seed, stopSequences, and native stream ' - 'batching thresholds.', + 'maxTokens, temp, topK, topP, seed, stopSequences, ' + 'speculativeDecoding, and native stream batching thresholds.', ); } diff --git a/lib/src/backends/llama_cpp/llama_cpp_service.dart b/lib/src/backends/llama_cpp/llama_cpp_service.dart index 63e95af1..57dc9bd7 100644 --- a/lib/src/backends/llama_cpp/llama_cpp_service.dart +++ b/lib/src/backends/llama_cpp/llama_cpp_service.dart @@ -2711,6 +2711,14 @@ class LlamaCppService { int cancelTokenAddress, { List? 
parts, }) async* { + if (params.speculativeDecoding) { + throw UnsupportedError( + 'llama.cpp speculative decoding is not exposed by llamadart yet. ' + 'Use the LiteRT-LM native backend or track llama.cpp support in ' + 'issues #168/#190.', + ); + } + var ctx = _contexts[contextHandle]; if (ctx == null) throw Exception("Invalid context handle"); _generatingContexts.update( diff --git a/lib/src/backends/webgpu/webgpu_backend.dart b/lib/src/backends/webgpu/webgpu_backend.dart index 4fb173c3..4f437524 100644 --- a/lib/src/backends/webgpu/webgpu_backend.dart +++ b/lib/src/backends/webgpu/webgpu_backend.dart @@ -1649,6 +1649,12 @@ class WebGpuLlamaBackend GenerationParams params, { List? parts, }) { + if (params.speculativeDecoding) { + throw UnsupportedError( + 'WebGPU speculative decoding is not supported yet.', + ); + } + final mediaParts = _buildMultimodalParts(parts); if (mediaParts != null && !_mmContextActive) { throw StateError( diff --git a/lib/src/core/models/inference/generation_params.dart b/lib/src/core/models/inference/generation_params.dart index ef9ae17e..908be2b3 100644 --- a/lib/src/core/models/inference/generation_params.dart +++ b/lib/src/core/models/inference/generation_params.dart @@ -90,6 +90,13 @@ class GenerationParams { /// Grammar start symbol. Defaults to "root". final String grammarRoot; + /// Enables backend-native speculative decoding when supported. + /// + /// Native LiteRT-LM currently honors this flag by forwarding it to the + /// runtime's speculative decoding setting. llama.cpp, WebGPU, and LiteRT-LM + /// web reject this option until their speculative paths are implemented. + final bool speculativeDecoding; + /// Reuses matching prompt prefixes from previous requests in the same native /// context to reduce prompt ingestion latency. 
/// @@ -125,6 +132,7 @@ class GenerationParams { this.grammarTriggers = const [], this.preservedTokens = const [], this.grammarRoot = 'root', + this.speculativeDecoding = false, this.reusePromptPrefix = defaultReusePromptPrefix, this.streamBatchTokenThreshold = defaultStreamBatchTokenThreshold, this.streamBatchByteThreshold = defaultStreamBatchByteThreshold, @@ -145,6 +153,7 @@ class GenerationParams { List? grammarTriggers, List? preservedTokens, String? grammarRoot, + bool? speculativeDecoding, bool? reusePromptPrefix, int? streamBatchTokenThreshold, int? streamBatchByteThreshold, @@ -163,6 +172,7 @@ class GenerationParams { grammarTriggers: grammarTriggers ?? this.grammarTriggers, preservedTokens: preservedTokens ?? this.preservedTokens, grammarRoot: grammarRoot ?? this.grammarRoot, + speculativeDecoding: speculativeDecoding ?? this.speculativeDecoding, reusePromptPrefix: reusePromptPrefix ?? this.reusePromptPrefix, streamBatchTokenThreshold: streamBatchTokenThreshold ?? this.streamBatchTokenThreshold, diff --git a/test/unit/backends/litert_lm/litert_lm_backend_web_test.dart b/test/unit/backends/litert_lm/litert_lm_backend_web_test.dart index 64b3b9cf..21fa477d 100644 --- a/test/unit/backends/litert_lm/litert_lm_backend_web_test.dart +++ b/test/unit/backends/litert_lm/litert_lm_backend_web_test.dart @@ -262,6 +262,39 @@ void main() { } }); + test('rejects speculative decoding on LiteRT-LM web', () async { + _installFakeEngine(chunks: []); + + final backend = LiteRtLmBackend(); + try { + final modelHandle = await backend.modelLoadFromUrl( + 'https://example.com/model.litertlm', + const ModelParams(), + ); + final contextHandle = await backend.contextCreate( + modelHandle, + const ModelParams(), + ); + + await expectLater( + backend.generate( + contextHandle, + 'hello', + const GenerationParams(speculativeDecoding: true), + ), + emitsError( + isA().having( + (error) => error.message.toString(), + 'message', + contains('speculativeDecoding'), + ), + ), + ); + } 
finally { + await backend.dispose(); + } + }); + test('rejects unsupported context-time model params', () async { _installFakeEngine(chunks: []); diff --git a/test/unit/backends/litert_lm/litert_lm_service_test.dart b/test/unit/backends/litert_lm/litert_lm_service_test.dart index 953ffe42..ff456e2b 100644 --- a/test/unit/backends/litert_lm/litert_lm_service_test.dart +++ b/test/unit/backends/litert_lm/litert_lm_service_test.dart @@ -834,6 +834,36 @@ void main() { } }); + test('keeps LiteRT-LM speculative decoding disabled by default', () async { + final fakeClient = _FakeLiteRtLmRuntimeClient(); + final service = LiteRtLmService(clientFactory: () => fakeClient); + + try { + final modelHandle = await service.loadModel( + modelFile.path, + const ModelParams(preferredBackend: GpuBackend.cpu), + ); + final contextHandle = service.createContext( + modelHandle, + const ModelParams(preferredBackend: GpuBackend.cpu), + ); + + final subscription = service + .generate( + contextHandle, + 'hello', + const GenerationParams(maxTokens: 7), + ) + .listen((_) {}); + + await fakeClient.generateStarted.future; + expect(fakeClient.lastSpeculativeDecoding, isFalse); + unawaited(subscription.cancel()); + } finally { + service.dispose(); + } + }); + test('passes supported LiteRT-LM generation options to the client', () async { final fakeClient = _FakeLiteRtLmRuntimeClient(); final service = LiteRtLmService(clientFactory: () => fakeClient); @@ -860,6 +890,7 @@ void main() { topP: 0.4, seed: 123, stopSequences: ['STOP'], + speculativeDecoding: true, ), ) .listen(chunks.add); @@ -877,6 +908,7 @@ void main() { expect(fakeClient.lastTopK, 5); expect(fakeClient.lastTopP, 0.4); expect(fakeClient.lastSeed, 123); + expect(fakeClient.lastSpeculativeDecoding, isTrue); expect(fakeClient.lastNpuBackend, isFalse); expect(utf8.decode(chunks.expand((chunk) => chunk).toList()), 'alpha '); expect(fakeClient.cancelCount, 1); diff --git a/test/unit/backends/llama_cpp/llama_cpp_service_test.dart 
b/test/unit/backends/llama_cpp/llama_cpp_service_test.dart index e92ecb27..2c76b68e 100644 --- a/test/unit/backends/llama_cpp/llama_cpp_service_test.dart +++ b/test/unit/backends/llama_cpp/llama_cpp_service_test.dart @@ -101,6 +101,26 @@ void main() { ); }); + test('generate rejects speculative decoding', () async { + expect( + service + .generate( + -1, + 'hello', + const GenerationParams(speculativeDecoding: true), + 0, + ) + .drain(), + throwsA( + isA<UnsupportedError>().having( + (error) => error.message.toString(), + 'message', + contains('speculative decoding'), + ), + ), + ); + }); + test('embed and embedBatch throw for unknown context handle', () { expect(() => service.embed(-1, 'hello'), throwsA(isA())); expect( diff --git a/test/unit/backends/webgpu/webgpu_backend_test.dart b/test/unit/backends/webgpu/webgpu_backend_test.dart index 658c57f6..27c74e87 100644 --- a/test/unit/backends/webgpu/webgpu_backend_test.dart +++ b/test/unit/backends/webgpu/webgpu_backend_test.dart @@ -785,6 +785,23 @@ void main() { expect(lastTokenEventFlushChars, 48); }); + test('rejects speculative decoding', () { + expect( + () => backend.generate( + 1, + 'Hello', + const GenerationParams(speculativeDecoding: true), + ), + throwsA( + isA<UnsupportedError>().having( + (error) => error.message.toString(), + 'message', + contains('speculative decoding'), + ), + ), + ); + }); + test( 'canceling generation subscription aborts active bridge completion', () async { diff --git a/test/unit/core/models/inference/generation_params_test.dart b/test/unit/core/models/inference/generation_params_test.dart index 90ac41dc..53a6fbdd 100644 --- a/test/unit/core/models/inference/generation_params_test.dart +++ b/test/unit/core/models/inference/generation_params_test.dart @@ -9,6 +9,7 @@ void main() { minP: 0.05, grammarRoot: 'main', grammarLazy: true, + speculativeDecoding: true, reusePromptPrefix: false, streamBatchTokenThreshold: 4, streamBatchByteThreshold: 256, @@ -24,6 +25,7 @@ void main() { expect(updated.minP, 0.05); 
expect(updated.grammarRoot, 'main'); expect(updated.grammarLazy, isTrue); + expect(updated.speculativeDecoding, isTrue); expect(updated.reusePromptPrefix, isFalse); expect(updated.streamBatchTokenThreshold, 4); expect(updated.streamBatchByteThreshold, 256); @@ -35,6 +37,7 @@ void main() { const params = GenerationParams(); expect(params.minP, 0.0); + expect(params.speculativeDecoding, isFalse); }); test('GenerationParams defaults stream batching thresholds', () { diff --git a/tool/macos_fair_litert_vs_llamadart.sh b/tool/macos_fair_litert_vs_llamadart.sh index df4ba094..185a295a 100755 --- a/tool/macos_fair_litert_vs_llamadart.sh +++ b/tool/macos_fair_litert_vs_llamadart.sh @@ -12,6 +12,7 @@ if [[ ! -f "$DEFAULT_LLAMADART_MODEL" && -f "$SIBLING_LLAMADART_MODEL" ]]; then fi LLAMADART_MODEL="${LLAMADART_MODEL:-$DEFAULT_LLAMADART_MODEL}" DECODE_TOKENS="${DECODE_TOKENS:-256}" +SPECULATIVE="${SPECULATIVE:-false}" PROMPT="${PROMPT:-Write a detailed practical guide for product engineers who want to use on-device language models in mobile and desktop apps. Cover privacy, latency, offline behavior, personalization, battery tradeoffs, model format choices, benchmarking methodology, rollout strategy, and failure modes. 
Use clear paragraphs and continue until the answer is complete.}" APP="$CHAT_APP_DIR/build/macos/Build/Products/Debug/llamadart_chat_example.app" @@ -48,7 +49,7 @@ echo "== LiteRT-LM Metal ==" --dart-define="LITERT_LM_MODEL=$MODEL_IN_APP" \ --dart-define=LLAMADART_MODEL= \ --dart-define=LITERT_LM_BACKEND=gpu \ - --dart-define=LITERT_LM_SPECULATIVE=false \ + --dart-define="LITERT_LM_SPECULATIVE=$SPECULATIVE" \ --dart-define=LITERT_LM_RUNS=3 \ --dart-define=LITERT_LM_WARMUPS=1 \ --dart-define="LITERT_LM_OUTPUT_TOKENS=$DECODE_TOKENS" \ diff --git a/website/docs/configuration/runtime-parameters.md b/website/docs/configuration/runtime-parameters.md index c5d8a6e6..2454a2c6 100644 --- a/website/docs/configuration/runtime-parameters.md +++ b/website/docs/configuration/runtime-parameters.md @@ -85,6 +85,7 @@ const params = GenerationParams( minP: 0.0, penalty: 1.1, stopSequences: [''], + speculativeDecoding: false, ); ``` @@ -94,6 +95,9 @@ Important fields: - `temp`: randomness. - `topK`, `topP`, `minP`: token filtering controls. - `penalty`: repeat penalty. +- `speculativeDecoding`: opt-in backend-native speculative decoding. Native + LiteRT-LM honors this flag; llama.cpp, WebGPU, and LiteRT-LM web reject it + until their speculative paths are implemented. - `seed`: deterministic replay when set. - `grammar`: constrained decoding with GBNF. diff --git a/website/docs/guides/backend-benchmarks.md b/website/docs/guides/backend-benchmarks.md index 49cad341..c0286207 100644 --- a/website/docs/guides/backend-benchmarks.md +++ b/website/docs/guides/backend-benchmarks.md @@ -49,6 +49,26 @@ Thermal status was 0 before the benchmark and 1 after the run, so the Android numbers should be treated as practical app-level numbers rather than a cooled lab baseline. +### Speculative decoding check + +After `GenerationParams.speculativeDecoding` was exposed for native LiteRT-LM, +the Gemma 4 E2B `.litertlm` path was rerun with the flag off and on. 
The flag +remains off by default because the measured result was slower for this model on +both devices. + +| Device / target | Runtime path | `speculativeDecoding` | Median wall tok/s | Median decode tok/s | Result | +| --- | --- | ---: | ---: | ---: | --- | +| Pixel 9 Pro, Android 16 | LiteRT-LM GPU | `false` | 15.50 | 15.70 | baseline | +| Pixel 9 Pro, Android 16 | LiteRT-LM GPU | `true` | 9.06 | 9.13 | about 42% slower | +| Mac, Apple M4 Max, macOS 26.5 | LiteRT-LM Metal | `false` | 135.02 | 136.70 | baseline | +| Mac, Apple M4 Max, macOS 26.5 | LiteRT-LM Metal | `true` | 118.96 | 120.25 | about 12% slower | + +The Pixel NPU path was also attempted for `gemma-4-E2B-it.litertlm`, but native +LiteRT-LM failed engine creation for backend `npu` on this device/model bundle +and reported that the Android NPU delegate may not support the device, OS, +model, or bundle. Use GPU or CPU for this artifact unless a newer LiteRT-LM +bundle/runtime combination validates NPU support. + ## Interpretation On Pixel 9 Pro, LiteRT-LM GPU was about 9x faster than llama.cpp Vulkan for this @@ -71,6 +91,10 @@ macOS: ```bash DECODE_TOKENS=256 tool/macos_fair_litert_vs_llamadart.sh + +# Native LiteRT-LM speculative decoding off/on check +SPECULATIVE=false DECODE_TOKENS=256 tool/macos_fair_litert_vs_llamadart.sh +SPECULATIVE=true DECODE_TOKENS=256 tool/macos_fair_litert_vs_llamadart.sh ``` Web: @@ -99,6 +123,14 @@ WARMUPS=1 \ RUNS=3 \ TARGETS=litert_lm,llamadart \ tool/litert_lm_pixel_benchmark.sh + +# Native LiteRT-LM GPU speculative decoding off/on check +DEVICE="$DEVICE" ADB="$ADB" TARGETS=litert_lm BACKEND=gpu \ + SPECULATIVE=false OUTPUT_TOKENS=256 WARMUPS=1 RUNS=3 \ + tool/litert_lm_pixel_benchmark.sh +DEVICE="$DEVICE" ADB="$ADB" TARGETS=litert_lm BACKEND=gpu \ + SPECULATIVE=true OUTPUT_TOKENS=256 WARMUPS=1 RUNS=3 \ + tool/litert_lm_pixel_benchmark.sh ``` For web GGUF experiments, use `TARGETS=llamadart`. 
If serving local large GGUF diff --git a/website/docs/guides/backend-selection.md b/website/docs/guides/backend-selection.md index bbf7194e..b4e6fbe3 100644 --- a/website/docs/guides/backend-selection.md +++ b/website/docs/guides/backend-selection.md @@ -127,6 +127,7 @@ For `.litertlm` / LiteRT-LM, use: - `contextSize` - `chatTemplate` - `GenerationParams.maxTokens`, `temp`, `topK`, `topP`, and `seed` +- `GenerationParams.speculativeDecoding` on native LiteRT-LM only - `stopSequences`, enforced by `llamadart` `llamadart` rejects unsupported backend-specific options for `.litertlm` loads @@ -170,6 +171,10 @@ kernel benchmark. generated through the chat app. LiteRT-LM was about 2x faster on the measured web decode counter and loaded much faster, while GGUF kept the broader llama.cpp feature surface. +- Treat `GenerationParams.speculativeDecoding` as a per-model/per-device tuning + knob, not a guaranteed speedup. For the measured Gemma 4 E2B LiteRT-LM runs, + speculative decoding was slower on Pixel 9 Pro GPU and Apple M4 Max Metal, so + the `LlamaEngine` default remains off. - On Android, benchmark LiteRT-LM `gpu` and `npu` separately when the model and device support them. NPU is not a general replacement for GGUF/Vulkan; it is a LiteRT-LM deployment path. diff --git a/website/docs/guides/performance-tuning.md b/website/docs/guides/performance-tuning.md index 91aa0f47..7cd6526c 100644 --- a/website/docs/guides/performance-tuning.md +++ b/website/docs/guides/performance-tuning.md @@ -110,6 +110,9 @@ Guidelines: `streamBatchTokenThreshold` and `streamBatchByteThreshold`. - Lower stream thresholds improve token-by-token UI granularity, while higher values improve throughput by reducing isolate message overhead. +- Use `speculativeDecoding` only for native LiteRT-LM and benchmark your target + model/device before enabling it; the default remains off because it is not a + universal speedup. 
- `reusePromptPrefix` is enabled by default for native generation; keep it on for multi-turn chats and repeated prompts, and validate parity for your target model/workload. @@ -199,6 +202,10 @@ Compare llama.cpp/GGUF and LiteRT-LM with the bundled fair benchmark scripts: # macOS native, Gemma 4 E2B artifacts DECODE_TOKENS=256 tool/macos_fair_litert_vs_llamadart.sh +# Native LiteRT-LM speculative decoding off/on comparison +SPECULATIVE=false DECODE_TOKENS=256 tool/macos_fair_litert_vs_llamadart.sh +SPECULATIVE=true DECODE_TOKENS=256 tool/macos_fair_litert_vs_llamadart.sh + # Web LiteRT-LM; use TARGETS=llamadart to test GGUF WebGPU separately DOWNLOAD_LITERT_WEB_MODEL=1 \ DECODE_TOKENS=256 \ @@ -214,6 +221,13 @@ DEVICE= "$ADB" -s "$DEVICE" shell input keyevent KEYCODE_WAKEUP DEVICE="$DEVICE" ADB="$ADB" OUTPUT_TOKENS=256 WARMUPS=1 RUNS=3 \ TARGETS=litert_lm,llamadart tool/litert_lm_pixel_benchmark.sh + +DEVICE="$DEVICE" ADB="$ADB" TARGETS=litert_lm BACKEND=gpu \ + SPECULATIVE=false OUTPUT_TOKENS=256 WARMUPS=1 RUNS=3 \ + tool/litert_lm_pixel_benchmark.sh +DEVICE="$DEVICE" ADB="$ADB" TARGETS=litert_lm BACKEND=gpu \ + SPECULATIVE=true OUTPUT_TOKENS=256 WARMUPS=1 RUNS=3 \ + tool/litert_lm_pixel_benchmark.sh ``` Current measured Gemma 4 E2B results are recorded in diff --git a/website/docs/platforms/support-matrix.md b/website/docs/platforms/support-matrix.md index 5f0a84a7..aacb92e8 100644 --- a/website/docs/platforms/support-matrix.md +++ b/website/docs/platforms/support-matrix.md @@ -128,8 +128,11 @@ LiteRT-LM does not currently expose embeddings, state persistence, LoRA, or multimodal projector APIs through llamadart. On native LiteRT-LM targets, high-level thinking and tool-call parsing still run through `LlamaEngine` for compatible templates, but llama.cpp-style GBNF grammar constraints are not -supported for `.litertlm` generation. 
Web LiteRT-LM also does not expose -tokenizer operations and is limited to single-turn text prompts, so it should +supported for `.litertlm` generation. Native LiteRT-LM can opt into runtime +speculative decoding through `GenerationParams.speculativeDecoding`; Web +LiteRT-LM rejects that option until the browser runtime exposes an equivalent +control. Web LiteRT-LM also does not expose tokenizer operations and is limited +to single-turn text prompts, so it should not be treated as a multi-turn `ChatSession` or tool-calling backend yet. `llamadart` rejects unsupported operations explicitly for `.litertlm` loads instead of silently ignoring llama.cpp-only settings. From ad882080965a53460286ea7cbc94526da486b930 Mon Sep 17 00:00:00 2001 From: Jhin Lee Date: Sun, 31 May 2026 11:07:28 -0400 Subject: [PATCH 2/3] Default speculative benchmark runs off --- example/chat_app/lib/litert_lm_benchmark_app.dart | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/chat_app/lib/litert_lm_benchmark_app.dart b/example/chat_app/lib/litert_lm_benchmark_app.dart index 800198dc..60cc2153 100644 --- a/example/chat_app/lib/litert_lm_benchmark_app.dart +++ b/example/chat_app/lib/litert_lm_benchmark_app.dart @@ -143,7 +143,7 @@ class _LiteRtLmBenchmarkAppState extends State { ); bool _speculative = const bool.fromEnvironment( 'LITERT_LM_SPECULATIVE', - defaultValue: true, + defaultValue: false, ); int _maxTokens = const int.fromEnvironment( 'LITERT_LM_MAX_TOKENS', From 5358b08cf3293d666fadcc78b9b5f5b05348c01d Mon Sep 17 00:00:00 2001 From: Jhin Lee Date: Sun, 31 May 2026 11:11:19 -0400 Subject: [PATCH 3/3] Cover speculative LiteRT-LM client reuse --- .../litert_lm/litert_lm_service_test.dart | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/test/unit/backends/litert_lm/litert_lm_service_test.dart b/test/unit/backends/litert_lm/litert_lm_service_test.dart index ff456e2b..c1ccaf0f 100644 --- a/test/unit/backends/litert_lm/litert_lm_service_test.dart 
+++ b/test/unit/backends/litert_lm/litert_lm_service_test.dart @@ -917,6 +917,67 @@ void main() { } }); + test( + 'recreates LiteRT-LM client when speculative decoding changes', + () async { + final firstClient = _FakeLiteRtLmRuntimeClient(); + final secondClient = _FakeLiteRtLmRuntimeClient(); + final clients = <_FakeLiteRtLmRuntimeClient>[firstClient, secondClient]; + var nextClient = 0; + final service = LiteRtLmService( + clientFactory: () => clients[nextClient++], + ); + + try { + final modelHandle = await service.loadModel( + modelFile.path, + const ModelParams( + contextSize: 3072, + preferredBackend: GpuBackend.cpu, + ), + ); + final contextHandle = service.createContext( + modelHandle, + const ModelParams( + contextSize: 3072, + preferredBackend: GpuBackend.cpu, + ), + ); + + final firstChunks = service + .generate( + contextHandle, + 'hello', + const GenerationParams(maxTokens: 7, speculativeDecoding: true), + ) + .toList(); + await firstClient.generateStarted.future; + firstClient.generated.add('first'); + await firstClient.generated.close(); + await firstChunks; + + final secondChunks = service + .generate( + contextHandle, + 'hello', + const GenerationParams(maxTokens: 7), + ) + .toList(); + await secondClient.generateStarted.future; + secondClient.generated.add('second'); + await secondClient.generated.close(); + await secondChunks; + + expect(firstClient.lastSpeculativeDecoding, isTrue); + expect(firstClient.disposeCount, 1); + expect(secondClient.lastSpeculativeDecoding, isFalse); + expect(nextClient, 2); + } finally { + service.dispose(); + } + }, + ); + test('buffers stop-sequence tails when no stop is found', () async { final fakeClient = _FakeLiteRtLmRuntimeClient(); final service = LiteRtLmService(clientFactory: () => fakeClient);