From 9acae4fdfd1cd9efe99b81eb29971d469ddada84 Mon Sep 17 00:00:00 2001
From: Foundry Bot <ops@infernet.org>
Date: Mon, 2 Mar 2026 18:24:48 +0000
Subject: [PATCH] fix: AI SDK compatibility, rootless Docker, preflight checks,
 troubleshooting docs

- Add --reasoning-format none to Qwen3.5-9B profiles (rtx5090, default)
  to prevent AI SDK extractReasoningMiddleware crash on empty <think>
  blocks emitted by the Unsloth GGUF chat template (vercel/ai #12054)

- Fix docker-compose.yml for rootless Docker compatibility:
  - Remove container-level sysctls (bbr, busy_read, busy_poll) that fail
    on rootless Docker. These are host-level settings already applied by
    scripts/host-setup.sh
  - Remove ulimits.memlock (fails on rootless Docker, --mlock in profile
    handles memory locking at the application level)
  - Fix duplicate volumes key in prometheus service (YAML bug)

- Add preflight_check() to entrypoint that warns on startup about
  suboptimal host kernel params (BBR, NUMA balancing, swappiness, GPU
  persistence mode) with actionable fix: sudo ./scripts/host-setup.sh

- Update AGENTS.md OpenCode config to working @ai-sdk/openai-compatible
  setup with correct model ID (Qwen3.5-9B-UD-Q4_K_XL.gguf) and note
  about @ai-sdk/openai crash

- Add comprehensive TROUBLESHOOTING.md with symptom-first headings,
  exact error messages, diagnostic commands, and verification steps

Tested on eae@192.168.0.219 (RTX 5090, rootless Docker):
- Preflight check detects missing BBR correctly
- --reasoning-format none confirmed: no reasoning_content in responses
- Tool calling works with @ai-sdk/openai-compatible
- 4 slots, 262K context/slot, all 33 layers on GPU
---
 .gitignore                            |   6 +
 AGENTS.md                             |  50 +--
 TROUBLESHOOTING.md                    | 421 ++++++++++++++++++++++++++
 docker-compose.yml                    |  14 +-
 models/qwen3.5-9b/entrypoint.sh       |  71 ++++-
 models/qwen3.5-9b/profiles/default.sh |   2 +-
 models/qwen3.5-9b/profiles/rtx5090.sh |   4 +-
 scripts/entrypoint.sh                 |  71 ++++-
 8 files changed, 601 insertions(+), 38 deletions(-)
 create mode 100644 TROUBLESHOOTING.md
diff --git a/.gitignore b/.gitignore
index 430c129..d4437c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,9 @@ Thumbs.db
 # Benchmark results (keep structure, ignore large outputs)
 benchmarks/results/*.json
 !benchmarks/results/.gitkeep
+
+# OpenCode (user-specific local config at project root)
+/opencode.json
+
+# Build artifacts
+remote-build.log
diff --git a/AGENTS.md b/AGENTS.md
index 548e250..1160a1a 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -37,24 +37,31 @@ No API key is required by default. If your client demands one, any non-empty str
 
 ### OpenCode
 
-[OpenCode](https://opencode.ai) connects directly via OpenAI-compatible providers.
+[OpenCode](https://opencode.ai) connects via the `@ai-sdk/openai-compatible` provider.
+
+> **Important:** Use `@ai-sdk/openai-compatible`, not `@ai-sdk/openai`. The latter crashes on
+> models that emit `<think>` tokens (see [Troubleshooting](TROUBLESHOOTING.md#opencode-text-part-msg-not-found)).
 
 ```json
-// ~/.config/opencode/config.json
+// opencode.json (project root or ~/.config/opencode/opencode.json)
 {
+  "$schema": "https://opencode.ai/config.json",
+  "model": "foundry/Qwen3.5-9B-UD-Q4_K_XL.gguf",
   "provider": {
     "foundry": {
+      "npm": "@ai-sdk/openai-compatible",
       "name": "Foundry",
-      "type": "openai",
-      "url": "http://localhost:8080/v1",
+      "options": {
+        "baseURL": "http://localhost:8080/v1",
+        "apiKey": "sk-local"
+      },
       "models": {
-        "qwen": {
-          "id": "qwen3.5-9b",
-          "name": "Qwen 3.5 9B"
-        },
-        "qwen-coder": {
-          "id": "qwen3-coder-30b-a3b",
-          "name": "Qwen 3 Coder 30B A3B"
+        "Qwen3.5-9B-UD-Q4_K_XL.gguf": {
+          "name": "Qwen 3.5 9B",
+          "limit": {
+            "context": 262144,
+            "output": 32768
+          }
         }
       }
     }
@@ -62,6 +69,8 @@ No API key is required by default. If your client demands one, any non-empty str
 }
 ```
 
+The model ID must match what `/v1/models` returns (check with `curl http://localhost:8080/v1/models`).
+
 ### Cursor
 
 Settings > Models > OpenAI API Base:
@@ -493,7 +502,7 @@ for chunk in stream:
         print(chunk.choices[0].delta.content, end="", flush=True)
 ```
 
-The Docker Compose configuration includes TCP tuning (BBR congestion control, busy polling) for minimal streaming latency. Time-to-first-token is typically ~50-200 ms depending on prompt length.
+The host tuning script (`sudo ./scripts/host-setup.sh`) configures BBR congestion control and busy polling for minimal streaming latency. Time-to-first-token is typically ~50-200 ms depending on prompt length.
 
 ## Multi-GPU Agent Routing
 
@@ -539,6 +548,10 @@ def get_client(agent_id: int) -> OpenAI:
 
 ## Troubleshooting
 
+For a comprehensive troubleshooting guide, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
+
+Quick reference for the most common issues:
+
 ### Model loads on CPU instead of GPU
 
 If you see `no devices with dedicated memory found` in the logs, the CUDA backend failed to load. Check:
@@ -553,6 +566,10 @@ If you see `no devices with dedicated memory found` in the logs, the CUDA backen
 2. Check VRAM: `nvidia-smi` -- if VRAM is full, reduce context with `FOUNDRY_CTX_LENGTH`
 3. Check if all slots are occupied: `curl http://localhost:8080/metrics | grep slots`
 
+### OpenCode: "text part msg_... not found"
+
+Use `@ai-sdk/openai-compatible` (not `@ai-sdk/openai`) and ensure the server has `--reasoning-format none`. See [TROUBLESHOOTING.md](TROUBLESHOOTING.md#opencode-text-part-msg-not-found) for details.
+
 ### Connection refused
 
 1. Container might still be loading the model. Check `docker logs <container>` for progress.
@@ -571,12 +588,3 @@ docker run --gpus all -p 8080:8080 \
 ```
 
 For GPUs with less than 16 GB VRAM, use Qwen3.5-9B (only 5.66 GB model weight). For 16+ GB, Qwen3-Coder-30B-A3B's MoE expert offloading can spill inactive experts to CPU.
-
-### Inconsistent response speeds
-
-If response speed varies between requests, check for:
-
-1. **Prompt cache misses**: First message in a conversation is always slower (prompt processing)
-2. **Concurrent slot contention**: Other agents may be using slots simultaneously
-3. **GPU thermal throttling**: Check `nvidia-smi -q -d PERFORMANCE` for throttle reasons
-4. **CPU interrupt interference**: Pin GPU IRQs to dedicated cores (see README Host Tuning section)
diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md
new file mode 100644
index 0000000..5e6250b
--- /dev/null
+++ b/TROUBLESHOOTING.md
@@ -0,0 +1,421 @@
+# Troubleshooting
+
+Common issues and solutions for Foundry inference servers.
+
+## First Steps: Collecting Diagnostic Information
+
+Before troubleshooting, gather this information:
+
+### Container Logs
+
+```bash
+docker compose logs inference --tail 50
+```
+
+### Health Check
+
+```bash
+curl http://localhost:8080/health
+# Returns: {"status":"ok"} when healthy
+```
+
+### Metrics
+
+```bash
+curl -s http://localhost:8080/metrics | grep "^llamacpp:"
+```
+
+### GPU Status
+
+```bash
+nvidia-smi --query-gpu=name,memory.used,memory.total,temperature.gpu,utilization.gpu,persistence_mode --format=csv
+```
+
+---
+
+## GPU & CUDA
+
+### "no devices with dedicated memory found"
+
+**Symptom:** Server starts but model loads on CPU. Inference is extremely slow.
+
+**Cause:** The NVIDIA container runtime is not mounting GPU drivers into the container.
+
+**Fix:**
+1. Verify the host GPU works:
+   ```bash
+   nvidia-smi
+   ```
+2. Verify Docker can access the GPU:
+   ```bash
+   docker run --rm --gpus all nvidia/cuda:12.9.1-base-ubuntu24.04 nvidia-smi
+   ```
+3. If step 2 fails, install the NVIDIA Container Toolkit:
+   ```bash
+   # Ubuntu/Debian
+   curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+   curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+     sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+     sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+   sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
+   sudo nvidia-ctk runtime configure --runtime=docker
+   sudo systemctl restart docker
+   ```
+
+**Verify:** `docker compose logs inference | grep "offloaded"` should show `offloaded N/N layers to GPU`.
+
+### "CUDA error: no kernel image is available for execution on the device"
+
+**Symptom:** Server crashes immediately after loading model.
+
+**Cause:** The compiled CUDA kernels don't match your GPU architecture. Foundry images are built for sm_89 (Ada/RTX 40xx) and sm_120a (Blackwell/RTX 50xx).
+
+**Fix:** If you have an older GPU (e.g. Ampere/RTX 30xx), rebuild the image with your architecture:
+```bash
+# In the Dockerfile, change CMAKE_CUDA_ARCHITECTURES:
+-DCMAKE_CUDA_ARCHITECTURES="86;89;120a"   # Add 86 for Ampere
+```
+
+**Verify:** `docker compose logs inference | grep "CUDA"` should show successful backend loading.
+
+---
+
+## Out of Memory
+
+### Container exits with "OOMKilled"
+
+**Symptom:** Container disappears. `docker inspect` shows OOMKilled.
+
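+**Cause:** System RAM (or the container's memory limit) exhausted. Docker sets `OOMKilled` when the kernel OOM killer stops the container; VRAM exhaustion instead appears as "CUDA out of memory" in the logs (see the next section). For reference, the RTX 5090 profile uses 1M total context (262K/slot x 4), which requires ~29.5 GB VRAM for Qwen3.5-9B.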
+
+**Diagnosis:**
+```bash
+docker inspect --format='{{.State.OOMKilled}}' foundry-inference-1
+# Returns: true if OOM killed
+```
+
+**Fix:**
+1. Reduce context length:
+   ```bash
+   FOUNDRY_CTX_LENGTH=65536 docker compose up
+   ```
+2. Reduce parallel slots:
+   ```bash
+   FOUNDRY_EXTRA_ARGS="--parallel 1" docker compose up
+   ```
+3. Use more aggressive KV cache quantization (add to `FOUNDRY_EXTRA_ARGS`):
+   ```bash
+   FOUNDRY_EXTRA_ARGS="-ctk q4_0 -ctv q4_0" docker compose up
+   ```
+
+**Verify:** `nvidia-smi` should show VRAM usage within your GPU's capacity.
+
+### "CUDA out of memory" in logs
+
+**Symptom:** Server starts but crashes when processing the first request.
+
+**Cause:** KV cache allocation exceeds available VRAM. This happens when context is too large for your GPU.
+
+**Fix:** Same as OOMKilled above. Start with a small context and increase:
+```bash
+FOUNDRY_CTX_LENGTH=16384 docker compose up
+```
+
+---
+
+## Startup & Model Loading
+
+### "Download failed: xxx.gguf not found after download"
+
+**Symptom:** First startup fails during model download.
+
+**Cause:** Hugging Face API rate limiting, network issues, or missing auth token for gated models.
+
+**Fix:**
+1. Set a Hugging Face token (free, read-only access):
+   ```bash
+   echo "HF_TOKEN=hf_your_token_here" >> .env   # append: do not overwrite an existing .env
+   docker compose up
+   ```
+2. Or download manually and mount:
+   ```bash
+   pip install huggingface-hub
+   huggingface-cli download unsloth/Qwen3.5-9B-GGUF Qwen3.5-9B-UD-Q4_K_XL.gguf --local-dir ~/.cache/foundry
+   docker compose up
+   ```
+
+**Verify:** `ls -lh ~/.cache/foundry/*.gguf` should show the model file.
+
+### "Permission denied: Cannot access /models"
+
+**Symptom:** Container starts but cannot read the model volume.
+
+**Cause:** Docker volume permissions don't match the container user.
+
+**Fix:**
+```bash
+sudo chown -R $(id -u):$(id -g) ~/.cache/foundry
+chmod 755 ~/.cache/foundry
+```
+
+**Verify:** `docker compose logs inference | grep "Model found"` confirms the model is accessible.
+
+### Model loading takes 30+ seconds
+
+**Symptom:** Long delay between container start and first `/health` OK.
+
+**Cause:** First-time model loading, GPU not in persistence mode, or slow storage.
+
+**Fix:**
+1. Enable GPU persistence mode (avoids ~100-500ms cold start per request):
+   ```bash
+   sudo nvidia-smi -pm 1
+   ```
+2. Run host tuning for optimized I/O:
+   ```bash
+   sudo ./scripts/host-setup.sh
+   ```
+3. Subsequent starts are fast since the model is cached in `~/.cache/foundry`.
+
+**Verify:** `docker compose logs inference | grep "Model found"` shows the cached model with its size.
+
+---
+
+## API & Connectivity
+
+### Connection refused on port 8080
+
+**Symptom:** `curl: (7) Failed to connect to localhost port 8080: Connection refused`
+
+**Cause:** Container is still loading the model, or another service is using port 8080.
+
+**Fix:**
+1. Check if the model is still loading:
+   ```bash
+   docker compose logs inference --tail 5
+   # Look for "server is listening on" message
+   ```
+2. Check for port conflicts:
+   ```bash
+   ss -tlnp src :8080
+   ```
+3. Use a different port:
+   ```bash
+   FOUNDRY_PORT=8090 docker compose up
+   ```
+
+**Verify:** `curl http://localhost:8080/health` returns `{"status":"ok"}`.
+
+### HTTP 503 / All slots busy
+
+**Symptom:** API returns 503 when all parallel slots are occupied.
+
+**Cause:** More concurrent requests than available slots. The RTX 5090 profile provides 4 slots; the default profile provides 2.
+
+**Fix:**
+1. Check slot usage:
+   ```bash
+   curl -s http://localhost:8080/metrics | grep -E "llamacpp:requests_(processing|deferred)"
+   ```
+2. Increase parallel slots (requires more VRAM):
+   ```bash
+   FOUNDRY_EXTRA_ARGS="--parallel 8" docker compose up
+   ```
+3. Or queue requests client-side with retry logic.
+
+---
+
+## Performance
+
+### Throughput below documented benchmarks
+
+**Symptom:** Decode speed is 50%+ lower than documented (e.g. 90 tok/s instead of 177 tok/s for Qwen3.5-9B).
+
+**Cause:** Host kernel not tuned. The documented benchmarks assume `host-setup.sh` has been run.
+
+**Diagnosis:**
+```bash
+# Check CPU governor
+cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
+# Should be: performance
+
+# Check NUMA balancing
+sysctl kernel.numa_balancing
+# Should be: 0
+
+# Check GPU persistence mode
+nvidia-smi --query-gpu=persistence_mode --format=csv,noheader
+# Should be: Enabled
+
+# Check BBR congestion control (for streaming latency)
+sysctl net.ipv4.tcp_congestion_control
+# Should be: bbr
+```
+
+**Fix:** Run the host tuning script:
+```bash
+sudo ./scripts/host-setup.sh
+```
+
+This sets: CPU governor to performance, disables NUMA balancing, enables BBR TCP, tunes NVMe I/O, enables GPU persistence, allocates hugepages, and enables busy polling for reduced NIC latency.
+
+**Verify:** Re-run your benchmark. Single-stream decode should be within 10% of documented speeds.
+
+### Latency spikes after 10-15 minutes (thermal throttling)
+
+**Symptom:** Steady performance degrades over time. GPU temp >85C.
+
+**Diagnosis:**
+```bash
+nvidia-smi -q -d PERFORMANCE | grep -A10 -E "Clocks (Throttle|Event) Reasons"
+# Look for "SW Thermal Slowdown: Active"
+```
+
+**Fix:**
+1. Reduce concurrent load (fewer parallel slots = less heat):
+   ```bash
+   FOUNDRY_EXTRA_ARGS="--parallel 2" docker compose up
+   ```
+2. Check `nvidia-smi dmon -s p` for real-time power/temp monitoring.
+
+**Verify:** `nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader` should stay below 83C under load.
+
+---
+
+## Docker & Container Issues
+
+### Rootless Docker: sysctls and ulimits fail
+
+**Symptom:** Container fails to start with `sysctl not allowed` or `permission denied setting ulimit`.
+
+**Cause:** Rootless Docker cannot set privileged sysctls (`tcp_congestion_control`, `busy_read`, `busy_poll`) or unlimited memlock.
+
+**Fix:** These settings are intentionally omitted from `docker-compose.yml` for rootless compatibility. Apply them at the host level instead:
+```bash
+sudo ./scripts/host-setup.sh
+```
+
+This is more effective anyway -- container-level network sysctls share the host's network stack.
+
+### eBPF exporter won't start
+
+**Symptom:** `ebpf-exporter` container exits immediately.
+
+**Cause:** Requires `privileged: true` and `pid: host` for kernel tracepoint access. Incompatible with rootless Docker, SELinux enforcing, and some cloud providers.
+
+**Fix:** The eBPF exporter is optional (part of the `monitoring` profile). The other monitoring services (Prometheus, Grafana, GPU exporter, node exporter) work without it.
+
+To run without eBPF:
+```bash
+# The monitoring profile still works -- eBPF exporter will fail but others start fine
+docker compose --profile monitoring up -d
+```
+
+**Verify:** `docker compose --profile monitoring ps` -- all services should be "Up" except `ebpf-exporter`.
+
+---
+
+## AI Agent Integration
+
+### OpenCode: "text part msg_... not found"
+
+**Symptom:** OpenCode crashes immediately with `text part msg_XXXX not found` when using `@ai-sdk/openai`.
+
+**Cause:** The `@ai-sdk/openai` package includes `extractReasoningMiddleware` that crashes when it encounters `<think>` tokens in the response content. Qwen3.5 models always generate `<think>` blocks even with thinking disabled ([vercel/ai #12054](https://github.com/vercel/ai/issues/12054)).
+
+**Fix:** Use `@ai-sdk/openai-compatible` instead of `@ai-sdk/openai` in your `opencode.json`:
+```json
+{
+  "provider": {
+    "foundry": {
+      "npm": "@ai-sdk/openai-compatible",
+      "options": {
+        "baseURL": "http://localhost:8080/v1",
+        "apiKey": "sk-local"
+      }
+    }
+  }
+}
+```
+
+Also ensure the server uses `--reasoning-format none` (already set in Foundry's Qwen3.5-9B profiles).
+
+### OpenCode: Tool calls appear as raw XML
+
+**Symptom:** Model outputs `<tool_call><function=...>` as text instead of executing tools.
+
+**Cause:** The server is returning `reasoning_content` in the API response, which confuses the AI SDK's tool call parser.
+
+**Fix:** Ensure `--reasoning-format none` is in the server's `PROFILE_EXTRA_ARGS`. This is the default in Foundry's Qwen3.5-9B profiles. If you're using a custom profile or `FOUNDRY_EXTRA_ARGS`, add it:
+```bash
+FOUNDRY_EXTRA_ARGS="--reasoning-format none" docker compose up
+```
+
+**Verify:**
+```bash
+curl -s http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model":"Qwen3.5-9B-UD-Q4_K_XL.gguf","messages":[{"role":"user","content":"hello"}],"max_tokens":64}' \
+  | python3 -c "import sys,json; d=json.load(sys.stdin); print('reasoning_content' in d['choices'][0]['message'])"
+# Should print: False
+```
+
+### Client requires API key
+
+**Symptom:** Client errors with "No API key provided" even though Foundry doesn't require one.
+
+**Fix:** Use any non-empty string as the API key:
+```bash
+export OPENAI_API_KEY=sk-local
+```
+
+---
+
+## Monitoring
+
+### Grafana dashboards show no data
+
+**Symptom:** Dashboards load but all panels are empty.
+
+**Cause:** Prometheus hasn't scraped targets yet, or targets are unreachable.
+
+**Diagnosis:**
+```bash
+# Check Prometheus targets
+curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep -E '"health"|"lastError"'
+```
+
+**Fix:**
+1. Wait 30-60 seconds after starting services (Prometheus scrapes every 15s).
+2. Verify the inference metrics endpoint works:
+   ```bash
+   curl -s http://localhost:8080/metrics | head -5
+   ```
+3. Check that `monitoring/prometheus/prometheus.yml` has correct scrape targets.
+
+**Verify:** Prometheus targets page at `http://localhost:9090/targets` should show all targets as "UP".
+
+---
+
+## Collecting Information for Bug Reports
+
+When filing an issue, include the output of these commands:
+
+```bash
+# System info
+nvidia-smi
+docker version
+docker compose version
+uname -a
+
+# Container state
+docker compose --profile monitoring ps
+docker compose logs inference --tail 100
+
+# Server health
+curl -s http://localhost:8080/health
+curl -s http://localhost:8080/v1/models | python3 -m json.tool
+
+# Metrics snapshot
+curl -s http://localhost:8080/metrics
+```
diff --git a/docker-compose.yml b/docker-compose.yml
index e60c6de..8a6d1b4 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -60,22 +60,20 @@ services:
               count: 1
               capabilities: [gpu]
     shm_size: '2gb'
+    # NOTE: Network tuning (BBR, busy_poll) and memory locking are applied at the
+    # host level by scripts/host-setup.sh. Run it once before deploying:
+    #   sudo ./scripts/host-setup.sh
+    # The container-level sysctls and the memlock ulimit for these are omitted because
+    # they fail on rootless Docker and are less effective than host-level settings.
     sysctls:
       - net.core.somaxconn=4096
       - net.ipv4.tcp_keepalive_time=60
-      - net.ipv4.tcp_congestion_control=bbr
-      - net.core.busy_read=50
-      - net.core.busy_poll=50
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
       interval: 10s
       timeout: 5s
       retries: 5
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
     networks:
       - default
       - monitoring
@@ -88,8 +86,6 @@ services:
     image: prom/prometheus:latest
     profiles:
       - monitoring
-    volumes:
-      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
     extra_hosts:
       - "host.docker.internal:host-gateway"
     ports:
diff --git a/models/qwen3.5-9b/entrypoint.sh b/models/qwen3.5-9b/entrypoint.sh
index 745915b..413db4a 100755
--- a/models/qwen3.5-9b/entrypoint.sh
+++ b/models/qwen3.5-9b/entrypoint.sh
@@ -102,6 +102,68 @@ load_profile() {
     source "$profile_file"
 }
 
+# ==============================================================================
+# Pre-flight Host Check
+# ==============================================================================
+# Checks host kernel parameters visible via /proc/sys/ and warns about
+# suboptimal settings that affect inference performance. Does NOT block startup.
+#
+# These parameters are set by: sudo ./scripts/host-setup.sh
+
+preflight_check() {
+    # Warn-only audit of host tuning knobs. Every probe is guarded (`|| true` /
+    # `|| return 0`) so an unreadable /proc entry or a failing nvidia-smi cannot
+    # abort startup, even if this script runs with errexit/pipefail enabled.
+    local warnings=0 swappiness pm
+
+    # check_sysctl PATH EXPECTED LABEL WHY -- warn when PATH is readable but != EXPECTED
+    check_sysctl() {
+        local path="$1" expected="$2" label="$3" fix="$4" actual
+        if [ -r "$path" ]; then
+            actual=$(cat "$path" 2>/dev/null) || return 0   # read failed: skip, never abort
+            if [ "$actual" != "$expected" ]; then
+                warn "  $label = $actual (expected: $expected) -- $fix"
+                warnings=$((warnings + 1))
+            fi
+        fi
+    }
+
+    log "Checking host kernel parameters..."
+
+    # TCP congestion control (BBR reduces streaming latency)
+    check_sysctl /proc/sys/net/ipv4/tcp_congestion_control "bbr" \
+        "tcp_congestion_control" "BBR reduces token streaming latency"
+
+    # NUMA balancing (causes random latency spikes during decode)
+    check_sysctl /proc/sys/kernel/numa_balancing "0" \
+        "numa_balancing" "page migration causes decode latency jitter"
+
+    # Swappiness (model weights should stay in RAM); compared only when the value is numeric
+    swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null || true)
+    if [[ "$swappiness" =~ ^[0-9]+$ ]] && [ "$swappiness" -gt 10 ]; then
+        warn "  vm.swappiness = $swappiness (expected: 0-10) -- high swappiness may evict model weights"
+        warnings=$((warnings + 1))
+    fi
+
+    # GPU persistence mode (first reported GPU only; `|| true` tolerates a failing pipeline)
+    if command -v nvidia-smi &> /dev/null; then
+        pm=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader,nounits 2>/dev/null | head -1 | xargs || true)
+        if [ "$pm" = "Disabled" ]; then
+            warn "  GPU persistence mode = Disabled -- adds 100-500ms cold start latency"
+            warnings=$((warnings + 1))
+        fi
+    fi
+
+    if [ "$warnings" -gt 0 ]; then
+        warn ""
+        warn "  $warnings performance issue(s) detected. Run on the host:"
+        warn "    sudo ./scripts/host-setup.sh"
+        warn ""
+    else
+        log "Host kernel parameters: OK"
+    fi
+}
+
 # ==============================================================================
 # Model Download
 # ==============================================================================
@@ -294,10 +356,13 @@ main() {
     # 2. Load profile
     load_profile "$profile"
 
-    # 3. Download model if needed
+    # 3. Pre-flight host check (warns about suboptimal kernel params)
+    preflight_check
+
+    # 4. Download model if needed
     download_model
 
-    # 4. Build launch command (sets FOUNDRY_CMD array directly, no subshell)
+    # 5. Build launch command (sets FOUNDRY_CMD array directly, no subshell)
     build_command
 
     echo ""
@@ -308,7 +373,7 @@ main() {
     echo -e "${GREEN}  http://localhost:${FOUNDRY_PORT:-8080}/v1/chat/completions${NC}"
     echo ""
 
-    # 5. Launch (exec replaces shell process for proper signal handling)
+    # 6. Launch (exec replaces shell process for proper signal handling)
     # Use the array form to avoid word-splitting issues
     exec "${FOUNDRY_CMD[@]}"
 }
diff --git a/models/qwen3.5-9b/profiles/default.sh b/models/qwen3.5-9b/profiles/default.sh
index a20c158..2b09587 100644
--- a/models/qwen3.5-9b/profiles/default.sh
+++ b/models/qwen3.5-9b/profiles/default.sh
@@ -22,4 +22,4 @@ PROFILE_CPU_STRICT=0
 PROFILE_CACHE_REUSE=0           # Disabled: hybrid recurrent arch re-processes anyway
 PROFILE_NO_WEBUI="false"        # Keep web UI for exploration
 PROFILE_METRICS="false"
-PROFILE_EXTRA_ARGS="--swa-full --cache-ram 0"
+PROFILE_EXTRA_ARGS="--swa-full --cache-ram 0 --reasoning-format none"
diff --git a/models/qwen3.5-9b/profiles/rtx5090.sh b/models/qwen3.5-9b/profiles/rtx5090.sh
index ce3f3bf..1d2fd07 100644
--- a/models/qwen3.5-9b/profiles/rtx5090.sh
+++ b/models/qwen3.5-9b/profiles/rtx5090.sh
@@ -51,4 +51,6 @@ PROFILE_METRICS="true"          # Prometheus-compatible /metrics endpoint
 # --mlock: pin model in RAM; -b/-ub 4096: large batch for fast prompt encode
 # --swa-full: full SWA cache for hybrid attention models (DeltaNet + attention)
 # --cache-ram 0: disable prompt cache (hybrid recurrent arch forces re-processing)
-PROFILE_EXTRA_ARGS="--mlock -b 4096 -ub 4096 --swa-full --cache-ram 0"
+# --reasoning-format none: keep <think> tags as plain text in content, no reasoning_content field
+#   (prevents AI SDK extractReasoningMiddleware crash on empty think blocks)
+PROFILE_EXTRA_ARGS="--mlock -b 4096 -ub 4096 --swa-full --cache-ram 0 --reasoning-format none"
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 745915b..413db4a 100755
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -102,6 +102,68 @@ load_profile() {
     source "$profile_file"
 }
 
+# ==============================================================================
+# Pre-flight Host Check
+# ==============================================================================
+# Checks host kernel parameters visible via /proc/sys/ and warns about
+# suboptimal settings that affect inference performance. Does NOT block startup.
+#
+# These parameters are set by: sudo ./scripts/host-setup.sh
+
+preflight_check() {
+    # Warn-only audit of host tuning knobs. Every probe is guarded (`|| true` /
+    # `|| return 0`) so an unreadable /proc entry or a failing nvidia-smi cannot
+    # abort startup, even if this script runs with errexit/pipefail enabled.
+    local warnings=0 swappiness pm
+
+    # check_sysctl PATH EXPECTED LABEL WHY -- warn when PATH is readable but != EXPECTED
+    check_sysctl() {
+        local path="$1" expected="$2" label="$3" fix="$4" actual
+        if [ -r "$path" ]; then
+            actual=$(cat "$path" 2>/dev/null) || return 0   # read failed: skip, never abort
+            if [ "$actual" != "$expected" ]; then
+                warn "  $label = $actual (expected: $expected) -- $fix"
+                warnings=$((warnings + 1))
+            fi
+        fi
+    }
+
+    log "Checking host kernel parameters..."
+
+    # TCP congestion control (BBR reduces streaming latency)
+    check_sysctl /proc/sys/net/ipv4/tcp_congestion_control "bbr" \
+        "tcp_congestion_control" "BBR reduces token streaming latency"
+
+    # NUMA balancing (causes random latency spikes during decode)
+    check_sysctl /proc/sys/kernel/numa_balancing "0" \
+        "numa_balancing" "page migration causes decode latency jitter"
+
+    # Swappiness (model weights should stay in RAM); compared only when the value is numeric
+    swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null || true)
+    if [[ "$swappiness" =~ ^[0-9]+$ ]] && [ "$swappiness" -gt 10 ]; then
+        warn "  vm.swappiness = $swappiness (expected: 0-10) -- high swappiness may evict model weights"
+        warnings=$((warnings + 1))
+    fi
+
+    # GPU persistence mode (first reported GPU only; `|| true` tolerates a failing pipeline)
+    if command -v nvidia-smi &> /dev/null; then
+        pm=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader,nounits 2>/dev/null | head -1 | xargs || true)
+        if [ "$pm" = "Disabled" ]; then
+            warn "  GPU persistence mode = Disabled -- adds 100-500ms cold start latency"
+            warnings=$((warnings + 1))
+        fi
+    fi
+
+    if [ "$warnings" -gt 0 ]; then
+        warn ""
+        warn "  $warnings performance issue(s) detected. Run on the host:"
+        warn "    sudo ./scripts/host-setup.sh"
+        warn ""
+    else
+        log "Host kernel parameters: OK"
+    fi
+}
+
 # ==============================================================================
 # Model Download
 # ==============================================================================
@@ -294,10 +356,13 @@ main() {
     # 2. Load profile
     load_profile "$profile"
 
-    # 3. Download model if needed
+    # 3. Pre-flight host check (warns about suboptimal kernel params)
+    preflight_check
+
+    # 4. Download model if needed
     download_model
 
-    # 4. Build launch command (sets FOUNDRY_CMD array directly, no subshell)
+    # 5. Build launch command (sets FOUNDRY_CMD array directly, no subshell)
     build_command
 
     echo ""
@@ -308,7 +373,7 @@ main() {
     echo -e "${GREEN}  http://localhost:${FOUNDRY_PORT:-8080}/v1/chat/completions${NC}"
     echo ""
 
-    # 5. Launch (exec replaces shell process for proper signal handling)
+    # 6. Launch (exec replaces shell process for proper signal handling)
     # Use the array form to avoid word-splitting issues
     exec "${FOUNDRY_CMD[@]}"
 }