From 9acae4fdfd1cd9efe99b81eb29971d469ddada84 Mon Sep 17 00:00:00 2001 From: Foundry Bot Date: Mon, 2 Mar 2026 18:24:48 +0000 Subject: [PATCH] fix: AI SDK compatibility, rootless Docker, preflight checks, troubleshooting docs - Add --reasoning-format none to Qwen3.5-9B profiles (rtx5090, default) to prevent AI SDK extractReasoningMiddleware crash on empty <think> blocks emitted by the Unsloth GGUF chat template (vercel/ai #12054) - Fix docker-compose.yml for rootless Docker compatibility: - Remove container-level sysctls (bbr, busy_read, busy_poll) that fail on rootless Docker. These are host-level settings already applied by scripts/host-setup.sh - Remove ulimits.memlock (fails on rootless Docker, --mlock in profile handles memory locking at the application level) - Fix duplicate volumes key in prometheus service (YAML bug) - Add preflight_check() to entrypoint that warns on startup about suboptimal host kernel params (BBR, NUMA balancing, swappiness, GPU persistence mode) with actionable fix: sudo ./scripts/host-setup.sh - Update AGENTS.md OpenCode config to working @ai-sdk/openai-compatible setup with correct model ID (Qwen3.5-9B-UD-Q4_K_XL.gguf) and note about @ai-sdk/openai crash - Add comprehensive TROUBLESHOOTING.md with symptom-first headings, exact error messages, diagnostic commands, and verification steps Tested on eae@192.168.0.219 (RTX 5090, rootless Docker): - Preflight check detects missing BBR correctly - --reasoning-format none confirmed: no reasoning_content in responses - Tool calling works with @ai-sdk/openai-compatible - 4 slots, 262K context/slot, all 33 layers on GPU --- .gitignore | 6 + AGENTS.md | 50 +-- TROUBLESHOOTING.md | 421 ++++++++++++++++++++++++++ docker-compose.yml | 14 +- models/qwen3.5-9b/entrypoint.sh | 71 ++++- models/qwen3.5-9b/profiles/default.sh | 2 +- models/qwen3.5-9b/profiles/rtx5090.sh | 4 +- scripts/entrypoint.sh | 71 ++++- 8 files changed, 601 insertions(+), 38 deletions(-) create mode 100644 TROUBLESHOOTING.md diff --git 
a/.gitignore b/.gitignore index 430c129..d4437c3 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,9 @@ Thumbs.db # Benchmark results (keep structure, ignore large outputs) benchmarks/results/*.json !benchmarks/results/.gitkeep + +# OpenCode (user-specific local config at project root) +/opencode.json + +# Build artifacts +remote-build.log diff --git a/AGENTS.md b/AGENTS.md index 548e250..1160a1a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -37,24 +37,31 @@ No API key is required by default. If your client demands one, any non-empty str ### OpenCode -[OpenCode](https://opencode.ai) connects directly via OpenAI-compatible providers. +[OpenCode](https://opencode.ai) connects via the `@ai-sdk/openai-compatible` provider. + +> **Important:** Use `@ai-sdk/openai-compatible`, not `@ai-sdk/openai`. The latter crashes on +> models that emit `<think>` tokens (see [Troubleshooting](TROUBLESHOOTING.md#opencode-text-part-msg_-not-found)). ```json -// ~/.config/opencode/config.json +// opencode.json (project root or ~/.config/opencode/opencode.json) { + "$schema": "https://opencode.ai/config.json", + "model": "foundry/Qwen3.5-9B-UD-Q4_K_XL.gguf", "provider": { "foundry": { + "npm": "@ai-sdk/openai-compatible", "name": "Foundry", - "type": "openai", - "url": "http://localhost:8080/v1", + "options": { + "baseURL": "http://localhost:8080/v1", + "apiKey": "sk-local" + }, "models": { - "qwen": { - "id": "qwen3.5-9b", - "name": "Qwen 3.5 9B" - }, - "qwen-coder": { - "id": "qwen3-coder-30b-a3b", - "name": "Qwen 3 Coder 30B A3B" + "Qwen3.5-9B-UD-Q4_K_XL.gguf": { + "name": "Qwen 3.5 9B", + "limit": { + "context": 262144, + "output": 32768 + } } } } @@ -62,6 +69,8 @@ No API key is required by default. If your client demands one, any non-empty str } ``` +The model ID must match what `/v1/models` returns (check with `curl http://localhost:8080/v1/models`). 
+ ### Cursor Settings > Models > OpenAI API Base: @@ -493,7 +502,7 @@ for chunk in stream: print(chunk.choices[0].delta.content, end="", flush=True) ``` -The Docker Compose configuration includes TCP tuning (BBR congestion control, busy polling) for minimal streaming latency. Time-to-first-token is typically ~50-200 ms depending on prompt length. +The host tuning script (`sudo ./scripts/host-setup.sh`) configures BBR congestion control and busy polling for minimal streaming latency. Time-to-first-token is typically ~50-200 ms depending on prompt length. ## Multi-GPU Agent Routing @@ -539,6 +548,10 @@ def get_client(agent_id: int) -> OpenAI: ## Troubleshooting +For a comprehensive troubleshooting guide, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md). + +Quick reference for the most common issues: + ### Model loads on CPU instead of GPU If you see `no devices with dedicated memory found` in the logs, the CUDA backend failed to load. Check: @@ -553,6 +566,10 @@ If you see `no devices with dedicated memory found` in the logs, the CUDA backen 2. Check VRAM: `nvidia-smi` -- if VRAM is full, reduce context with `FOUNDRY_CTX_LENGTH` 3. Check if all slots are occupied: `curl http://localhost:8080/metrics | grep slots` +### OpenCode: "text part msg_... not found" + +Use `@ai-sdk/openai-compatible` (not `@ai-sdk/openai`) and ensure the server has `--reasoning-format none`. See [TROUBLESHOOTING.md](TROUBLESHOOTING.md#opencode-text-part-msg_-not-found) for details. + ### Connection refused 1. Container might still be loading the model. Check `docker logs <container>` for progress. @@ -571,12 +588,3 @@ docker run --gpus all -p 8080:8080 \ ``` For GPUs with less than 16 GB VRAM, use Qwen3.5-9B (only 5.66 GB model weight). For 16+ GB, Qwen3-Coder-30B-A3B's MoE expert offloading can spill inactive experts to CPU. - -### Inconsistent response speeds - -If response speed varies between requests, check for: - -1. 
**Prompt cache misses**: First message in a conversation is always slower (prompt processing) -2. **Concurrent slot contention**: Other agents may be using slots simultaneously -3. **GPU thermal throttling**: Check `nvidia-smi -q -d PERFORMANCE` for throttle reasons -4. **CPU interrupt interference**: Pin GPU IRQs to dedicated cores (see README Host Tuning section) diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..5e6250b --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,421 @@ +# Troubleshooting + +Common issues and solutions for Foundry inference servers. + +## First Steps: Collecting Diagnostic Information + +Before troubleshooting, gather this information: + +### Container Logs + +```bash +docker compose logs inference --tail 50 +``` + +### Health Check + +```bash +curl http://localhost:8080/health +# Returns: {"status":"ok"} when healthy +``` + +### Metrics + +```bash +curl -s http://localhost:8080/metrics | grep -E "llama_server_(slots|ctx|model)" +``` + +### GPU Status + +```bash +nvidia-smi --query-gpu=name,memory.used,memory.total,temperature.gpu,utilization.gpu,persistence_mode --format=csv +``` + +--- + +## GPU & CUDA + +### "no devices with dedicated memory found" + +**Symptom:** Server starts but model loads on CPU. Inference is extremely slow. + +**Cause:** The NVIDIA container runtime is not mounting GPU drivers into the container. + +**Fix:** +1. Verify the host GPU works: + ```bash + nvidia-smi + ``` +2. Verify Docker can access the GPU: + ```bash + docker run --rm --gpus all nvidia/cuda:12.9.1-base-ubuntu24.04 nvidia-smi + ``` +3. 
If step 2 fails, install the NVIDIA Container Toolkit: + ```bash + # Ubuntu/Debian + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit + sudo nvidia-ctk runtime configure --runtime=docker + sudo systemctl restart docker + ``` + +**Verify:** `docker compose logs inference | grep "offloaded"` should show `offloaded N/N layers to GPU`. + +### "CUDA error: no kernel image is available for execution on the device" + +**Symptom:** Server crashes immediately after loading model. + +**Cause:** The compiled CUDA kernels don't match your GPU architecture. Foundry images are built for sm_89 (Ada/RTX 40xx) and sm_120a (Blackwell/RTX 50xx). + +**Fix:** If you have an older GPU (e.g. Ampere/RTX 30xx), rebuild the image with your architecture: +```bash +# In the Dockerfile, change CMAKE_CUDA_ARCHITECTURES: +-DCMAKE_CUDA_ARCHITECTURES="86;89;120a" # Add 86 for Ampere +``` + +**Verify:** `docker compose logs inference | grep "CUDA"` should show successful backend loading. + +--- + +## Out of Memory + +### Container exits with "OOMKilled" + +**Symptom:** Container disappears. `docker inspect` shows OOMKilled. + +**Cause:** VRAM or system RAM exhausted. The RTX 5090 profile uses 1M total context (262K/slot x 4), which requires ~29.5 GB VRAM for Qwen3.5-9B. + +**Diagnosis:** +```bash +docker inspect --format='{{.State.OOMKilled}}' foundry-inference-1 +# Returns: true if OOM killed +``` + +**Fix:** +1. Reduce context length: + ```bash + FOUNDRY_CTX_LENGTH=65536 docker compose up + ``` +2. 
Reduce parallel slots: + ```bash + FOUNDRY_EXTRA_ARGS="--parallel 1" docker compose up + ``` +3. Use more aggressive KV cache quantization (add to `FOUNDRY_EXTRA_ARGS`): + ```bash + FOUNDRY_EXTRA_ARGS="-ctk q4_0 -ctv q4_0" docker compose up + ``` + +**Verify:** `nvidia-smi` should show VRAM usage within your GPU's capacity. + +### "CUDA out of memory" in logs + +**Symptom:** Server starts but crashes when processing the first request. + +**Cause:** KV cache allocation exceeds available VRAM. This happens when context is too large for your GPU. + +**Fix:** Same as OOMKilled above. Start with a small context and increase: +```bash +FOUNDRY_CTX_LENGTH=16384 docker compose up +``` + +--- + +## Startup & Model Loading + +### "Download failed: xxx.gguf not found after download" + +**Symptom:** First startup fails during model download. + +**Cause:** Hugging Face API rate limiting, network issues, or missing auth token for gated models. + +**Fix:** +1. Set a Hugging Face token (free, read-only access): + ```bash + echo "HF_TOKEN=hf_your_token_here" > .env + docker compose up + ``` +2. Or download manually and mount: + ```bash + pip install huggingface-hub + huggingface-cli download unsloth/Qwen3.5-9B-GGUF Qwen3.5-9B-UD-Q4_K_XL.gguf --local-dir ~/.cache/foundry + docker compose up + ``` + +**Verify:** `ls -lh ~/.cache/foundry/*.gguf` should show the model file. + +### "Permission denied: Cannot access /models" + +**Symptom:** Container starts but cannot read the model volume. + +**Cause:** Docker volume permissions don't match the container user. + +**Fix:** +```bash +sudo chown -R $(id -u):$(id -g) ~/.cache/foundry +chmod 755 ~/.cache/foundry +``` + +**Verify:** `docker compose logs inference | grep "Model found"` confirms the model is accessible. + +### Model loading takes 30+ seconds + +**Symptom:** Long delay between container start and first `/health` OK. + +**Cause:** First-time model loading, GPU not in persistence mode, or slow storage. + +**Fix:** +1. 
Enable GPU persistence mode (avoids ~100-500ms cold start per request): + ```bash + sudo nvidia-smi -pm 1 + ``` +2. Run host tuning for optimized I/O: + ```bash + sudo ./scripts/host-setup.sh + ``` +3. Subsequent starts are fast since the model is cached in `~/.cache/foundry`. + +**Verify:** `docker compose logs inference | grep "Model found"` shows the cached model with its size. + +--- + +## API & Connectivity + +### Connection refused on port 8080 + +**Symptom:** `curl: (7) Failed to connect to localhost port 8080: Connection refused` + +**Cause:** Container is still loading the model, or another service is using port 8080. + +**Fix:** +1. Check if the model is still loading: + ```bash + docker compose logs inference --tail 5 + # Look for "server is listening on" message + ``` +2. Check for port conflicts: + ```bash + ss -tlnp src :8080 + ``` +3. Use a different port: + ```bash + FOUNDRY_PORT=8090 docker compose up + ``` + +**Verify:** `curl http://localhost:8080/health` returns `{"status":"ok"}`. + +### HTTP 503 / All slots busy + +**Symptom:** API returns 503 when all parallel slots are occupied. + +**Cause:** More concurrent requests than available slots. Default: 4 slots (RTX 5090) or 2 slots (default profile). + +**Fix:** +1. Check slot usage: + ```bash + curl -s http://localhost:8080/metrics | grep "llama_server_requests" + ``` +2. Increase parallel slots (requires more VRAM): + ```bash + FOUNDRY_EXTRA_ARGS="--parallel 8" docker compose up + ``` +3. Or queue requests client-side with retry logic. + +--- + +## Performance + +### Throughput below documented benchmarks + +**Symptom:** Decode speed is 50%+ lower than documented (e.g. 90 tok/s instead of 177 tok/s for Qwen3.5-9B). + +**Cause:** Host kernel not tuned. The documented benchmarks assume `host-setup.sh` has been run. 
+ +**Diagnosis:** +```bash +# Check CPU governor +cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor +# Should be: performance + +# Check NUMA balancing +sysctl kernel.numa_balancing +# Should be: 0 + +# Check GPU persistence mode +nvidia-smi --query-gpu=persistence_mode --format=csv,noheader +# Should be: Enabled + +# Check BBR congestion control (for streaming latency) +sysctl net.ipv4.tcp_congestion_control +# Should be: bbr +``` + +**Fix:** Run the host tuning script: +```bash +sudo ./scripts/host-setup.sh +``` + +This sets: CPU governor to performance, disables NUMA balancing, enables BBR TCP, tunes NVMe I/O, enables GPU persistence, allocates hugepages, and enables busy polling for reduced NIC latency. + +**Verify:** Re-run your benchmark. Single-stream decode should be within 10% of documented speeds. + +### Latency spikes after 10-15 minutes (thermal throttling) + +**Symptom:** Steady performance degrades over time. GPU temp >85C. + +**Diagnosis:** +```bash +nvidia-smi -q -d PERFORMANCE | grep -A5 "Clocks Throttle Reasons" +# Look for "SW Thermal Slowdown: Active" +``` + +**Fix:** +1. Reduce concurrent load (fewer parallel slots = less heat): + ```bash + FOUNDRY_EXTRA_ARGS="--parallel 2" docker compose up + ``` +2. Check `nvidia-smi dmon -s p` for real-time power/temp monitoring. + +**Verify:** `nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader` should stay below 83C under load. + +--- + +## Docker & Container Issues + +### Rootless Docker: sysctls and ulimits fail + +**Symptom:** Container fails to start with `sysctl not allowed` or `permission denied setting ulimit`. + +**Cause:** Rootless Docker cannot set privileged sysctls (`tcp_congestion_control`, `busy_read`, `busy_poll`) or unlimited memlock. + +**Fix:** These settings are intentionally omitted from `docker-compose.yml` for rootless compatibility. 
Apply them at the host level instead: +```bash +sudo ./scripts/host-setup.sh +``` + +This is more effective anyway -- container-level network sysctls share the host's network stack. + +### eBPF exporter won't start + +**Symptom:** `ebpf-exporter` container exits immediately. + +**Cause:** Requires `privileged: true` and `pid: host` for kernel tracepoint access. Incompatible with rootless Docker, SELinux enforcing, and some cloud providers. + +**Fix:** The eBPF exporter is optional (part of the `monitoring` profile). The other monitoring services (Prometheus, Grafana, GPU exporter, node exporter) work without it. + +To run without eBPF: +```bash +# The monitoring profile still works -- eBPF exporter will fail but others start fine +docker compose --profile monitoring up -d +``` + +**Verify:** `docker compose --profile monitoring ps` -- all services should be "Up" except `ebpf-exporter`. + +--- + +## AI Agent Integration + +### OpenCode: "text part msg_... not found" + +**Symptom:** OpenCode crashes immediately with `text part msg_XXXX not found` when using `@ai-sdk/openai`. + +**Cause:** The `@ai-sdk/openai` package includes `extractReasoningMiddleware` that crashes when it encounters `<think>` tokens in the response content. Qwen3.5 models always generate `<think></think>` blocks even with thinking disabled ([vercel/ai #12054](https://github.com/vercel/ai/issues/12054)). + +**Fix:** Use `@ai-sdk/openai-compatible` instead of `@ai-sdk/openai` in your `opencode.json`: +```json +{ + "provider": { + "foundry": { + "npm": "@ai-sdk/openai-compatible", + "options": { + "baseURL": "http://localhost:8080/v1", + "apiKey": "sk-local" + } + } + } +} +``` + +Also ensure the server uses `--reasoning-format none` (already set in Foundry's Qwen3.5-9B profiles). + +### OpenCode: Tool calls appear as raw XML + +**Symptom:** Model outputs `<tool_call>` as text instead of executing tools. + +**Cause:** The server is returning `reasoning_content` in the API response, which confuses the AI SDK's tool call parser. 
+ +**Fix:** Ensure `--reasoning-format none` is in the server's `PROFILE_EXTRA_ARGS`. This is the default in Foundry's Qwen3.5-9B profiles. If you're using a custom profile or `FOUNDRY_EXTRA_ARGS`, add it: +```bash +FOUNDRY_EXTRA_ARGS="--reasoning-format none" docker compose up +``` + +**Verify:** +```bash +curl -s http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"Qwen3.5-9B-UD-Q4_K_XL.gguf","messages":[{"role":"user","content":"hello"}],"max_tokens":64}' \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print('reasoning_content' in d['choices'][0]['message'])" +# Should print: False +``` + +### Client requires API key + +**Symptom:** Client errors with "No API key provided" even though Foundry doesn't require one. + +**Fix:** Use any non-empty string as the API key: +```bash +export OPENAI_API_KEY=sk-local +``` + +--- + +## Monitoring + +### Grafana dashboards show no data + +**Symptom:** Dashboards load but all panels are empty. + +**Cause:** Prometheus hasn't scraped targets yet, or targets are unreachable. + +**Diagnosis:** +```bash +# Check Prometheus targets +curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep -E '"health"|"lastError"' +``` + +**Fix:** +1. Wait 30-60 seconds after starting services (Prometheus scrapes every 15s). +2. Verify the inference metrics endpoint works: + ```bash + curl -s http://localhost:8080/metrics | head -5 + ``` +3. Check that `monitoring/prometheus/prometheus.yml` has correct scrape targets. + +**Verify:** Prometheus targets page at `http://localhost:9090/targets` should show all targets as "UP". 
+ +--- + +## Collecting Information for Bug Reports + +When filing an issue, include the output of these commands: + +```bash +# System info +nvidia-smi +docker version +docker compose version +uname -a + +# Container state +docker compose --profile monitoring ps +docker compose logs inference --tail 100 + +# Server health +curl -s http://localhost:8080/health +curl -s http://localhost:8080/v1/models | python3 -m json.tool + +# Metrics snapshot +curl -s http://localhost:8080/metrics +``` diff --git a/docker-compose.yml b/docker-compose.yml index e60c6de..8a6d1b4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -60,22 +60,20 @@ services: count: 1 capabilities: [gpu] shm_size: '2gb' + # NOTE: Network tuning (BBR, busy_poll) and memory locking are applied at the + # host level by scripts/host-setup.sh. Run it once before deploying: + # sudo ./scripts/host-setup.sh + # Container-level sysctls for these are omitted because they fail on rootless + # Docker and are less effective than host-level settings. 
sysctls: - net.core.somaxconn=4096 - net.ipv4.tcp_keepalive_time=60 - - net.ipv4.tcp_congestion_control=bbr - - net.core.busy_read=50 - - net.core.busy_poll=50 restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8080/health"] interval: 10s timeout: 5s retries: 5 - ulimits: - memlock: - soft: -1 - hard: -1 networks: - default - monitoring @@ -88,8 +86,6 @@ services: image: prom/prometheus:latest profiles: - monitoring - volumes: - - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro extra_hosts: - "host.docker.internal:host-gateway" ports: diff --git a/models/qwen3.5-9b/entrypoint.sh b/models/qwen3.5-9b/entrypoint.sh index 745915b..413db4a 100755 --- a/models/qwen3.5-9b/entrypoint.sh +++ b/models/qwen3.5-9b/entrypoint.sh @@ -102,6 +102,68 @@ load_profile() { source "$profile_file" } +# ============================================================================== +# Pre-flight Host Check +# ============================================================================== +# Checks host kernel parameters visible via /proc/sys/ and warns about +# suboptimal settings that affect inference performance. Does NOT block startup. +# +# These parameters are set by: sudo ./scripts/host-setup.sh + +preflight_check() { + local warnings=0 + + # Helper: check a sysctl value and warn if it doesn't match + check_sysctl() { + local path="$1" expected="$2" label="$3" fix="$4" + local actual + if [ -f "$path" ]; then + actual=$(cat "$path" 2>/dev/null) + if [ "$actual" != "$expected" ]; then + warn " $label = $actual (expected: $expected) -- $fix" + warnings=$((warnings + 1)) + fi + fi + } + + log "Checking host kernel parameters..." 
+ + # TCP congestion control (BBR reduces streaming latency) + check_sysctl /proc/sys/net/ipv4/tcp_congestion_control "bbr" \ + "tcp_congestion_control" "BBR reduces token streaming latency" + + # NUMA balancing (causes random latency spikes during decode) + check_sysctl /proc/sys/kernel/numa_balancing "0" \ + "numa_balancing" "page migration causes decode latency jitter" + + # Swappiness (model weights should stay in RAM) + local swappiness + swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null) + if [ -n "$swappiness" ] && [ "$swappiness" -gt 10 ]; then + warn " vm.swappiness = $swappiness (expected: 0-10) -- high swappiness may evict model weights" + warnings=$((warnings + 1)) + fi + + # GPU persistence mode + if command -v nvidia-smi &> /dev/null; then + local pm + pm=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader,nounits 2>/dev/null | head -1 | xargs) + if [ "$pm" = "Disabled" ]; then + warn " GPU persistence mode = Disabled -- adds 100-500ms cold start latency" + warnings=$((warnings + 1)) + fi + fi + + if [ "$warnings" -gt 0 ]; then + warn "" + warn " $warnings performance issue(s) detected. Run on the host:" + warn " sudo ./scripts/host-setup.sh" + warn "" + else + log "Host kernel parameters: OK" + fi +} + # ============================================================================== # Model Download # ============================================================================== @@ -294,10 +356,13 @@ main() { # 2. Load profile load_profile "$profile" - # 3. Download model if needed + # 3. Pre-flight host check (warns about suboptimal kernel params) + preflight_check + + # 4. Download model if needed download_model - # 4. Build launch command (sets FOUNDRY_CMD array directly, no subshell) + # 5. Build launch command (sets FOUNDRY_CMD array directly, no subshell) build_command echo "" @@ -308,7 +373,7 @@ main() { echo -e "${GREEN} http://localhost:${FOUNDRY_PORT:-8080}/v1/chat/completions${NC}" echo "" - # 5. 
Launch (exec replaces shell process for proper signal handling) + # 6. Launch (exec replaces shell process for proper signal handling) # Use the array form to avoid word-splitting issues exec "${FOUNDRY_CMD[@]}" } diff --git a/models/qwen3.5-9b/profiles/default.sh b/models/qwen3.5-9b/profiles/default.sh index a20c158..2b09587 100644 --- a/models/qwen3.5-9b/profiles/default.sh +++ b/models/qwen3.5-9b/profiles/default.sh @@ -22,4 +22,4 @@ PROFILE_CPU_STRICT=0 PROFILE_CACHE_REUSE=0 # Disabled: hybrid recurrent arch re-processes anyway PROFILE_NO_WEBUI="false" # Keep web UI for exploration PROFILE_METRICS="false" -PROFILE_EXTRA_ARGS="--swa-full --cache-ram 0" +PROFILE_EXTRA_ARGS="--swa-full --cache-ram 0 --reasoning-format none" diff --git a/models/qwen3.5-9b/profiles/rtx5090.sh b/models/qwen3.5-9b/profiles/rtx5090.sh index ce3f3bf..1d2fd07 100644 --- a/models/qwen3.5-9b/profiles/rtx5090.sh +++ b/models/qwen3.5-9b/profiles/rtx5090.sh @@ -51,4 +51,6 @@ PROFILE_METRICS="true" # Prometheus-compatible /metrics endpoint # --mlock: pin model in RAM; -b/-ub 4096: large batch for fast prompt encode # --swa-full: full SWA cache for hybrid attention models (DeltaNet + attention) # --cache-ram 0: disable prompt cache (hybrid recurrent arch forces re-processing) -PROFILE_EXTRA_ARGS="--mlock -b 4096 -ub 4096 --swa-full --cache-ram 0" +# --reasoning-format none: keep <think> tags as plain text in content, no reasoning_content field +# (prevents AI SDK extractReasoningMiddleware crash on empty think blocks) +PROFILE_EXTRA_ARGS="--mlock -b 4096 -ub 4096 --swa-full --cache-ram 0 --reasoning-format none" diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 745915b..413db4a 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -102,6 +102,68 @@ load_profile() { source "$profile_file" } +# ============================================================================== +# Pre-flight Host Check +# ============================================================================== 
+# Checks host kernel parameters visible via /proc/sys/ and warns about +# suboptimal settings that affect inference performance. Does NOT block startup. +# +# These parameters are set by: sudo ./scripts/host-setup.sh + +preflight_check() { + local warnings=0 + + # Helper: check a sysctl value and warn if it doesn't match + check_sysctl() { + local path="$1" expected="$2" label="$3" fix="$4" + local actual + if [ -f "$path" ]; then + actual=$(cat "$path" 2>/dev/null) + if [ "$actual" != "$expected" ]; then + warn " $label = $actual (expected: $expected) -- $fix" + warnings=$((warnings + 1)) + fi + fi + } + + log "Checking host kernel parameters..." + + # TCP congestion control (BBR reduces streaming latency) + check_sysctl /proc/sys/net/ipv4/tcp_congestion_control "bbr" \ + "tcp_congestion_control" "BBR reduces token streaming latency" + + # NUMA balancing (causes random latency spikes during decode) + check_sysctl /proc/sys/kernel/numa_balancing "0" \ + "numa_balancing" "page migration causes decode latency jitter" + + # Swappiness (model weights should stay in RAM) + local swappiness + swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null) + if [ -n "$swappiness" ] && [ "$swappiness" -gt 10 ]; then + warn " vm.swappiness = $swappiness (expected: 0-10) -- high swappiness may evict model weights" + warnings=$((warnings + 1)) + fi + + # GPU persistence mode + if command -v nvidia-smi &> /dev/null; then + local pm + pm=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader,nounits 2>/dev/null | head -1 | xargs) + if [ "$pm" = "Disabled" ]; then + warn " GPU persistence mode = Disabled -- adds 100-500ms cold start latency" + warnings=$((warnings + 1)) + fi + fi + + if [ "$warnings" -gt 0 ]; then + warn "" + warn " $warnings performance issue(s) detected. 
Run on the host:" + warn " sudo ./scripts/host-setup.sh" + warn "" + else + log "Host kernel parameters: OK" + fi +} + # ============================================================================== # Model Download # ============================================================================== @@ -294,10 +356,13 @@ main() { # 2. Load profile load_profile "$profile" - # 3. Download model if needed + # 3. Pre-flight host check (warns about suboptimal kernel params) + preflight_check + + # 4. Download model if needed download_model - # 4. Build launch command (sets FOUNDRY_CMD array directly, no subshell) + # 5. Build launch command (sets FOUNDRY_CMD array directly, no subshell) build_command echo "" @@ -308,7 +373,7 @@ main() { echo -e "${GREEN} http://localhost:${FOUNDRY_PORT:-8080}/v1/chat/completions${NC}" echo "" - # 5. Launch (exec replaces shell process for proper signal handling) + # 6. Launch (exec replaces shell process for proper signal handling) # Use the array form to avoid word-splitting issues exec "${FOUNDRY_CMD[@]}" }