From 4933f03938cac36181c3e4c3854c367d69d593f4 Mon Sep 17 00:00:00 2001 From: Foundry Bot Date: Mon, 2 Mar 2026 14:29:25 +0000 Subject: [PATCH] feat: add Qwen3.5-9B as new default model, replacing Qwen3.5-35B-A3B Add Qwen3.5-9B (unsloth UD-Q4_K_XL, 5.66 GB) as the new default model. Newer Qwen3.5 generation that outperforms 35B-A3B on every benchmark while using 1/4 the VRAM. RTX 5090 benchmarks (4 slots, all on GPU): Single-stream: 177 tok/s (compute-bound, 94% SM utilization) 4-concurrent: 423 tok/s aggregate, 106 tok/s per-slot Context: 262K per slot (1M total) in only 29.5 GB VRAM Prompt: 1,688 tok/s Key advantages over 35B-A3B: - 5.5x more context per slot (262K vs 48K) - 32% more aggregate throughput (423 vs 320 tok/s) - Runs on 8 GB GPUs (vs 16 GB minimum) - Massively better benchmarks: +37 TAU2-Bench, +20 HMMT, +8 GPQA Multi-instance analysis (eBPF telemetry): dense 9B is compute-bound at 94% SM utilization. Internal --parallel 4 batching gives 2.6x more throughput than running separate instances. Qwen3.5-35B-A3B is NOT removed -- it stays in the repo and CI for users who want it. Only the default is changed. 
Files added: models/qwen3.5-9b/{Dockerfile,entrypoint.sh,profiles/rtx5090.sh,profiles/default.sh} Files updated: README.md, AGENTS.md: new default model, all benchmark tables docker-compose.yml, Makefile: default model changed to qwen3.5-9b build.yml: qwen3.5-9b added to CI matrix (4 models now) entrypoint.sh: generic download size message (synced to all models) --- .github/workflows/build.yml | 2 +- AGENTS.md | 73 +++--- Makefile | 4 +- README.md | 83 +++--- docker-compose.yml | 4 +- models/hermes-4.3-36b/entrypoint.sh | 2 +- models/qwen3-coder-30b-a3b/entrypoint.sh | 2 +- models/qwen3.5-35b-a3b/entrypoint.sh | 2 +- models/qwen3.5-9b/Dockerfile | 93 +++++++ models/qwen3.5-9b/entrypoint.sh | 316 +++++++++++++++++++++++ models/qwen3.5-9b/profiles/default.sh | 25 ++ models/qwen3.5-9b/profiles/rtx5090.sh | 54 ++++ scripts/entrypoint.sh | 2 +- 13 files changed, 578 insertions(+), 84 deletions(-) create mode 100644 models/qwen3.5-9b/Dockerfile create mode 100755 models/qwen3.5-9b/entrypoint.sh create mode 100644 models/qwen3.5-9b/profiles/default.sh create mode 100644 models/qwen3.5-9b/profiles/rtx5090.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc91fe4..5de0457 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: strategy: matrix: - model: [qwen3.5-35b-a3b, qwen3-coder-30b-a3b, hermes-4.3-36b] + model: [qwen3.5-9b, qwen3.5-35b-a3b, qwen3-coder-30b-a3b, hermes-4.3-36b] steps: - name: Checkout diff --git a/AGENTS.md b/AGENTS.md index 720efdf..548e250 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -49,8 +49,8 @@ No API key is required by default. 
If your client demands one, any non-empty str "url": "http://localhost:8080/v1", "models": { "qwen": { - "id": "qwen3.5-35b-a3b", - "name": "Qwen 3.5 35B A3B" + "id": "qwen3.5-9b", + "name": "Qwen 3.5 9B" }, "qwen-coder": { "id": "qwen3-coder-30b-a3b", @@ -69,7 +69,7 @@ Settings > Models > OpenAI API Base: ``` Base URL: http://localhost:8080/v1 API Key: sk-local -Model: qwen3.5-35b-a3b +Model: qwen3.5-9b ``` Cursor uses streaming by default. Foundry supports SSE streaming natively. With multiple parallel slots, you can run Cursor's background indexing and active chat simultaneously without blocking. @@ -83,7 +83,7 @@ Cursor uses streaming by default. Foundry supports SSE streaming natively. With { "title": "Foundry Qwen", "provider": "openai", - "model": "qwen3.5-35b-a3b", + "model": "qwen3.5-9b", "apiBase": "http://localhost:8080/v1", "apiKey": "sk-local" } @@ -96,7 +96,7 @@ Cursor uses streaming by default. Foundry supports SSE streaming natively. With ```bash aider --openai-api-base http://localhost:8080/v1 \ --openai-api-key sk-local \ - --model openai/qwen3.5-35b-a3b + --model openai/qwen3.5-9b ``` Or set environment variables: @@ -104,7 +104,7 @@ Or set environment variables: ```bash export OPENAI_API_BASE=http://localhost:8080/v1 export OPENAI_API_KEY=sk-local -aider --model openai/qwen3.5-35b-a3b +aider --model openai/qwen3.5-9b ``` ### Cline (VS Code) @@ -114,7 +114,7 @@ Settings > Cline > API Provider: OpenAI Compatible ``` Base URL: http://localhost:8080/v1 API Key: sk-local -Model ID: qwen3.5-35b-a3b +Model ID: qwen3.5-9b ``` ## Multi-Agent Frameworks @@ -127,7 +127,7 @@ Foundry's parallel inference slots make it particularly suited for multi-agent w import os os.environ["OPENAI_API_BASE"] = "http://localhost:8080/v1" os.environ["OPENAI_API_KEY"] = "sk-local" -os.environ["OPENAI_MODEL_NAME"] = "qwen3.5-35b-a3b" +os.environ["OPENAI_MODEL_NAME"] = "qwen3.5-9b" from crewai import Agent, Task, Crew @@ -175,7 +175,7 @@ from autogen import AssistantAgent, 
UserProxyAgent config_list = [ { - "model": "qwen3.5-35b-a3b", + "model": "qwen3.5-9b", "base_url": "http://localhost:8080/v1", "api_key": "sk-local", } @@ -206,7 +206,7 @@ from langchain_openai import ChatOpenAI llm = ChatOpenAI( base_url="http://localhost:8080/v1", api_key="sk-local", - model="qwen3.5-35b-a3b", + model="qwen3.5-9b", streaming=True, ) @@ -220,7 +220,7 @@ print(response.content) from smolagents import ToolCallingAgent, OpenAIServerModel model = OpenAIServerModel( - model_id="qwen3.5-35b-a3b", + model_id="qwen3.5-9b", api_base="http://localhost:8080/v1", api_key="sk-local", ) @@ -259,7 +259,7 @@ Settings > API > Chat Completion (OpenAI): ``` API URL: http://localhost:8080 API Key: sk-local -Model: qwen3.5-35b-a3b +Model: qwen3.5-9b ``` ## Direct API Usage @@ -275,7 +275,7 @@ client = OpenAI( ) response = client.chat.completions.create( - model="qwen3.5-35b-a3b", + model="qwen3.5-9b", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}, @@ -291,7 +291,7 @@ print(response.choices[0].message.content) curl http://localhost:8080/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "qwen3.5-35b-a3b", + "model": "qwen3.5-9b", "messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 256 }' @@ -308,7 +308,7 @@ const client = new OpenAI({ }); const response = await client.chat.completions.create({ - model: "qwen3.5-35b-a3b", + model: "qwen3.5-9b", messages: [{ role: "user", content: "Hello!" 
}], }); @@ -321,12 +321,12 @@ console.log(response.choices[0].message.content); |----------|-------------------|-----| | **Coding agents** (OpenCode, Cursor, Aider) | Qwen3-Coder-30B-A3B | Fastest decode (275 tok/s), purpose-built for code, tool calling support | | **Multi-agent orchestration** (CrewAI, AutoGen) | Qwen3-Coder-30B-A3B | 3-concurrent at 497 tok/s aggregate, best MoE batching efficiency | -| **General coding + long context** | Qwen3.5-35B-A3B | 192K effective context for large codebases, hybrid recurrent architecture | -| **Reasoning-heavy tasks** | Hermes-4.3-36B | Thinking mode with `` tags, stronger reasoning on hard problems | +| **General reasoning + long context** | Qwen3.5-9B | 262K context per slot (1M total), thinking mode, best benchmark quality across GPQA/HMMT/TAU2 | +| **Reasoning-heavy tasks** | Qwen3.5-9B | Thinking mode with `reasoning_content` field, 81.7 GPQA Diamond, 83.2 HMMT | | **Tool use / function calling** | Qwen3-Coder-30B-A3B or Hermes-4.3-36B | Both have strong tool calling; Coder is 4x faster, Hermes more reliable on complex schemas | | **Roleplay / creative writing** | Hermes-4.3-36B | NousResearch fine-tune optimized for personality and narrative | -| **Long document Q&A** | Qwen3.5-35B-A3B | 192K context window, recurrent layers handle long sequences efficiently | -| **16 GB VRAM GPUs** | Qwen3-Coder-30B-A3B | Smallest disk footprint (17.7 GB), MoE expert offloading works on 16 GB | +| **Long document Q&A** | Qwen3.5-9B | 262K context per slot, recurrent layers handle long sequences efficiently | +| **8 GB VRAM GPUs** | Qwen3.5-9B | Smallest disk footprint (5.66 GB), runs on 8 GB cards with reduced context | ## Performance Considerations @@ -337,14 +337,14 @@ Single-stream decode latency (time to generate one token): | Model | Latency per token | Tokens per second | |-------|-------------------|-------------------| | Qwen3-Coder-30B-A3B | ~3.6 ms | ~275 tok/s | -| Qwen3.5-35B-A3B | ~5.5 ms | ~181 tok/s | +| Qwen3.5-9B | 
~5.7 ms | ~177 tok/s | | Hermes-4.3-36B | ~15.5 ms | ~64 tok/s | -For interactive coding agents, Qwen3-Coder delivers the fastest typing experience. Qwen3.5 trades some speed for 192K effective context. For batch/background tasks where latency is less critical, Hermes' stronger reasoning may be worth the tradeoff. +For interactive coding agents, Qwen3-Coder delivers the fastest typing experience. Qwen3.5-9B trades some speed for full 262K context and superior reasoning quality. For batch/background tasks where latency is less critical, Hermes' roleplay and creative strengths may be worth the tradeoff. ### Prompt processing -Prompt processing (prefill) runs at ~1,163 tok/s for Qwen on RTX 5090. A 10K token prompt takes ~8.6 seconds to process. Keep system prompts concise to minimize time-to-first-token. +Prompt processing (prefill) runs at ~1,688 tok/s for Qwen3.5-9B on RTX 5090. A 10K token prompt takes ~5.9 seconds to process. Keep system prompts concise to minimize time-to-first-token. ### Concurrent agent scaling @@ -355,11 +355,10 @@ Qwen3-Coder-30B-A3B (fastest, 3 slots): 3 agents: 497 tok/s (~168 tok/s each, 61% per-agent) ``` -Qwen3.5-35B-A3B (4 slots): +Qwen3.5-9B (4 slots, dense): ``` -1 agent: 181 tok/s (100% per-agent speed) -2 agents: 234 tok/s (~117 tok/s each, 65% per-agent) -4 agents: 320 tok/s (~80 tok/s each, 44% per-agent) +1 agent: 177 tok/s (100% per-agent speed) +4 agents: 423 tok/s (~106 tok/s each, 60% per-agent) ``` If your workflow has more concurrent agents than slots, requests queue until a slot is free. Consider multi-GPU routing (below) for higher concurrency. @@ -370,8 +369,8 @@ VRAM scales with context usage. 
The default RTX 5090 profiles are tuned for maxi | Model | Default context | VRAM at idle | VRAM at full context | |-------|----------------|--------------|---------------------| +| Qwen3.5-9B | 1M (262K/slot) | 5.7 GB | ~29.5 GB | | Qwen3-Coder-30B-A3B | 192K | 25.0 GB | ~28.9 GB | -| Qwen3.5-35B-A3B | 192K | 25.3 GB | ~26.1 GB | | Hermes-4.3-36B | 32K | 24.5 GB | ~27.8 GB | To reduce VRAM usage, lower the context window: @@ -380,7 +379,7 @@ To reduce VRAM usage, lower the context window: docker run --gpus all -p 8080:8080 \ -v ~/.cache/foundry:/models \ -e FOUNDRY_CTX_LENGTH=32768 \ - ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest + ghcr.io/infernet-org/foundry/qwen3.5-9b:latest ``` ## Structured Output @@ -389,7 +388,7 @@ All three models support JSON mode for structured outputs: ```python response = client.chat.completions.create( - model="qwen3.5-35b-a3b", + model="qwen3.5-9b", messages=[{ "role": "user", "content": "List 3 programming languages with their year of creation. Respond in JSON." @@ -404,7 +403,7 @@ For grammar-constrained generation (guaranteed schema compliance), use the `gram curl http://localhost:8080/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "qwen3.5-35b-a3b", + "model": "qwen3.5-9b", "messages": [{"role": "user", "content": "Generate a person record"}], "response_format": { "type": "json_schema", @@ -430,7 +429,7 @@ Hermes-4.3-36B is specifically trained for tool calling with `` XML f ```python response = client.chat.completions.create( - model="qwen3.5-35b-a3b", + model="qwen3.5-9b", messages=[{"role": "user", "content": "What's the weather in Tokyo?"}], tools=[{ "type": "function", @@ -459,13 +458,13 @@ All models support Jinja chat templates for tool calling. The entrypoint enables ## Thinking / Reasoning Mode -Qwen3.5-35B-A3B supports a thinking mode where it shows its reasoning process in `` tags before answering. 
+Qwen3.5-9B supports a thinking mode where it shows its reasoning process in `<think>` tags before answering. The server returns thinking content in the `reasoning_content` field: ```python response = client.chat.completions.create( - model="qwen3.5-35b-a3b", + model="qwen3.5-9b", messages=[{"role": "user", "content": "What is 127 * 389?"}], max_tokens=512, ) @@ -484,7 +483,7 @@ All endpoints support Server-Sent Events (SSE) streaming for real-time token del ```python stream = client.chat.completions.create( - model="qwen3.5-35b-a3b", + model="qwen3.5-9b", messages=[{"role": "user", "content": "Write a poem about GPUs."}], stream=True, ) @@ -557,7 +556,7 @@ If you see `no devices with dedicated memory found` in the logs, the CUDA backen ### Connection refused 1. Container might still be loading the model. Check `docker logs ` for progress. -2. First run downloads the model (~20 GB) which can take several minutes. +2. First run downloads the model (6-22 GB depending on model) which can take several minutes. 3. Port conflict: use `-p 8081:8080` to map to a different host port. ### Out of VRAM @@ -568,10 +567,10 @@ Reduce context window or switch to a smaller quantization: docker run --gpus all -p 8080:8080 \ -v ~/.cache/foundry:/models \ -e FOUNDRY_CTX_LENGTH=16384 \ - ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest + ghcr.io/infernet-org/foundry/qwen3.5-9b:latest ``` -For GPUs with less than 24 GB VRAM, use Qwen (MoE) -- its expert offloading can spill inactive experts to CPU. +For GPUs with less than 16 GB VRAM, use Qwen3.5-9B (only 5.66 GB model weight). For 16+ GB, Qwen3-Coder-30B-A3B's MoE expert offloading can spill inactive experts to CPU. 
### Inconsistent response speeds diff --git a/Makefile b/Makefile index 6dc6406..e58e666 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ REGISTRY ?= ghcr.io/infernet-org/foundry # Default model (can be overridden: make run MODEL=hermes-4.3-36b) -MODEL ?= qwen3.5-35b-a3b +MODEL ?= qwen3.5-9b MODEL_TAG ?= $(REGISTRY)/$(MODEL) PORT ?= 8080 MODELS_DIR ?= $(HOME)/.cache/foundry @@ -12,7 +12,7 @@ MODELS_DIR ?= $(HOME)/.cache/foundry .PHONY: help build run run-profile test benchmark monitoring down push push-all clean clean-models download help: ## Show this help - @echo "Available models: qwen3.5-35b-a3b (default), qwen3-coder-30b-a3b, hermes-4.3-36b" + @echo "Available models: qwen3.5-9b (default), qwen3-coder-30b-a3b, hermes-4.3-36b, qwen3.5-35b-a3b" @echo "Usage: make run MODEL=qwen3-coder-30b-a3b" @echo "" @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \ diff --git a/README.md b/README.md index a22da10..212d89c 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ Foundry compiles [llama.cpp](https://github.com/ggml-org/llama.cpp) from source ```bash docker run --gpus all -p 8080:8080 \ -v ~/.cache/foundry:/models \ - ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest + ghcr.io/infernet-org/foundry/qwen3.5-9b:latest ``` -The first run downloads the model (~20 GB). Subsequent starts are instant. +The first run downloads the model (~6 GB). Subsequent starts are instant. Then use it like any OpenAI-compatible API: @@ -33,7 +33,7 @@ Then use it like any OpenAI-compatible API: curl http://localhost:8080/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "qwen3.5-35b-a3b", + "model": "qwen3.5-9b", "messages": [{"role": "user", "content": "Hello!"}] }' ``` @@ -42,36 +42,39 @@ Works with any OpenAI-compatible client: Cursor, Continue, OpenCode, Open WebUI, ## Models -### Qwen3.5-35B-A3B (MoE) +### Qwen3.5-9B (Dense) -Hybrid Gated DeltaNet + Mixture-of-Experts. 35B total parameters, only 3B active per token. 
+Hybrid Gated DeltaNet + Dense FFN. 9B total parameters, all active per token. Qwen3.5 generation with vision-language capability. -- 30 recurrent layers (Gated DeltaNet -- fixed-size state, no KV cache) -- 10 full attention layers (standard KV cache, GQA 8:1) -- 256 experts per MoE layer, top-8 + 1 shared active per token -- Quantization: UD-Q4_K_XL via [Unsloth](https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF) (Dynamic 2.0) -- Disk: ~20.6 GB | Min VRAM: 16 GB (with expert offloading) | Max context: 262K native +- 32 layers: 24 Gated DeltaNet (recurrent) + 8 full attention (GQA 16:4) +- All 9B parameters active per token (dense, compute-bound) +- Thinking mode by default (reasoning_content field) +- Quantization: UD-Q4_K_XL via [Unsloth](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF) (Dynamic 2.0) +- Disk: ~5.66 GB | Min VRAM: 8 GB | Max context: 262K native (1M with YaRN) | GPU | VRAM | Context | Decode | 4-concurrent | VRAM used | |-----|------|---------|--------|--------------|-----------| -| RTX 5090 | 32 GB | 192K | ~181 tok/s | ~320 tok/s | 26.1 GB | -| Other NVIDIA (16 GB+) | 16+ GB | 16K | varies | varies | varies | +| RTX 5090 | 32 GB | 262K/slot | ~177 tok/s | ~423 tok/s | 29.5 GB | +| Other NVIDIA (8 GB+) | 8+ GB | 32K/slot | varies | varies | varies |
RTX 5090 detailed benchmark ``` -SINGLE-STREAM DECODE: ~181 tok/s -4-CONCURRENT AGGREGATE: ~320 tok/s (+77% via MoE expert batching) -PROMPT PROCESSING: ~1,163 tok/s (internal metric) -GPU UTILIZATION: 92% -MEMORY BANDWIDTH: 49% (bottleneck: 878 / 1,792 GB/s) -POWER DRAW: 312W / 575W TDP -TEMPERATURE: 52C (under sustained load) -VRAM USAGE: 26.1 GB / 32.6 GB (6 GB headroom) +SINGLE-STREAM DECODE: ~177 tok/s (compute-bound, 94% SM utilization) +4-CONCURRENT AGGREGATE: ~423 tok/s +4-CONCURRENT PER-SLOT: ~106 tok/s each +PROMPT PROCESSING: ~1,688 tok/s +GPU UTILIZATION: 94% SM / 65% mem (single) | 100% SM / 63% mem (4-concurrent) +POWER DRAW: 312W single, 445W 4-concurrent +TEMPERATURE: 52-60C (under sustained load) +VRAM USAGE: 29.5 GB / 32.6 GB (3.1 GB headroom) +CONTEXT: 262K per slot (4 slots, 1M total) ``` -Benchmarked 2026-03-01 with native sm_120a (Blackwell) compilation and `BLACKWELL_NATIVE_FP4=1` enabled. +Benchmarked 2026-03-02 with native sm_120a (Blackwell) compilation and `BLACKWELL_NATIVE_FP4=1` enabled. + +**Why this replaces Qwen3.5-35B-A3B**: Newer Qwen3.5 generation model that outperforms the 35B-A3B on every benchmark -- agent tasks (+37 TAU2-Bench), math (+20 HMMT), reasoning (+8 GPQA), instruction following (+13 IFBench). At 5.66 GB it uses 1/4 the VRAM, enabling full 262K context per slot (vs 48K for the 35B). Internal `--parallel 4` batching provides 2.6x more throughput than running multiple instances (tested with eBPF telemetry: dense model is compute-bound at 94% SM utilization, not memory-bandwidth-bound).
### Hermes-4.3-36B (Dense) @@ -120,14 +123,14 @@ Benchmarked 2026-03-02 with native sm_120a (Blackwell) compilation and `BLACKWEL **Why 3 slots (not 4)?** With 3 slots, `--fit on` allocates 64K context per slot instead of 48K. Aggregate throughput is identical (497 vs 495 tok/s), but per-agent speed under load is 35% faster (168 vs 124 tok/s). The 4th slot rarely matters for a single-GPU workstation. Override with `FOUNDRY_EXTRA_ARGS="--parallel 4"` if needed. -**vs Qwen3.5-35B-A3B**: 52% faster single-stream, 55% faster aggregate. The standard MoE architecture (no DeltaNet recurrent layers) batches more efficiently on Blackwell. Trades the 192K effective context of Qwen3.5 for raw speed. +**vs Qwen3.5-9B**: 55% faster single-stream, 18% faster aggregate. The standard MoE architecture (no DeltaNet recurrent layers) batches more efficiently on Blackwell. Trades the 262K context of Qwen3.5 for raw speed. ## How It Works -Why llama.cpp and not SGLang or vLLM? For **consumer GPUs**, llama.cpp's MoE expert offloading (`--fit on`) is the only engine that can run a 35B-parameter MoE model on a single 16-24 GB card at full speed. SGLang and vLLM require the entire model to fit in VRAM. +Why llama.cpp and not SGLang or vLLM? For **consumer GPUs**, llama.cpp's MoE expert offloading (`--fit on`) is the only engine that can run a 30B-parameter MoE model on a single 16-24 GB card at full speed. SGLang and vLLM require the entire model to fit in VRAM. -Qwen3.5-35B-A3B keeps attention layers on GPU while spilling inactive experts to CPU, which is why a 35B MoE runs **faster** than a 27B dense model on the same hardware. +Qwen3-Coder-30B-A3B keeps attention layers on GPU while spilling inactive experts to CPU, which is why a 30B MoE runs **faster** than a 9B dense model on the same hardware (275 vs 177 tok/s). 
### GPU Auto-Detection @@ -146,7 +149,7 @@ Each profile tunes: context length, KV cache quantization, thread count, batch s docker run --gpus all -p 8080:8080 \ -v ~/.cache/foundry:/models \ -e FOUNDRY_PROFILE=rtx5090 \ - ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest + ghcr.io/infernet-org/foundry/qwen3.5-9b:latest ``` Available profiles (per model): `rtx5090`, `default` @@ -177,22 +180,22 @@ All settings can be overridden via environment variables: ## Multi-Agent Inference -The RTX 5090 profiles are configured with multiple concurrent inference slots: `--parallel 4` for Qwen3.5 and Hermes, `--parallel 3` for Qwen3-Coder. This makes Foundry well-suited for multi-agent workflows where several AI agents share a single GPU. +The RTX 5090 profiles are configured with multiple concurrent inference slots: `--parallel 4` for Qwen3.5-9B and Hermes, `--parallel 3` for Qwen3-Coder. This makes Foundry well-suited for multi-agent workflows where several AI agents share a single GPU. ### Why MoE batching works -Qwen3.5-35B-A3B uses a 256-expert MoE architecture with only 8 experts active per token. During single-stream decode, the GPU's tensor cores are largely idle -- the bottleneck is memory bandwidth, not compute. When multiple agents send concurrent requests, llama.cpp batches token generation across all active slots. Different tokens may route to different experts, and CUDA graphs capture the entire batched MoE operation, significantly improving GPU utilization. +Qwen3-Coder-30B-A3B uses a 128-expert MoE architecture with only 8 experts active per token. During single-stream decode, the GPU's tensor cores are largely idle -- the bottleneck is memory bandwidth, not compute. When multiple agents send concurrent requests, llama.cpp batches token generation across all active slots. Different tokens may route to different experts, and CUDA graphs capture the entire batched MoE operation, significantly improving GPU utilization. 
### Throughput scaling -Measured on RTX 5090 with Qwen models (MoE): +Measured on RTX 5090: -| Active agents | Qwen3.5-35B-A3B (4 slots) | Qwen3-Coder-30B-A3B (3 slots) | -|---------------|----------------------------|--------------------------------| -| 1 | 181 tok/s | 275 tok/s | -| 2 | 234 tok/s (117 each) | 405 tok/s (204 each) | +| Active agents | Qwen3.5-9B (4 slots, dense) | Qwen3-Coder-30B-A3B (3 slots, MoE) | +|---------------|------------------------------|--------------------------------------| +| 1 | 177 tok/s | 275 tok/s | +| 2 | — | 405 tok/s (204 each) | | 3 | — | 497 tok/s (168 each) | -| 4 | 320 tok/s (80 each) | — | +| 4 | 423 tok/s (106 each) | — | Single-agent speed is unaffected. Concurrent slots only activate when there are simultaneous requests. @@ -219,7 +222,7 @@ Any OpenAI-compatible agent framework works out of the box -- point it at `http: ### Docker Compose ```bash -# Default: Qwen3.5-35B-A3B +# Default: Qwen3.5-9B docker compose up # Choose a different model @@ -244,7 +247,7 @@ GF_ADMIN_PASSWORD=admin ### Build From Source ```bash -make build # Build the default model image (qwen3.5-35b-a3b) +make build # Build the default model image (qwen3.5-9b) make build MODEL=hermes-4.3-36b # Build a different model make build MODEL=qwen3-coder-30b-a3b # Build the coding-optimized model make run # Run with auto-detected GPU @@ -365,12 +368,16 @@ This reduced p99 latency jitter from ~5.8 tok/s spread to ~2.2 tok/s spread in o ``` foundry/ ├── models/ -│ ├── qwen3.5-35b-a3b/ +│ ├── qwen3.5-9b/ │ │ ├── Dockerfile # Multi-stage: compiles llama.cpp for sm_89 + sm_120a │ │ ├── entrypoint.sh # Copied from scripts/entrypoint.sh at build time │ │ └── profiles/ -│ │ ├── rtx5090.sh # 192K ctx, 4 slots, ~320 tok/s aggregate -│ │ └── default.sh # 16K ctx, conservative settings +│ │ ├── rtx5090.sh # 1M ctx, 4 slots, ~423 tok/s aggregate, 262K/slot +│ │ └── default.sh # 32K ctx, 8 GB minimum +│ ├── qwen3.5-35b-a3b/ # Legacy: still available, superseded by 
qwen3.5-9b +│ │ ├── Dockerfile +│ │ ├── entrypoint.sh +│ │ └── profiles/ │ ├── hermes-4.3-36b/ │ │ ├── Dockerfile # Multi-stage: compiles llama.cpp for sm_89 + sm_120a │ │ ├── entrypoint.sh # Copied from scripts/entrypoint.sh at build time diff --git a/docker-compose.yml b/docker-compose.yml index 69cc081..e60c6de 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -38,9 +38,9 @@ services: # Prometheus (Scrapes metrics) # ============================================================================ inference: - image: ghcr.io/infernet-org/foundry/${FOUNDRY_MODEL:-qwen3.5-35b-a3b}:latest + image: ghcr.io/infernet-org/foundry/${FOUNDRY_MODEL:-qwen3.5-9b}:latest build: - context: models/${FOUNDRY_MODEL:-qwen3.5-35b-a3b}/ + context: models/${FOUNDRY_MODEL:-qwen3.5-9b}/ ports: - "${FOUNDRY_PORT:-8080}:8080" volumes: diff --git a/models/hermes-4.3-36b/entrypoint.sh b/models/hermes-4.3-36b/entrypoint.sh index 9dff64d..745915b 100755 --- a/models/hermes-4.3-36b/entrypoint.sh +++ b/models/hermes-4.3-36b/entrypoint.sh @@ -118,7 +118,7 @@ download_model() { log "Model not found at ${gguf_path}" log "Downloading ${FOUNDRY_GGUF_FILE} from ${FOUNDRY_GGUF_REPO}..." - log "This is a one-time download (~20GB). Subsequent starts will be instant." + log "This is a one-time download. Subsequent starts will be instant." echo "" # Use python3 huggingface_hub to download (huggingface-cli may not be on PATH) diff --git a/models/qwen3-coder-30b-a3b/entrypoint.sh b/models/qwen3-coder-30b-a3b/entrypoint.sh index 9dff64d..745915b 100755 --- a/models/qwen3-coder-30b-a3b/entrypoint.sh +++ b/models/qwen3-coder-30b-a3b/entrypoint.sh @@ -118,7 +118,7 @@ download_model() { log "Model not found at ${gguf_path}" log "Downloading ${FOUNDRY_GGUF_FILE} from ${FOUNDRY_GGUF_REPO}..." - log "This is a one-time download (~20GB). Subsequent starts will be instant." + log "This is a one-time download. Subsequent starts will be instant." 
echo "" # Use python3 huggingface_hub to download (huggingface-cli may not be on PATH) diff --git a/models/qwen3.5-35b-a3b/entrypoint.sh b/models/qwen3.5-35b-a3b/entrypoint.sh index 9dff64d..745915b 100755 --- a/models/qwen3.5-35b-a3b/entrypoint.sh +++ b/models/qwen3.5-35b-a3b/entrypoint.sh @@ -118,7 +118,7 @@ download_model() { log "Model not found at ${gguf_path}" log "Downloading ${FOUNDRY_GGUF_FILE} from ${FOUNDRY_GGUF_REPO}..." - log "This is a one-time download (~20GB). Subsequent starts will be instant." + log "This is a one-time download. Subsequent starts will be instant." echo "" # Use python3 huggingface_hub to download (huggingface-cli may not be on PATH) diff --git a/models/qwen3.5-9b/Dockerfile b/models/qwen3.5-9b/Dockerfile new file mode 100644 index 0000000..f615ba8 --- /dev/null +++ b/models/qwen3.5-9b/Dockerfile @@ -0,0 +1,93 @@ +# ============================================================================== +# Foundry Model Image: Qwen3.5-9B +# ============================================================================== +# Multi-stage build for a minimal CUDA runtime. +# Compiles llama.cpp from source for sm_89 (Ada) and sm_120a (Blackwell), +# then copies only the binary and required libraries to a clean Ubuntu base. +# +# Weights are NOT baked in. They are downloaded on first run or mounted +# from the host at /models. +# ============================================================================== + +# ------------------------------------------------------------------------------ +# Stage 1: Builder +# ------------------------------------------------------------------------------ +FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder + +RUN apt-get update && apt-get install -y git cmake g++ curl +RUN git clone --depth 1 -b b8183 https://github.com/ggml-org/llama.cpp.git /llama.cpp +WORKDIR /llama.cpp + +# Compile explicitly for Ada (sm_89) and Blackwell (sm_120a). 
+# GGML_BACKEND_DL=ON builds CUDA as a runtime-loaded plugin (dlopen), which +# avoids the libcuda.so.1 transitive link error during Docker builds where +# no real GPU driver is present. This matches the official llama.cpp Dockerfile. +RUN cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_CUDA=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DCMAKE_CUDA_ARCHITECTURES="89;120a" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="-Wl,--allow-shlib-undefined" && \ + cmake --build build --config Release -j$(nproc) + +# ------------------------------------------------------------------------------ +# Stage 2: Minimal Runtime +# ------------------------------------------------------------------------------ +FROM ubuntu:24.04 + +# Install minimal runtime dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 \ + python3 python3-pip curl \ + && pip3 install --break-system-packages --no-cache-dir "huggingface-hub>=0.28,<1" "hf_transfer>=0.1.6" \ + && rm -rf /var/lib/apt/lists/* + +# The NVIDIA runtime needs these env vars to mount the CUDA drivers correctly +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility + +# Model metadata +ENV FOUNDRY_MODEL_NAME="Qwen3.5-9B" +ENV FOUNDRY_GGUF_REPO="unsloth/Qwen3.5-9B-GGUF" +ENV FOUNDRY_GGUF_FILE="Qwen3.5-9B-UD-Q4_K_XL.gguf" +ENV FOUNDRY_ARCH="dense" + +# Enable fast downloads +ENV HF_HUB_ENABLE_HF_TRANSFER="1" + +# Runtime defaults (can be overridden) +ENV FOUNDRY_PROFILE="auto" +ENV FOUNDRY_PORT="8080" +ENV FOUNDRY_CTX_LENGTH="" +ENV FOUNDRY_THREADS="" +ENV FOUNDRY_EXTRA_ARGS="" + +# Copy the compiled binary and all shared libraries from the build output. +# With GGML_BACKEND_DL=ON, backends (ggml-cuda, ggml-cpu-*) are .so modules +# loaded at runtime via dlopen. CMake places everything in build/bin/. 
+COPY --from=builder /llama.cpp/build/bin/ /app/
+
+# Cherry-pick only the CUDA runtime libs that libggml-cuda.so actually needs.
+# libcuda.so.1 is provided by the NVIDIA container runtime at launch.
+COPY --from=builder /usr/local/cuda/lib64/libcudart.so.12 /app/
+COPY --from=builder /usr/local/cuda/lib64/libcublas.so.12 /app/
+COPY --from=builder /usr/local/cuda/lib64/libcublasLt.so.12 /app/
+ENV LD_LIBRARY_PATH="/app"
+
+# Copy profiles and shared entrypoint
+COPY profiles/ /opt/foundry/profiles/
+COPY entrypoint.sh /opt/foundry/entrypoint.sh
+RUN chmod +x /opt/foundry/entrypoint.sh
+
+# Model storage
+RUN mkdir -p /models
+VOLUME /models
+
+EXPOSE 8080
+
+ENTRYPOINT ["/opt/foundry/entrypoint.sh"]
diff --git a/models/qwen3.5-9b/entrypoint.sh b/models/qwen3.5-9b/entrypoint.sh
new file mode 100755
index 0000000..745915b
--- /dev/null
+++ b/models/qwen3.5-9b/entrypoint.sh
@@ -0,0 +1,316 @@
+#!/bin/bash
+# ==============================================================================
+# Foundry Entrypoint (shared across all models)
+# ==============================================================================
+# 1. Detect GPU and load hardware profile
+# 2. Download model if not present
+# 3. Apply architecture-aware tuning (MoE vs Dense)
+# 4. Launch llama-server with tuned parameters
+#
+# Model identity is set via Dockerfile ENV vars:
+#   FOUNDRY_MODEL_NAME  -- display name (e.g. "Qwen3.5-35B-A3B")
+#   FOUNDRY_GGUF_REPO   -- HuggingFace repo (e.g. "unsloth/Qwen3.5-35B-A3B-GGUF")
+#   FOUNDRY_GGUF_FILE   -- GGUF filename
+#   FOUNDRY_ARCH        -- architecture type: "moe" or "dense"
+#
+# Architecture-specific flags are applied automatically based on FOUNDRY_ARCH:
+#
+#   MoE (e.g. Qwen3.5-35B-A3B):
+#     --fit on       Expert offloading: spill inactive experts to CPU
+#
+#   Dense (e.g. Hermes-4.3-36B):
+#     (no --fit)     No experts to offload
+#
+# Model-specific quirks (e.g. --swa-full for hybrid attention, --cache-ram 0
+# for recurrent architectures) belong in PROFILE_EXTRA_ARGS, NOT in the arch
+# tier -- they are not universal to the architecture class.
+#
+# Hardware-specific tuning (context, threads, KV quant, slots) is set by
+# per-GPU profiles in /opt/foundry/profiles/*.sh.
+# ==============================================================================
+
+set -euo pipefail
+
+FOUNDRY_DIR="/opt/foundry"
+PROFILES_DIR="${FOUNDRY_DIR}/profiles"
+MODELS_DIR="/models"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m' # No Color
+
+log() { echo -e "${CYAN}[foundry]${NC} $*"; }
+warn() { echo -e "${YELLOW}[foundry]${NC} $*" >&2; }
+err() { echo -e "${RED}[foundry]${NC} $*" >&2; }
+
+# ==============================================================================
+# GPU Detection
+# ==============================================================================
+
+detect_gpu() {
+    local gpu_name
+    if ! command -v nvidia-smi &> /dev/null; then
+        warn "nvidia-smi not found, using default profile"
+        echo "default"
+        return
+    fi
+
+    gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null | head -1 | xargs)
+
+    if [ -z "$gpu_name" ]; then
+        warn "Could not detect GPU, using default profile"
+        echo "default"
+        return
+    fi
+
+    # Log to stderr so it doesn't interfere with the captured profile name
+    log "Detected GPU: ${gpu_name}" >&2
+
+    # Map GPU name to profile
+    case "$gpu_name" in
+        *"5090"*) echo "rtx5090" ;;
+        *)
+            warn "Unknown or unsupported GPU '${gpu_name}', using default profile"
+            echo "default"
+            ;;
+    esac
+}
+
+# ==============================================================================
+# Profile Loading
+# ==============================================================================
+
+load_profile() {
+    local profile_name="$1"
+    local profile_file="${PROFILES_DIR}/${profile_name}.sh"
+
+    if [ ! -f "$profile_file" ]; then
+        warn "Profile '${profile_name}' not found, falling back to default"
+        profile_file="${PROFILES_DIR}/default.sh"
+    fi
+
+    if [ ! -f "$profile_file" ]; then
+        err "No default profile found at ${profile_file}"
+        exit 1
+    fi
+
+    log "Loading profile: ${profile_name}"
+    # shellcheck source=profiles/default.sh
+    source "$profile_file"
+}
+
+# ==============================================================================
+# Model Download
+# ==============================================================================
+
+download_model() {
+    local gguf_path="${MODELS_DIR}/${FOUNDRY_GGUF_FILE}"
+
+    if [ -f "$gguf_path" ]; then
+        local size
+        size=$(du -h "$gguf_path" | cut -f1)
+        log "Model found: ${gguf_path} (${size})"
+        return 0
+    fi
+
+    log "Model not found at ${gguf_path}"
+    log "Downloading ${FOUNDRY_GGUF_FILE} from ${FOUNDRY_GGUF_REPO}..."
+    log "This is a one-time download. Subsequent starts will be instant."
+    echo ""
+
+    # Use python3 huggingface_hub to download (huggingface-cli may not be on PATH)
+    # Variables are passed via environment to avoid shell injection in inline Python
+    if python3 -c "import huggingface_hub" 2>/dev/null; then
+        FOUNDRY_GGUF_REPO="${FOUNDRY_GGUF_REPO}" \
+        FOUNDRY_GGUF_FILE="${FOUNDRY_GGUF_FILE}" \
+        FOUNDRY_MODELS_DIR="${MODELS_DIR}" \
+        python3 -c "
+import os
+from huggingface_hub import hf_hub_download
+token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
+hf_hub_download(
+    repo_id=os.environ['FOUNDRY_GGUF_REPO'],
+    filename=os.environ['FOUNDRY_GGUF_FILE'],
+    local_dir=os.environ['FOUNDRY_MODELS_DIR'],
+    token=token
+)
+"
+    else
+        err "huggingface-hub not found. Please mount the GGUF at ${gguf_path}"
+        err "Or install huggingface-hub: pip install huggingface-hub"
+        exit 1
+    fi
+
+    if [ ! -f "$gguf_path" ]; then
+        err "Download failed: ${gguf_path} not found after download"
+        exit 1
+    fi
+
+    local size
+    size=$(du -h "$gguf_path" | cut -f1)
+    log "Download complete: ${gguf_path} (${size})"
+}
+
+# ==============================================================================
+# Build Launch Command
+# ==============================================================================
+# Flags are layered in three tiers:
+#   1. Architecture defaults (FOUNDRY_ARCH) -- systematic, model-class level
+#   2. Hardware profile (PROFILE_*) -- per-GPU tuning knobs
+#   3. User overrides (FOUNDRY_EXTRA_ARGS) -- escape hatch, highest priority
+
+build_command() {
+    local gguf_path="${MODELS_DIR}/${FOUNDRY_GGUF_FILE}"
+    local arch="${FOUNDRY_ARCH:-dense}"
+
+    # Use a bash array to safely handle arguments with spaces
+    local -a cmd=("/app/llama-server")
+    cmd+=("--model" "${gguf_path}")
+    cmd+=("--host" "0.0.0.0")
+    cmd+=("--port" "${FOUNDRY_PORT:-8080}")
+
+    # --- Tier 1: Architecture-specific flags ----------------------------------
+    # These are determined by the model class, not by the GPU or user preference.
+
+    if [ "$arch" = "moe" ]; then
+        # MoE: enable expert offloading (spill inactive experts to CPU when VRAM
+        # is tight). On high-VRAM GPUs --fit keeps everything on GPU automatically.
+        cmd+=("--fit" "on")
+    fi
+    # Dense models: no --fit (no experts to offload).
+    # Model-specific flags (--swa-full, --cache-ram) go in PROFILE_EXTRA_ARGS.
+
+    # --- Tier 2: Hardware profile tuning --------------------------------------
+    # These come from the sourced profile .sh file and tune for the specific GPU.
+
+    # Context length (env override > profile > default)
+    local ctx="${FOUNDRY_CTX_LENGTH:-${PROFILE_CTX_LENGTH:-32768}}"
+    cmd+=("--ctx-size" "${ctx}")
+
+    # Thread count (env override > profile > auto)
+    local threads="${FOUNDRY_THREADS:-${PROFILE_THREADS:-}}"
+    if [ -n "$threads" ]; then
+        cmd+=("--threads" "${threads}")
+    fi
+
+    # Batch thread count (can be higher than decode threads for prompt processing)
+    local threads_batch="${PROFILE_THREADS_BATCH:-${threads}}"
+    if [ -n "$threads_batch" ]; then
+        cmd+=("--threads-batch" "${threads_batch}")
+    fi
+
+    # Flash attention (new llama.cpp requires explicit on/off/auto value)
+    local fa="${PROFILE_FLASH_ATTN:-on}"
+    cmd+=("--flash-attn" "${fa}")
+
+    # KV cache quantization
+    local ctk="${PROFILE_KV_TYPE_K:-q8_0}"
+    local ctv="${PROFILE_KV_TYPE_V:-q8_0}"
+    cmd+=("-ctk" "${ctk}" "-ctv" "${ctv}")
+
+    # Memory mapping
+    if [ "${PROFILE_NO_MMAP:-true}" = "true" ]; then
+        cmd+=("--no-mmap")
+    fi
+
+    # Jinja templates (for tool calling / chat templates)
+    if [ "${PROFILE_JINJA:-true}" = "true" ]; then
+        cmd+=("--jinja")
+    fi
+
+    # Parallel slots for concurrent requests
+    local slots="${PROFILE_PARALLEL:-2}"
+    cmd+=("--parallel" "${slots}")
+
+    # Thread priority for reduced scheduling latency
+    local prio="${PROFILE_PRIO:-0}"
+    if [ "$prio" != "0" ]; then
+        cmd+=("--prio" "${prio}")
+    fi
+
+    # Strict CPU placement for cache locality
+    if [ "${PROFILE_CPU_STRICT:-0}" = "1" ]; then
+        cmd+=("--cpu-strict" "1")
+    fi
+
+    # KV cache reuse for multi-turn chat (prefix sharing via KV shifting)
+    local cache_reuse="${PROFILE_CACHE_REUSE:-0}"
+    if [ "$cache_reuse" != "0" ]; then
+        cmd+=("--cache-reuse" "${cache_reuse}")
+    fi
+
+    # Disable web UI for headless server deployments
+    if [ "${PROFILE_NO_WEBUI:-false}" = "true" ]; then
+        cmd+=("--no-webui")
+    fi
+
+    # Prometheus-compatible metrics endpoint
+    if [ "${PROFILE_METRICS:-false}" = "true" ]; then
+        cmd+=("--metrics")
+    fi
+
+    # Profile-specific extra args (split on spaces intentionally)
+    if [ -n "${PROFILE_EXTRA_ARGS:-}" ]; then
+        # shellcheck disable=SC2206
+        cmd+=(${PROFILE_EXTRA_ARGS})
+    fi
+
+    # --- Tier 3: User overrides -----------------------------------------------
+    if [ -n "${FOUNDRY_EXTRA_ARGS:-}" ]; then
+        # shellcheck disable=SC2206
+        cmd+=(${FOUNDRY_EXTRA_ARGS})
+    fi
+
+    # Store the array globally so main() can exec it safely
+    FOUNDRY_CMD=("${cmd[@]}")
+}
+
+# ==============================================================================
+# Main
+# ==============================================================================
+
+main() {
+    echo ""
+    echo -e "${GREEN}╔════════════════════════════════════════════╗${NC}"
+    echo -e "${GREEN}║ Foundry Inference ║${NC}"
+    echo -e "${GREEN}║ github.com/infernet-org/foundry ║${NC}"
+    echo -e "${GREEN}╚════════════════════════════════════════════╝${NC}"
+    echo ""
+
+    log "Model: ${FOUNDRY_MODEL_NAME}"
+    log "Architecture: ${FOUNDRY_ARCH:-dense}"
+
+    # 1. Determine profile
+    local profile
+    if [ "${FOUNDRY_PROFILE}" = "auto" ]; then
+        profile=$(detect_gpu)
+    else
+        profile="${FOUNDRY_PROFILE}"
+    fi
+
+    # 2. Load profile
+    load_profile "$profile"
+
+    # 3. Download model if needed
+    download_model
+
+    # 4. Build launch command (sets FOUNDRY_CMD array directly, no subshell)
+    build_command
+
+    echo ""
+    log "Launch command:"
+    echo -e "${CYAN} ${FOUNDRY_CMD[*]}${NC}"
+    echo ""
+    log "OpenAI-compatible API will be available at:"
+    echo -e "${GREEN} http://localhost:${FOUNDRY_PORT:-8080}/v1/chat/completions${NC}"
+    echo ""
+
+    # 5. Launch (exec replaces shell process for proper signal handling)
+    # Use the array form to avoid word-splitting issues
+    exec "${FOUNDRY_CMD[@]}"
+}
+
+main "$@"
diff --git a/models/qwen3.5-9b/profiles/default.sh b/models/qwen3.5-9b/profiles/default.sh
new file mode 100644
index 0000000..a20c158
--- /dev/null
+++ b/models/qwen3.5-9b/profiles/default.sh
@@ -0,0 +1,25 @@
+# ==============================================================================
+# Foundry Profile: Default (8GB+ VRAM)
+# ==============================================================================
+# Qwen3.5-9B UD-Q4_K_XL (~5.66GB)
+#
+# Conservative profile for GPUs with 8-16GB VRAM.
+# At only 5.66GB model weight, this is the lightest model in the lineup
+# and runs comfortably on 8GB cards with reduced context.
+# ==============================================================================
+
+PROFILE_CTX_LENGTH=32768  # 32K context -- safe for 8GB+ cards
+PROFILE_THREADS=8  # Conservative thread count
+PROFILE_THREADS_BATCH=8
+PROFILE_FLASH_ATTN="on"
+PROFILE_KV_TYPE_K="q4_0"  # Aggressive KV quantization to save VRAM
+PROFILE_KV_TYPE_V="q4_0"
+PROFILE_NO_MMAP="true"
+PROFILE_JINJA="true"  # Tool calling support
+PROFILE_PARALLEL=2  # 2 slots for smaller GPUs
+PROFILE_PRIO=0  # Normal priority (conservative)
+PROFILE_CPU_STRICT=0
+PROFILE_CACHE_REUSE=0  # Disabled: hybrid recurrent arch re-processes anyway
+PROFILE_NO_WEBUI="false"  # Keep web UI for exploration
+PROFILE_METRICS="false"
+PROFILE_EXTRA_ARGS="--swa-full --cache-ram 0"
diff --git a/models/qwen3.5-9b/profiles/rtx5090.sh b/models/qwen3.5-9b/profiles/rtx5090.sh
new file mode 100644
index 0000000..ce3f3bf
--- /dev/null
+++ b/models/qwen3.5-9b/profiles/rtx5090.sh
@@ -0,0 +1,54 @@
+# ==============================================================================
+# Foundry Profile: RTX 5090 (32GB)
+# ==============================================================================
+# Qwen3.5-9B UD-Q4_K_XL (~5.66GB) -- Dense model, Qwen3.5 generation
+#
+# Architecture: Hybrid Gated DeltaNet + Dense FFN (NOT MoE)
+#   - 32 layers: 24 Gated DeltaNet (recurrent) + 8 full attention (GQA 16:4)
+#   - All 9B parameters active per token (dense, compute-bound)
+#   - Vision-language capable (multimodal)
+#   - Thinking mode by default (reasoning_content field)
+#
+# Why this model replaces Qwen3.5-35B-A3B:
+#   Qwen3.5-9B is a newer generation (Qwen3.5) that dramatically outperforms
+#   the older 35B-A3B on every benchmark: agent tasks (+37 TAU2-Bench),
+#   math (+20 HMMT), reasoning (+8 GPQA), instruction following (+13 IFBench).
+#   At 5.66 GB it uses a fraction of the VRAM, enabling full 262K native
+#   context per slot with 4 parallel slots in only 29.5 GB.
+#
+# VRAM budget (32,607 MiB total):
+#   Model weights: ~5.66 GB (CUDA)
+#   KV cache (1M): ~17.4 GB (4 slots x 262K, q8_0, 8 attn layers only)
+#   Recurrent state: 201 MB (24 DeltaNet layers, fixed size)
+#   Compute buffers: ~6.5 GB (CUDA) + 4.2 GB (Host)
+#   Free headroom: ~2.6 GB
+#
+# Benchmarked on RTX 5090 (2026-03-02, native sm_120a, BLACKWELL_NATIVE_FP4=1):
+#   Single-stream decode: ~177 tok/s (compute-bound, 94% SM utilization)
+#   4-concurrent aggregate: ~423 tok/s
+#   4-concurrent per-slot: ~106 tok/s each
+#   Prompt processing: ~1,688 tok/s
+#   GPU: 100% SM / 63% mem @ 4-concurrent | 94% SM / 65% mem @ single
+#   Power: 312W single, 445W 4-concurrent | Temp: 52-60C
+# ==============================================================================
+
+PROFILE_CTX_LENGTH=1048576  # 1M total -- 262K per slot with 4 parallel slots
+PROFILE_THREADS=16  # Physical cores (avoid hyperthreads for decode)
+PROFILE_THREADS_BATCH=20  # Higher thread count for prompt processing
+PROFILE_FLASH_ATTN="on"  # Flash attention for long context perf
+PROFILE_KV_TYPE_K="q8_0"  # KV cache key quantization
+PROFILE_KV_TYPE_V="q8_0"  # KV cache value quantization
+PROFILE_NO_MMAP="true"  # Avoid page faults, load model into RAM
+PROFILE_JINJA="true"  # Chat template / tool calling support
+PROFILE_PARALLEL=4  # 4 slots: 262K/slot, 423 tok/s agg, 106 tok/s each
+                    # Dense model: internal --parallel batching is 2.6x more
+                    # efficient than running multiple instances (tested)
+PROFILE_PRIO=2  # High thread priority for reduced scheduling latency
+PROFILE_CPU_STRICT=1  # Strict CPU placement for cache locality
+PROFILE_CACHE_REUSE=0  # Disabled: hybrid recurrent arch re-processes anyway
+PROFILE_NO_WEBUI="true"  # Headless: no web UI, reduce attack surface
+PROFILE_METRICS="true"  # Prometheus-compatible /metrics endpoint
+# --mlock: pin model in RAM; -b/-ub 4096: large batch for fast prompt encode
+# --swa-full: full SWA cache for hybrid attention models (DeltaNet + attention)
+# --cache-ram 0: disable prompt cache (hybrid recurrent arch forces re-processing)
+PROFILE_EXTRA_ARGS="--mlock -b 4096 -ub 4096 --swa-full --cache-ram 0"
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 9dff64d..745915b 100755
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -118,7 +118,7 @@ download_model() {
 
     log "Model not found at ${gguf_path}"
     log "Downloading ${FOUNDRY_GGUF_FILE} from ${FOUNDRY_GGUF_REPO}..."
-    log "This is a one-time download (~20GB). Subsequent starts will be instant."
+    log "This is a one-time download. Subsequent starts will be instant."
     echo ""
 
     # Use python3 huggingface_hub to download (huggingface-cli may not be on PATH)