antirez · kmc6042 · May 11, 2026 · May 31, 2026
diff --git a/Makefile b/Makefile
@@ -18,7 +18,23 @@ METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
 CORE_OBJS = ds4.o ds4_metal.o
 CPU_CORE_OBJS = ds4_cpu.o
 else
+
 CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
+
+ifeq ($(GPU_BACKEND),rocm)
+ROCM_PATH ?= /opt/rocm
+GPU_CC = $(ROCM_PATH)/bin/hipcc
+ROCM_ARCH ?= gfx1151
+
+GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
+GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas
+
+@echo "ROCM_ARCH: $(ROCM_ARCH)"
+
+EXTRA_DEPS = ds4_rocm.h
+
+else
+
 CUDA_HOME ?= /usr/local/cuda
 NVCC ?= $(CUDA_HOME)/bin/nvcc
 CUDA_ARCH ?=
@@ -27,9 +43,18 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
 endif
 NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
+
+GPU_CC = $(NVCC)
+GPU_CFLAGS = $(NVCCFLAGS)
+GPU_LDLIBS = $(CUDA_LDLIBS)
+
+endif
+
 CORE_OBJS = ds4.o ds4_cuda.o
+EXTRA_DEPS =
 CPU_CORE_OBJS = ds4_cpu.o
 METAL_LDLIBS := $(LDLIBS)
+
 endif
 
 .PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
@@ -69,6 +94,7 @@ help:
 	@echo "  make cuda-generic        Build CUDA for a generic local CUDA GPU"
 	@echo "  make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
 	@echo "  make cpu                 Build CPU-only ./ds4, ./ds4-server, and ./ds4-bench"
+	@echo "  make rocm                Build ROCm"
 	@echo "  make test                Build and run tests"
 	@echo "  make clean               Remove build outputs"
 
@@ -86,14 +112,22 @@ cuda:
 	fi
 	$(MAKE) ds4 ds4-server ds4-bench CUDA_ARCH="$(CUDA_ARCH)"
 
+rocm:
+	@if [ -z "$(strip $(ROCM_ARCH))" ]; then \
+		echo "error: specify ROCM_ARCH, for example: make rocm ROCM_ARCH=gfx1151"; \
+		exit 2; \
+	fi
+	$(MAKE) ds4 ds4-server ds4-bench GPU_BACKEND=rocm ROCM_ARCH=$(ROCM_ARCH)
+
+
 ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4-server: ds4_server.o rax.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4-bench: ds4_bench.o $(CORE_OBJS)
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
@@ -143,11 +177,11 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h
 ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
 	$(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m
 
-ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
-	$(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu
+ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS)
+	$(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu
 
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
-	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
+	$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)
 
 ds4_test: ds4_test.o rax.o $(CORE_OBJS)
 ifeq ($(UNAME_S),Darwin)
@@ -161,3 +195,4 @@ test: ds4_test
 
 clean:
 	rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+
diff --git a/ds4_cuda.cu b/ds4_cuda.cu
@@ -1,8 +1,18 @@
+#ifdef __HIP_PLATFORM_AMD__
+#include "ds4_rocm.h"
+
+#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
+#define MASK_T uint64_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 #include <mma.h>
 #include <cublas_v2.h>
 
+#define FULL_WARP_MASK 0xFFFFFFFFu
+#define MASK_T uint32_t
+#endif
+
 #include <stdint.h>
 #include <errno.h>
 #include <limits.h>
@@ -1292,9 +1302,22 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc(uint64_t bytes) {
     if (bytes == 0) bytes = 1;
     ds4_gpu_tensor *t = (ds4_gpu_tensor *)calloc(1, sizeof(*t));
     if (!t) return NULL;
-    if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
-        free(t);
-        return NULL;
+
+    if (getenv("DS4_CUDA_MANAGED") != NULL) {
+        /* Use cudaMallocManaged with cudaMemAttachGlobal so the allocation
+         * is GPU-accessible across all streams.  On UMA platforms (Strix
+         * Halo, Grace-Hopper) this allocates from the full unified pool,
+         * bypassing the BIOS VRAM carve-out. */
+        if (!cuda_ok(cudaMallocManaged(&t->ptr, (size_t)bytes, cudaMemAttachGlobal),
+                     "managed tensor alloc")) {
+            free(t);
+            return NULL;
+        }
+    } else {
+        if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
+            free(t);
+            return NULL;
+        }
     }
     t->bytes = bytes;
     t->owner = 1;
@@ -1722,14 +1745,14 @@ __global__ static void f32_to_f16_kernel(__half *out, const float *x, uint64_t n
 
 __device__ static float warp_sum_f32(float v) {
     for (int offset = 16; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(0xffffffffu, v, offset);
+        v += __shfl_down_sync(FULL_WARP_MASK, v, offset);
     }
     return v;
 }
 
 __device__ static float warp_max_f32(float v) {
     for (int offset = 16; offset > 0; offset >>= 1) {
-        v = fmaxf(v, __shfl_down_sync(0xffffffffu, v, offset));
+        v = fmaxf(v, __shfl_down_sync(FULL_WARP_MASK, v, offset));
     }
     return v;
 }
@@ -2846,7 +2869,7 @@ __global__ static void attention_decode_mixed_kernel(
                     for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
                     const uint32_t mask = 0xffu << (threadIdx.x & 24u);
                     for (uint32_t off = 4u; off > 0u; off >>= 1u) {
-                        dot += __shfl_down_sync(mask, dot, off, 8);
+                        dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
                     }
                     s = dot * scale + add;
                 }
@@ -3008,7 +3031,7 @@ __global__ static void attention_indexed_mixed_kernel(
                 for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
                 const uint32_t mask = 0xffu << (threadIdx.x & 24u);
                 for (uint32_t off = 4u; off > 0u; off >>= 1u) {
-                    dot += __shfl_down_sync(mask, dot, off, 8);
+                    dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
                 }
                 if (qlane == 0) scores[row] = dot * scale;
             }
@@ -3189,7 +3212,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
         const float *score_row = scores + warp * 768u;
         for (uint32_t i = lane; i < n_score; i += 32u) max_s = fmaxf(max_s, score_row[i]);
         max_s = warp_max_f32(max_s);
-        max_s = __shfl_sync(0xffffffffu, max_s, 0);
+        max_s = __shfl_sync(FULL_WARP_MASK, max_s, 0);
     }
     float den = 0.0f;
     if (valid_head) {
@@ -3201,7 +3224,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
         }
         den = warp_sum_f32(den);
         den += expf(sinks[head] - max_s);
-        den = __shfl_sync(0xffffffffu, den, 0);
+        den = __shfl_sync(FULL_WARP_MASK, den, 0);
     }
 
     float4 o0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -3361,7 +3384,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -3485,7 +3508,7 @@ __global__ static void attention_static_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -3650,7 +3673,7 @@ __global__ static void attention_decode_mixed_heads8_online_kernel(
                               dot4_f32(q2, k2) +
                               dot4_f32(q3, k3);
                 score = warp_sum_f32(score) * scale;
-                score = __shfl_sync(0xffffffffu, score, 0);
+                score = __shfl_sync(FULL_WARP_MASK, score, 0);
 
                 const float new_m = fmaxf(max_s, score);
                 const float old_scale = expf(max_s - new_m);
@@ -4268,9 +4291,9 @@ __global__ static void router_select_warp_topk_kernel(
         }
         #pragma unroll
         for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) {
-            const float other_score = __shfl_xor_sync(0xffffffffu, best_score, mask);
-            const float other_prob = __shfl_xor_sync(0xffffffffu, best_prob, mask);
-            const uint32_t other_idx = __shfl_xor_sync(0xffffffffu, best_idx, mask);
+            const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask);
+            const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask);
+            const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask);
             if (router_score_better(other_score, other_idx, best_score, best_idx)) {
                 best_score = other_score;
                 best_prob = other_prob;
@@ -5198,7 +5221,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
                                              out->ptr,
                                              CUDA_R_32F,
                                              (int)out_dim,
-                                             CUDA_R_32F,
+                                             CUBLAS_COMPUTE_32F,
                                              CUBLAS_GEMM_DEFAULT);
             if (st == CUBLAS_STATUS_SUCCESS) return 1;
             fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st);
@@ -5440,7 +5463,7 @@ extern "C" int ds4_gpu_matmul_f16_tensor(ds4_gpu_tensor *out, const void *model_
                                          out->ptr,
                                          CUDA_R_32F,
                                          (int)out_dim,
-                                         CUDA_R_32F,
+                                         CUBLAS_COMPUTE_32F,
                                          CUBLAS_GEMM_DEFAULT);
         return cublas_ok(st, "f16 matmul");
     }
@@ -6158,7 +6181,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads,
         if (!tmp) return 0;
         float *scores = tmp;
         float *out_tmp = (float *)((char *)tmp + out_offset);
-        const float alpha = rsqrtf((float)head_dim);
+        const float alpha = 1.0f / sqrtf((float)head_dim);
         const float beta = 0.0f;
         cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
                                                       CUBLAS_OP_T,
@@ -6528,7 +6551,7 @@ static int attention_prefill_mixed_launch(
                 n_comp,
                 head_dim);
         if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0;
-        const float alpha = rsqrtf((float)head_dim);
+        const float alpha = 1.0f / sqrtf((float)head_dim);
         const float beta = 0.0f;
         cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
                                                       CUBLAS_OP_T,
@@ -6733,7 +6756,7 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor(
                                                        (int)rank,
                                                        (long long)rank * n_tokens,
                                                        (int)n_groups,
-                                                       CUDA_R_32F,
+                                                       CUBLAS_COMPUTE_32F,
                                                        CUBLAS_GEMM_DEFAULT);
         if (!cublas_ok(st, "attention output a gemm")) return 0;
         attention_unpack_group_low_kernel<<<(low_tmp_count + 255) / 256, 256>>>(
@@ -7395,7 +7418,7 @@ __device__ static void dev_dot_q2_K_q8_K_block8(
 __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
     uint32_t mask = 0xffffu << (threadIdx.x & 16u);
     for (int offset = 8; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(mask, v, offset, 16);
+        v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 16);
     }
     (void)lane16;
     return v;
@@ -7404,7 +7427,7 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
 __device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) {
     uint32_t mask = 0xffu << (threadIdx.x & 24u);
     for (int offset = 4; offset > 0; offset >>= 1) {
-        v += __shfl_down_sync(mask, v, offset, 8);
+        v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 8);
     }
     (void)lane8;
     return v;