Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 41 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,23 @@ METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS = ds4.o ds4_metal.o
CPU_CORE_OBJS = ds4_cpu.o
else

CFLAGS += -D_GNU_SOURCE -fno-finite-math-only

ifeq ($(GPU_BACKEND),rocm)
ROCM_PATH ?= /opt/rocm
GPU_CC = $(ROCM_PATH)/bin/hipcc
ROCM_ARCH ?= gfx1151

GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas

@echo "ROCM_ARCH: $(ROCM_ARCH)"

EXTRA_DEPS = ds4_rocm.h

else

CUDA_HOME ?= /usr/local/cuda
NVCC ?= $(CUDA_HOME)/bin/nvcc
CUDA_ARCH ?=
Expand All @@ -27,9 +43,18 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
endif
NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas

GPU_CC = $(NVCC)
GPU_CFLAGS = $(NVCCFLAGS)
GPU_LDLIBS = $(CUDA_LDLIBS)

endif

CORE_OBJS = ds4.o ds4_cuda.o
EXTRA_DEPS =
CPU_CORE_OBJS = ds4_cpu.o
METAL_LDLIBS := $(LDLIBS)

endif

.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
Expand Down Expand Up @@ -69,6 +94,7 @@ help:
@echo " make cuda-generic Build CUDA for a generic local CUDA GPU"
@echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
@echo " make cpu Build CPU-only ./ds4, ./ds4-server, and ./ds4-bench"
@echo " make rocm Build ROCm"
@echo " make test Build and run tests"
@echo " make clean Remove build outputs"

Expand All @@ -86,14 +112,22 @@ cuda:
fi
$(MAKE) ds4 ds4-server ds4-bench CUDA_ARCH="$(CUDA_ARCH)"

rocm:
@if [ -z "$(strip $(ROCM_ARCH))" ]; then \
echo "error: specify ROCM_ARCH, for example: make rocm ROCM_ARCH=gfx1151"; \
exit 2; \
fi
$(MAKE) ds4 ds4-server ds4-bench GPU_BACKEND=rocm ROCM_ARCH=$(ROCM_ARCH)


ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)

ds4-server: ds4_server.o rax.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)

ds4-bench: ds4_bench.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)

cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
Expand Down Expand Up @@ -143,11 +177,11 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h
ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
$(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m

ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
$(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu
ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS)
$(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu

tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS)

ds4_test: ds4_test.o rax.o $(CORE_OBJS)
ifeq ($(UNAME_S),Darwin)
Expand All @@ -161,3 +195,4 @@ test: ds4_test

clean:
rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o

67 changes: 45 additions & 22 deletions ds4_cuda.cu
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
#ifdef __HIP_PLATFORM_AMD__
#include "ds4_rocm.h"

#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
#define MASK_T uint64_t
#else
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <mma.h>
#include <cublas_v2.h>

#define FULL_WARP_MASK 0xFFFFFFFFu
#define MASK_T uint32_t
#endif

#include <stdint.h>
#include <errno.h>
#include <limits.h>
Expand Down Expand Up @@ -1292,9 +1302,22 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc(uint64_t bytes) {
if (bytes == 0) bytes = 1;
ds4_gpu_tensor *t = (ds4_gpu_tensor *)calloc(1, sizeof(*t));
if (!t) return NULL;
if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
free(t);
return NULL;

if (getenv("DS4_CUDA_MANAGED") != NULL) {
/* Use cudaMallocManaged with cudaMemAttachGlobal so the allocation
* is GPU-accessible across all streams. On UMA platforms (Strix
* Halo, Grace-Hopper) this allocates from the full unified pool,
* bypassing the BIOS VRAM carve-out. */
if (!cuda_ok(cudaMallocManaged(&t->ptr, (size_t)bytes, cudaMemAttachGlobal),
"managed tensor alloc")) {
free(t);
return NULL;
}
} else {
if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) {
free(t);
return NULL;
}
}
t->bytes = bytes;
t->owner = 1;
Expand Down Expand Up @@ -1722,14 +1745,14 @@ __global__ static void f32_to_f16_kernel(__half *out, const float *x, uint64_t n

__device__ static float warp_sum_f32(float v) {
for (int offset = 16; offset > 0; offset >>= 1) {
v += __shfl_down_sync(0xffffffffu, v, offset);
v += __shfl_down_sync(FULL_WARP_MASK, v, offset);
}
return v;
}

__device__ static float warp_max_f32(float v) {
for (int offset = 16; offset > 0; offset >>= 1) {
v = fmaxf(v, __shfl_down_sync(0xffffffffu, v, offset));
v = fmaxf(v, __shfl_down_sync(FULL_WARP_MASK, v, offset));
}
return v;
}
Expand Down Expand Up @@ -2846,7 +2869,7 @@ __global__ static void attention_decode_mixed_kernel(
for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
const uint32_t mask = 0xffu << (threadIdx.x & 24u);
for (uint32_t off = 4u; off > 0u; off >>= 1u) {
dot += __shfl_down_sync(mask, dot, off, 8);
dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
}
s = dot * scale + add;
}
Expand Down Expand Up @@ -3008,7 +3031,7 @@ __global__ static void attention_indexed_mixed_kernel(
for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d];
const uint32_t mask = 0xffu << (threadIdx.x & 24u);
for (uint32_t off = 4u; off > 0u; off >>= 1u) {
dot += __shfl_down_sync(mask, dot, off, 8);
dot += __shfl_down_sync(static_cast<MASK_T>(mask), dot, off, 8);
}
if (qlane == 0) scores[row] = dot * scale;
}
Expand Down Expand Up @@ -3189,7 +3212,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
const float *score_row = scores + warp * 768u;
for (uint32_t i = lane; i < n_score; i += 32u) max_s = fmaxf(max_s, score_row[i]);
max_s = warp_max_f32(max_s);
max_s = __shfl_sync(0xffffffffu, max_s, 0);
max_s = __shfl_sync(FULL_WARP_MASK, max_s, 0);
}
float den = 0.0f;
if (valid_head) {
Expand All @@ -3201,7 +3224,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel(
}
den = warp_sum_f32(den);
den += expf(sinks[head] - max_s);
den = __shfl_sync(0xffffffffu, den, 0);
den = __shfl_sync(FULL_WARP_MASK, den, 0);
}

float4 o0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
Expand Down Expand Up @@ -3361,7 +3384,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel(
dot4_f32(q2, k2) +
dot4_f32(q3, k3);
score = warp_sum_f32(score) * scale;
score = __shfl_sync(0xffffffffu, score, 0);
score = __shfl_sync(FULL_WARP_MASK, score, 0);

const float new_m = fmaxf(max_s, score);
const float old_scale = expf(max_s - new_m);
Expand Down Expand Up @@ -3485,7 +3508,7 @@ __global__ static void attention_static_mixed_heads8_online_kernel(
dot4_f32(q2, k2) +
dot4_f32(q3, k3);
score = warp_sum_f32(score) * scale;
score = __shfl_sync(0xffffffffu, score, 0);
score = __shfl_sync(FULL_WARP_MASK, score, 0);

const float new_m = fmaxf(max_s, score);
const float old_scale = expf(max_s - new_m);
Expand Down Expand Up @@ -3650,7 +3673,7 @@ __global__ static void attention_decode_mixed_heads8_online_kernel(
dot4_f32(q2, k2) +
dot4_f32(q3, k3);
score = warp_sum_f32(score) * scale;
score = __shfl_sync(0xffffffffu, score, 0);
score = __shfl_sync(FULL_WARP_MASK, score, 0);

const float new_m = fmaxf(max_s, score);
const float old_scale = expf(max_s - new_m);
Expand Down Expand Up @@ -4268,9 +4291,9 @@ __global__ static void router_select_warp_topk_kernel(
}
#pragma unroll
for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) {
const float other_score = __shfl_xor_sync(0xffffffffu, best_score, mask);
const float other_prob = __shfl_xor_sync(0xffffffffu, best_prob, mask);
const uint32_t other_idx = __shfl_xor_sync(0xffffffffu, best_idx, mask);
const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask);
const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask);
const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask);
if (router_score_better(other_score, other_idx, best_score, best_idx)) {
best_score = other_score;
best_prob = other_prob;
Expand Down Expand Up @@ -5198,7 +5221,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
out->ptr,
CUDA_R_32F,
(int)out_dim,
CUDA_R_32F,
CUBLAS_COMPUTE_32F,
CUBLAS_GEMM_DEFAULT);
if (st == CUBLAS_STATUS_SUCCESS) return 1;
fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st);
Expand Down Expand Up @@ -5440,7 +5463,7 @@ extern "C" int ds4_gpu_matmul_f16_tensor(ds4_gpu_tensor *out, const void *model_
out->ptr,
CUDA_R_32F,
(int)out_dim,
CUDA_R_32F,
CUBLAS_COMPUTE_32F,
CUBLAS_GEMM_DEFAULT);
return cublas_ok(st, "f16 matmul");
}
Expand Down Expand Up @@ -6158,7 +6181,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads,
if (!tmp) return 0;
float *scores = tmp;
float *out_tmp = (float *)((char *)tmp + out_offset);
const float alpha = rsqrtf((float)head_dim);
const float alpha = 1.0f / sqrtf((float)head_dim);
const float beta = 0.0f;
cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
CUBLAS_OP_T,
Expand Down Expand Up @@ -6528,7 +6551,7 @@ static int attention_prefill_mixed_launch(
n_comp,
head_dim);
if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0;
const float alpha = rsqrtf((float)head_dim);
const float alpha = 1.0f / sqrtf((float)head_dim);
const float beta = 0.0f;
cublasStatus_t st = cublasSgemmStridedBatched(g_cublas,
CUBLAS_OP_T,
Expand Down Expand Up @@ -6733,7 +6756,7 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor(
(int)rank,
(long long)rank * n_tokens,
(int)n_groups,
CUDA_R_32F,
CUBLAS_COMPUTE_32F,
CUBLAS_GEMM_DEFAULT);
if (!cublas_ok(st, "attention output a gemm")) return 0;
attention_unpack_group_low_kernel<<<(low_tmp_count + 255) / 256, 256>>>(
Expand Down Expand Up @@ -7395,7 +7418,7 @@ __device__ static void dev_dot_q2_K_q8_K_block8(
__device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
uint32_t mask = 0xffffu << (threadIdx.x & 16u);
for (int offset = 8; offset > 0; offset >>= 1) {
v += __shfl_down_sync(mask, v, offset, 16);
v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 16);
}
(void)lane16;
return v;
Expand All @@ -7404,7 +7427,7 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
__device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) {
uint32_t mask = 0xffu << (threadIdx.x & 24u);
for (int offset = 4; offset > 0; offset >>= 1) {
v += __shfl_down_sync(mask, v, offset, 8);
v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 8);
}
(void)lane8;
return v;
Expand Down
Loading