diff --git a/Makefile b/Makefile index 42cde3c35..005bb45e8 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,23 @@ METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal CORE_OBJS = ds4.o ds4_metal.o CPU_CORE_OBJS = ds4_cpu.o else + CFLAGS += -D_GNU_SOURCE -fno-finite-math-only + +ifeq ($(GPU_BACKEND),rocm) +ROCM_PATH ?= /opt/rocm +GPU_CC = $(ROCM_PATH)/bin/hipcc +ROCM_ARCH ?= gfx1151 + +GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH) +GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas + +@echo "ROCM_ARCH: $(ROCM_ARCH)" + +EXTRA_DEPS = ds4_rocm.h + +else + CUDA_HOME ?= /usr/local/cuda NVCC ?= $(CUDA_HOME)/bin/nvcc CUDA_ARCH ?= @@ -27,9 +43,18 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH) endif NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas + +GPU_CC = $(NVCC) +GPU_CFLAGS = $(NVCCFLAGS) +GPU_LDLIBS = $(CUDA_LDLIBS) + +endif + CORE_OBJS = ds4.o ds4_cuda.o +EXTRA_DEPS = CPU_CORE_OBJS = ds4_cpu.o METAL_LDLIBS := $(LDLIBS) + endif .PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression @@ -69,6 +94,7 @@ help: @echo " make cuda-generic Build CUDA for a generic local CUDA GPU" @echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value" @echo " make cpu Build CPU-only ./ds4, ./ds4-server, and ./ds4-bench" + @echo " make rocm Build ROCm" @echo " make test Build and run tests" @echo " make clean Remove build outputs" @@ -86,14 +112,22 @@ cuda: fi $(MAKE) ds4 ds4-server ds4-bench CUDA_ARCH="$(CUDA_ARCH)" +rocm: + @if [ -z "$(strip $(ROCM_ARCH))" ]; then \ + echo "error: specify ROCM_ARCH, for example: make rocm ROCM_ARCH=gfx1151"; \ + exit 2; \ + fi + $(MAKE) ds4 ds4-server ds4-bench GPU_BACKEND=rocm ROCM_ARCH=$(ROCM_ARCH) + + ds4: ds4_cli.o linenoise.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4-server: ds4_server.o rax.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4-bench: ds4_bench.o $(CORE_OBJS) - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS) $(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) @@ -143,11 +177,11 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS) $(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m -ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc - $(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu +ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS) + $(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o - $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) + $(GPU_CC) $(GPU_CFLAGS) -o $@ $^ $(GPU_LDLIBS) ds4_test: ds4_test.o rax.o $(CORE_OBJS) ifeq ($(UNAME_S),Darwin) @@ -161,3 +195,4 @@ test: ds4_test clean: rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 3b224f99e..8f467b63c 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -1,8 +1,18 @@ +#ifdef __HIP_PLATFORM_AMD__ +#include "ds4_rocm.h" + +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL +#define MASK_T uint64_t +#else #include #include #include #include +#define FULL_WARP_MASK 0xFFFFFFFFu +#define MASK_T uint32_t +#endif + #include #include #include @@ -1292,9 +1302,22 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc(uint64_t bytes) { if (bytes == 0) bytes = 1; ds4_gpu_tensor *t = (ds4_gpu_tensor *)calloc(1, sizeof(*t)); if (!t) return NULL; - if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) { - free(t); - return NULL; + + if (getenv("DS4_CUDA_MANAGED") != NULL) { + /* Use cudaMallocManaged with cudaMemAttachGlobal so the allocation + * is GPU-accessible across all streams. On UMA platforms (Strix + * Halo, Grace-Hopper) this allocates from the full unified pool, + * bypassing the BIOS VRAM carve-out. */ + if (!cuda_ok(cudaMallocManaged(&t->ptr, (size_t)bytes, cudaMemAttachGlobal), + "managed tensor alloc")) { + free(t); + return NULL; + } + } else { + if (!cuda_ok(cudaMalloc(&t->ptr, (size_t)bytes), "tensor alloc")) { + free(t); + return NULL; + } } t->bytes = bytes; t->owner = 1; @@ -1722,14 +1745,14 @@ __global__ static void f32_to_f16_kernel(__half *out, const float *x, uint64_t n __device__ static float warp_sum_f32(float v) { for (int offset = 16; offset > 0; offset >>= 1) { - v += __shfl_down_sync(0xffffffffu, v, offset); + v += __shfl_down_sync(FULL_WARP_MASK, v, offset); } return v; } __device__ static float warp_max_f32(float v) { for (int offset = 16; offset > 0; offset >>= 1) { - v = fmaxf(v, __shfl_down_sync(0xffffffffu, v, offset)); + v = fmaxf(v, __shfl_down_sync(FULL_WARP_MASK, v, offset)); } return v; } @@ -2846,7 +2869,7 @@ __global__ static void attention_decode_mixed_kernel( for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d]; const uint32_t mask = 0xffu << (threadIdx.x & 24u); for (uint32_t off = 4u; off > 0u; off >>= 1u) { - dot += __shfl_down_sync(mask, dot, off, 8); + dot += __shfl_down_sync(static_cast(mask), dot, off, 8); } s = dot * scale + add; } @@ -3008,7 +3031,7 @@ __global__ static void attention_indexed_mixed_kernel( for (uint32_t d = qlane; d < head_dim; d += 8u) dot += qh[d] * kvrow[d]; const uint32_t mask = 0xffu << (threadIdx.x & 24u); for (uint32_t off = 4u; off > 0u; off >>= 1u) { - dot += __shfl_down_sync(mask, dot, off, 8); + dot += __shfl_down_sync(static_cast(mask), dot, off, 8); } if (qlane == 0) scores[row] = dot * scale; } @@ -3189,7 +3212,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel( const float *score_row = scores + warp * 768u; for (uint32_t i = lane; i < n_score; i += 32u) max_s = fmaxf(max_s, score_row[i]); max_s = warp_max_f32(max_s); - max_s = __shfl_sync(0xffffffffu, max_s, 0); + max_s = __shfl_sync(FULL_WARP_MASK, max_s, 0); } float den = 0.0f; if (valid_head) { @@ -3201,7 +3224,7 @@ __global__ static void attention_indexed_mixed_heads8_rb4_kernel( } den = warp_sum_f32(den); den += expf(sinks[head] - max_s); - den = __shfl_sync(0xffffffffu, den, 0); + den = __shfl_sync(FULL_WARP_MASK, den, 0); } float4 o0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -3361,7 +3384,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -3485,7 +3508,7 @@ __global__ static void attention_static_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -3650,7 +3673,7 @@ __global__ static void attention_decode_mixed_heads8_online_kernel( dot4_f32(q2, k2) + dot4_f32(q3, k3); score = warp_sum_f32(score) * scale; - score = __shfl_sync(0xffffffffu, score, 0); + score = __shfl_sync(FULL_WARP_MASK, score, 0); const float new_m = fmaxf(max_s, score); const float old_scale = expf(max_s - new_m); @@ -4268,9 +4291,9 @@ __global__ static void router_select_warp_topk_kernel( } #pragma unroll for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) { - const float other_score = __shfl_xor_sync(0xffffffffu, best_score, mask); - const float other_prob = __shfl_xor_sync(0xffffffffu, best_prob, mask); - const uint32_t other_idx = __shfl_xor_sync(0xffffffffu, best_idx, mask); + const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask); + const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask); + const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask); if (router_score_better(other_score, other_idx, best_score, best_idx)) { best_score = other_score; best_prob = other_prob; @@ -5198,7 +5221,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode out->ptr, CUDA_R_32F, (int)out_dim, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (st == CUBLAS_STATUS_SUCCESS) return 1; fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st); @@ -5440,7 +5463,7 @@ extern "C" int ds4_gpu_matmul_f16_tensor(ds4_gpu_tensor *out, const void *model_ out->ptr, CUDA_R_32F, (int)out_dim, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); return cublas_ok(st, "f16 matmul"); } @@ -6158,7 +6181,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads, if (!tmp) return 0; float *scores = tmp; float *out_tmp = (float *)((char *)tmp + out_offset); - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -6528,7 +6551,7 @@ static int attention_prefill_mixed_launch( n_comp, head_dim); if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0; - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -6733,7 +6756,7 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor( (int)rank, (long long)rank * n_tokens, (int)n_groups, - CUDA_R_32F, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (!cublas_ok(st, "attention output a gemm")) return 0; attention_unpack_group_low_kernel<<<(low_tmp_count + 255) / 256, 256>>>( @@ -7395,7 +7418,7 @@ __device__ static void dev_dot_q2_K_q8_K_block8( __device__ static float half_warp_sum_f32(float v, uint32_t lane16) { uint32_t mask = 0xffffu << (threadIdx.x & 16u); for (int offset = 8; offset > 0; offset >>= 1) { - v += __shfl_down_sync(mask, v, offset, 16); + v += __shfl_down_sync(static_cast(mask), v, offset, 16); } (void)lane16; return v; @@ -7404,7 +7427,7 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) { __device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) { uint32_t mask = 0xffu << (threadIdx.x & 24u); for (int offset = 4; offset > 0; offset >>= 1) { - v += __shfl_down_sync(mask, v, offset, 8); + v += __shfl_down_sync(static_cast(mask), v, offset, 8); } (void)lane8; return v; diff --git a/ds4_rocm.h b/ds4_rocm.h new file mode 100644 index 000000000..55907c428 --- /dev/null +++ b/ds4_rocm.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include +#include + +#define cudaError_t hipError_t +#define cudaStream_t hipStream_t +#define cudaEvent_t hipEvent_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaMemLocation hipMemLocation + +#define cudaSuccess hipSuccess +#define cudaErrorNotSupported hipErrorNotSupported +#define cudaMemAttachGlobal hipMemAttachGlobal +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString + +#define cudaGetDevice hipGetDevice +#define cudaSetDevice hipSetDevice +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDevAttrPageableMemoryAccess hipDeviceAttributePageableMemoryAccess +#define cudaMemLocationTypeDevice hipMemLocationTypeDevice + +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocManaged hipMallocManaged +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaMemset hipMemset +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemGetInfo hipMemGetInfo +#define cudaMemsetAsync hipMemsetAsync + +#define cudaHostRegister hipHostRegister +#define cudaHostUnregister hipHostUnregister +#define cudaHostGetDevicePointer hipHostGetDevicePointer +#define cudaHostRegisterMapped hipHostRegisterMapped +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly + +#define cudaMemAdvise(p1, p2, p3, p4) hipMemAdvise(p1, p2, p3, p4.id) +#define cudaMemPrefetchAsync(devPtr, count, location, flags, stream) hipMemPrefetchAsync(devPtr, count, location.id, stream) +#define cudaMemAdviseSetReadMostly hipMemAdviseSetReadMostly +#define cudaMemAdviseSetPreferredLocation hipMemAdviseSetPreferredLocation + +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamNonBlocking hipStreamNonBlocking + +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventDisableTiming hipEventDisableTiming + +#define cublasHandle_t hipblasHandle_t +#define cublasStatus_t hipblasStatus_t +#define cublasMath_t hipblasMath_t + +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_DEFAULT_MATH HIPBLAS_DEFAULT_MATH +#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F +#define CUBLAS_TF32_TENSOR_OP_MATH HIPBLAS_TF32_TENSOR_OP_MATH +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F + +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasSetMathMode hipblasSetMathMode +#define cublasSgemm hipblasSgemm +#define cublasSgemmStridedBatched hipblasSgemmStridedBatched +#define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx + +static __device__ __forceinline__ int32_t __vcmpne4(uint32_t a, uint32_t b) { + // For each byte: 0xFF if a != b, 0x00 if a == b + uint32_t diff = a ^ b; + // Spread any set bit in each byte to fill the whole byte + diff |= (diff >> 1); diff |= (diff >> 2); diff |= (diff >> 4); + diff &= 0x01010101u; + diff *= 0xFFu; // 0x01 -> 0xFF per byte + return (int32_t)diff; +} + +static __device__ __forceinline__ int32_t __vsub4(int32_t a, int32_t b) { + // Per-byte subtraction (wrapping, not saturating) + uint32_t ua = (uint32_t)a, ub = (uint32_t)b; + // Trick: subtract bytes in parallel avoiding cross-byte borrows + uint32_t diff = ((ua | 0x80808080u) - (ub & 0x7F7F7F7Fu)) ^ ((ua ^ ~ub) & 0x80808080u); + return (int32_t)diff; +} + +// __dp4a: dot product of 4 signed int8s packed in an int32 +static __device__ __forceinline__ int32_t __dp4a(int32_t a, int32_t b, int32_t c) { + const int8_t *a_bytes = reinterpret_cast(&a); + const int8_t *b_bytes = reinterpret_cast(&b); + return c + (int32_t)a_bytes[0] * b_bytes[0] + + (int32_t)a_bytes[1] * b_bytes[1] + + (int32_t)a_bytes[2] * b_bytes[2] + + (int32_t)a_bytes[3] * b_bytes[3]; +} +