Faster LRM

zalbanob · zalbanob · commit 15fc37556bed · 2026-03-20T12:59:55.000+01:00
diff --git a/inst/include/propr/kernels/cuda/detail/lrm.cuh b/inst/include/propr/kernels/cuda/detail/lrm.cuh
@@ -13,43 +13,125 @@ namespace propr {
     namespace detail {
         namespace cuda {
 
+            // template <class Config>
+            // __global__
+            // void
+            // lrm_basic(float* __restrict__ d_Y, offset_t d_Y_stride,
+            //           float* __restrict__ d_mean,
+            //           int nb_samples,
+            //           int nb_genes) {
+            //     int i = blockIdx.x * blockDim.x + threadIdx.x;
+            //     int j = blockIdx.y * blockDim.y + threadIdx.y;
+            //     if (i >= nb_genes || j >= i) return;
+
+            //     float4 accum = {0.0f, 0.0f, 0.0f, 0.0f};
+            //     int k = 0;
+            //     PROPR_UNROLL
+            //     for (; k < (nb_samples/4)*4; k += 4) {
+            //         float4 y_i = thread::load<Config::LoadModifer,float4>(&d_Y[k + i * d_Y_stride]);
+            //         float4 y_j = thread::load<Config::LoadModifer,float4>(&d_Y[k + j * d_Y_stride]);
+
+            //         accum.x = __logf(__fdividef(y_i.x, y_j.x)) +  accum.x;
+            //         accum.y = __logf(__fdividef(y_i.y, y_j.y)) +  accum.y;
+            //         accum.z = __logf(__fdividef(y_i.z, y_j.z)) +  accum.z;
+            //         accum.w = __logf(__fdividef(y_i.w, y_j.w)) +  accum.w;
+            //     }
+
+            //     accum.x = accum.x + accum.y + accum.z + accum.w;
+            //     for (; k < nb_samples; ++k) {
+            //         float yi = d_Y[k + i * d_Y_stride];
+            //         float yj = d_Y[k + j * d_Y_stride];
+            //         accum.x  =  __logf(__fdividef(yi, yj)) +  accum.x;
+            //     }
+
+            //     float inv_n = __frcp_rn(static_cast<float>(nb_samples));
+            //     float mean  = accum.x * inv_n;
+            //     int pair_index = (i * (i - 1)) / 2 + j;
+            //     d_mean[pair_index] = mean;
+            // }
+
             template <class Config>
             __global__
             void
-            lrm_basic(float* __restrict__ d_Y, offset_t d_Y_stride,
-                      float* __restrict__ d_mean,
-                      int nb_samples,
-                      int nb_genes) {
-                int i = blockIdx.x * blockDim.x + threadIdx.x;
-                int j = blockIdx.y * blockDim.y + threadIdx.y;
-                if (i >= nb_genes || j >= i) return;
-
-                float4 accum = {0.0f, 0.0f, 0.0f, 0.0f};
+            lrm_basic_phase_1(float* __restrict__ d_Y,  // Phase 1: one thread per gene g; stores mean over k < nb_samples of __logf(max(d_Y[g * d_Y_stride + k], EPS)) in d_mean_log[g]
+                               offset_t d_Y_stride,  // element distance between the starts of consecutive genes in d_Y
+                               float* __restrict__ d_mean_log,  // output; indices 0 .. nb_genes - 1 are written
+                               int nb_samples,  // number of values read per gene
+                               int nb_genes) {  // number of genes; threads with g >= nb_genes do nothing
+                const auto EPS = std::numeric_limits<float>::epsilon();  // lower clamp applied to every input before __logf
+                const int g = blockIdx.x * blockDim.x + threadIdx.x;  // flat 1D global thread index, used as the gene index
+                if (g >= nb_genes) return;  // tail guard for the last, partially filled block
+
+                const offset_t g_offset = static_cast<offset_t>(g) * d_Y_stride;  // g is widened to offset_t before the multiply
+
+                float s0 = 0.0;  // s0..s3: four independent float partial sums, one per float4 lane
+                float s1 = 0.0;
+                float s2 = 0.0;
+                float s3 = 0.0;
                 int k = 0;
+
                 PROPR_UNROLL
-                for (; k < (nb_samples/4)*4; k += 4) {
-                    float4 y_i = thread::load<Config::LoadModifer,float4>(&d_Y[k + i * d_Y_stride]);
-                    float4 y_j = thread::load<Config::LoadModifer,float4>(&d_Y[k + j * d_Y_stride]);                    
-                    
-                    accum.x = __logf(__fdividef(y_i.x, y_j.x)) +  accum.x;
-                    accum.y = __logf(__fdividef(y_i.y, y_j.y)) +  accum.y;
-                    accum.z = __logf(__fdividef(y_i.z, y_j.z)) +  accum.z;
-                    accum.w = __logf(__fdividef(y_i.w, y_j.w)) +  accum.w;
+                for (; k < (nb_samples / 4) * 4; k += 4) {  // main loop: 4 samples per iteration; the bound is nb_samples rounded down to a multiple of 4
+                    const float4 y = thread::load<Config::LoadModifer, float4>(&d_Y[g_offset + k]);  // NOTE(review): a float4 load presumably needs this address 16-byte aligned - confirm d_Y_stride and the allocation guarantee it
+                    s0 += __logf(fmaxf(y.x, EPS));  // clamp, log, accumulate lane x
+                    s1 += __logf(fmaxf(y.y, EPS));  // lane y
+                    s2 += __logf(fmaxf(y.z, EPS));  // lane z
+                    s3 += __logf(fmaxf(y.w, EPS));  // lane w
                 }
 
-                accum.x = accum.x + accum.y + accum.z + accum.w;
+                double sum = (s0 + s1) + (s2 + s3);  // the four partials are combined in float arithmetic, then the result is widened to double
                 for (; k < nb_samples; ++k) {
-                    float yi = d_Y[k + i * d_Y_stride];
-                    float yj = d_Y[k + j * d_Y_stride];
-                    accum.x  =  __logf(__fdividef(yi, yj)) +  accum.x;
+                    const float y = thread::load<Config::LoadModifer, float>(&d_Y[g_offset + k]);  // remainder: the nb_samples % 4 samples left after the vector loop
+                    sum += static_cast<double>(__logf(fmaxf(y, EPS)));  // same clamp + log; accumulated in double here
                 }
 
-                float inv_n = __frcp_rn(static_cast<float>(nb_samples));
-                float mean  = accum.x * inv_n;
-                int pair_index = (i * (i - 1)) / 2 + j;
-                d_mean[pair_index] = mean;
+                const float mean_log = static_cast<float>(sum / static_cast<double>(nb_samples));  // division is done in double and narrowed to float; NOTE(review): nb_samples == 0 would divide 0.0 by 0.0 - confirm callers never pass it
+                thread::store<Config::StoreModifer, float>(&d_mean_log[g], mean_log);  // one value per gene, stored with Config's store modifier
+            }
+
+            template <class Config>
+            __global__
+            void
+            lrm_basic_phase_2(float* __restrict__ d_mean_log,  // Phase 2: for every pair gj < gi < nb_genes, writes d_mean_log[gi] - d_mean_log[gj] into d_mean at a packed pair index
+                              float* __restrict__ d_mean,  // output; element gi * (gi - 1) / 2 + gj is written for each pair
+                              int nb_genes) {  // valid indices into d_mean_log are 0 .. nb_genes - 1
+                using P2_Layout = typename Config::P2_Layout;
+                static_assert(P2_Layout::BLK_X == P2_Layout::BLK_Y, "Tile size must be square");
+                constexpr int TILE_G = P2_Layout::BLK_X;  // gi/gj below are computed with TILE_G rather than blockDim, so the launch block is expected to be TILE_G x TILE_G threads
+
+                const int li = threadIdx.x;  // position inside the tile along the gi axis
+                const int lj = threadIdx.y;  // position inside the tile along the gj axis
+
+                const int gi = blockIdx.x * TILE_G + li;  // global index of the first gene of the pair
+                const int gj = blockIdx.y * TILE_G + lj;  // global index of the second gene of the pair
+
+                if (blockIdx.y > blockIdx.x) return;  // in such a tile every gj exceeds every gi, so nothing passes the gj < gi test; the condition is identical for all threads of a block, so the barrier below is never reached by only part of a block
+
+                __shared__ float sh_i[TILE_G], sh_j[TILE_G];  // per-block staging of the d_mean_log entries for this tile's gi range and gj range
+
+                if (lj == 0) {  // the threads with threadIdx.y == 0 fill sh_i, one element each
+                    sh_i[li] = (gi < nb_genes)
+                        ? thread::load<Config::LoadModifer, float>(&d_mean_log[gi])
+                        : 0.0f;  // out-of-range slots get 0.0f; the gi < nb_genes test before the store keeps them out of the output
+                }
+
+                if (li == 0) {  // the threads with threadIdx.x == 0 fill sh_j, one element each
+                    sh_j[lj] = (gj < nb_genes)
+                        ? thread::load<Config::LoadModifer, float>(&d_mean_log[gj])
+                        : 0.0f;  // same 0.0f filler for out-of-range gj
+                }
+
+                __syncthreads();  // every sh_i / sh_j write must complete before any thread of the block reads them
+
+                if (gi < nb_genes && gj < nb_genes && gj < gi) {  // in-range pairs with gj strictly below gi only
+                    const offset_t pair_index =
+                        (static_cast<offset_t>(gi) * static_cast<offset_t>(gi - 1)) / 2 +  // both factors are widened to offset_t before the multiply
+                        static_cast<offset_t>(gj);
+                    thread::store<Config::StoreModifer, float>(&d_mean[pair_index], sh_i[li] - sh_j[lj]);  // value stored for the pair: d_mean_log[gi] - d_mean_log[gj]
+                }
             }
 
+
             template<class Config>
             __global__
             void
@@ -354,4 +436,4 @@ namespace propr {
 
         }
     }
-}
+}
diff --git a/inst/include/propr/kernels/cuda/dispatch/comparison.cuh b/inst/include/propr/kernels/cuda/dispatch/comparison.cuh
@@ -5,9 +5,9 @@
 namespace propr {
     namespace dispatch {
         namespace cuda {
-            int count_less_than         (Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);
-            int count_greater_than      (Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);
-            int count_less_equal_than   (Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);
+            int count_less_than(Rcpp::NumericVector& x, double cutoff, propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);
+            int count_greater_than(Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);
+            int count_less_equal_than(Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);
             int count_greater_equal_than(Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);
         }
     }
diff --git a/inst/include/propr/kernels/cuda/traits/lrm.cuh b/inst/include/propr/kernels/cuda/traits/lrm.cuh
@@ -7,7 +7,10 @@
 namespace propr {
     namespace cuda {
         namespace traits {
-            struct lrm_basic : thread_layout_2d<>{
+            struct lrm_basic {  // no longer inherits one thread layout; it now carries a separate layout alias per phase
+                using P1_Layout = thread_layout_1d<256>;  // presumably 256 threads per 1D block, for the phase whose launch reads Config::P1_Layout::BLK_X - TODO confirm against thread_layout_1d
+                using P2_Layout = thread_layout_2d<16,16>;  // presumably a 16 x 16 thread block; phase 2 static_asserts that its BLK_X equals its BLK_Y - TODO confirm against thread_layout_2d
+            
                 const static cub::CacheLoadModifier  LoadModifer  = cub::LOAD_CG;
                 const static cub::CacheStoreModifier StoreModifer = cub::STORE_CG;
             };
@@ -28,4 +31,4 @@ namespace propr {
             };
         }
     }
-}
+}
diff --git a/src/dispatch/cpu/CMakeLists.txt b/src/dispatch/cpu/CMakeLists.txt
@@ -7,6 +7,7 @@ set(PROPR_SOURCES
         ${CMAKE_CURRENT_SOURCE_DIR}/lrm.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/lrv.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/omega.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/genewise.cpp
         ${PROPR_SOURCES}
         PARENT_SCOPE
 )
diff --git a/src/dispatch/cuda/lrm.cu b/src/dispatch/cuda/lrm.cu
@@ -26,22 +26,37 @@ propr::dispatch::cuda::lrm_basic(NumericVector& out, NumericMatrix &Y, propr::pr
     float* d_Y;
     offset_t stride; d_Y = RcppMatrixToDevice<float>(Y, stride);
 
+    float* d_mean_log;
+    PROPR_CUDA_CHECK(cudaMalloc(&d_mean_log, static_cast<size_t>(N_genes) * sizeof(float)));
+
     float* d_mean;
     PROPR_CUDA_CHECK(cudaMalloc(&d_mean, N_pairs * sizeof(float)));
 
-    dim3 blockDim(Config::BLK_X, Config::BLK_Y);
-    dim3 gridDim(propr::ceil_div(N_genes, Config::BLK_X),  propr::ceil_div(N_genes, Config::BLK_Y));
+    dim3 block1(Config::P1_Layout::BLK_X);
+    dim3 grid1(propr::ceil_div(N_genes, Config::P1_Layout::BLK_X));
+
+    dim3 block2(Config::P2_Layout::BLK_X, Config::P2_Layout::BLK_Y);
+    dim3 grid2(propr::ceil_div(N_genes, Config::P2_Layout::BLK_X),
+               propr::ceil_div(N_genes, Config::P2_Layout::BLK_Y));
+               
 
     {
         PROPR_PROFILE_CUDA("kernel", context.stream);
-        propr::detail::cuda::lrm_basic<Config><<<gridDim, blockDim, 0, context.stream>>>(
-            d_Y, stride, d_mean, N_samples, N_genes
+        propr::detail::cuda::lrm_basic_phase_1<Config><<<grid1, block1, 0, context.stream>>>(
+            d_Y, stride, d_mean_log, N_samples, N_genes
+        );
+        PROPR_CUDA_CHECK(cudaGetLastError());
+
+        propr::detail::cuda::lrm_basic_phase_2<Config><<<grid2, block2, 0, context.stream>>>(
+            d_mean_log, d_mean, N_genes
         );
+        PROPR_CUDA_CHECK(cudaGetLastError());
         PROPR_STREAM_SYNCHRONIZE(context);
     }
 
     copyToNumericVector(d_mean, out, N_pairs);
     PROPR_CUDA_CHECK(cudaFree(d_Y));
+    PROPR_CUDA_CHECK(cudaFree(d_mean_log));
     PROPR_CUDA_CHECK(cudaFree(d_mean));
 }
 
@@ -167,4 +182,4 @@ propr::dispatch::cuda::lrm_alpha_weighted(NumericVector& out,
     PROPR_CUDA_CHECK(cudaFree(d_Yfull));
     PROPR_CUDA_CHECK(cudaFree(d_Wfull));
     PROPR_CUDA_CHECK(cudaFree(d_means));
-}
+}
diff --git a/src/dispatch/runtime/resolve_backend.cpp b/src/dispatch/runtime/resolve_backend.cpp
@@ -24,7 +24,7 @@ Backend resolve_backend(const Rcpp::String& requested) {
         }
     }
 
-    if (req == "cuda") {
+    if (req == "cuda" || req == "gpu" ) {
         if (cuda_is_available()) return Backend::CUDA;
         static bool warned = false;
         if (!warned) {

Original file line number	Diff line number	Diff line change
`@@ -5,9 +5,9 @@`
`5`	`5`	`namespace propr {`
`6`	`6`	`namespace dispatch {`
`7`	`7`	`namespace cuda {`
`8`		`- int count_less_than (Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);`
`9`		`- int count_greater_than (Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);`
`10`		`- int count_less_equal_than (Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);`
	`8`	`+ int count_less_than(Rcpp::NumericVector& x, double cutoff, propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);`
	`9`	`+ int count_greater_than(Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);`
	`10`	`+ int count_less_equal_than(Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);`
`11`	`11`	`int count_greater_equal_than(Rcpp::NumericVector& x, double cutoff,propr::propr_context context=DEFAULT_GLOBAL_CONTEXT);`
`12`	`12`	`}`
`13`	`13`	`}`
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@ set(PROPR_SOURCES`
`7`	`7`	`${CMAKE_CURRENT_SOURCE_DIR}/lrm.cpp`
`8`	`8`	`${CMAKE_CURRENT_SOURCE_DIR}/lrv.cpp`
`9`	`9`	`${CMAKE_CURRENT_SOURCE_DIR}/omega.cpp`
	`10`	`+ ${CMAKE_CURRENT_SOURCE_DIR}/genewise.cpp`
`10`	`11`	`${PROPR_SOURCES}`
`11`	`12`	`PARENT_SCOPE`
`12`	`13`	`)`
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ Backend resolve_backend(const Rcpp::String& requested) {`
`24`	`24`	`}`
`25`	`25`	`}`
`26`	`26`
`27`		`- if (req == "cuda") {`
	`27`	`+ if (req == "cuda" \|\| req == "gpu" ) {`
`28`	`28`	`if (cuda_is_available()) return Backend::CUDA;`
`29`	`29`	`static bool warned = false;`
`30`	`30`	`if (!warned) {`