NVIDIA · davebayer · May 4, 2026 · miscco · May 4, 2026 · miscco
@@ -120,12 +120,13 @@ class group
   }
 
 public:
-  using unit_type             = _Unit;
-  using level_type            = typename _ParentGroup::level_type;
-  using hierarchy_type        = typename _ParentGroup::hierarchy_type;
-  using mapping_type          = _Mapping;
-  using __mapping_result_type = _MappingResult;
-  using synchronizer_type     = _Synchronizer;
+  using unit_type                    = _Unit;
+  using level_type                   = typename _ParentGroup::level_type;
+  using hierarchy_type               = typename _ParentGroup::hierarchy_type;
+  using mapping_type                 = _Mapping;
+  using __mapping_result_type        = _MappingResult;
+  using synchronizer_type            = _Synchronizer;
+  using __synchronizer_instance_type = _SynchronizerInstance;
 
   _CCCL_DEVICE_API explicit group(
     const _Unit& __unit,
@@ -162,6 +163,11 @@ public:
     return __synchronizer_;
   }
 
+  [[nodiscard]] _CCCL_DEVICE_API const _SynchronizerInstance& __synchronizer_instance() const noexcept
+  {
+    return __synchronizer_instance_;
+  }
+
   // todo(dabayer): Do we want to expose .arrive() and .wait()? Do we want to implement .sync() using them? Do we want
   //                aligned/unaligned variants?
   _CCCL_DEVICE_API void sync() noexcept

@@ -55,6 +55,11 @@ public:
       return {0u};
     }
 
+    [[nodiscard]] _CCCL_DEVICE_API unsigned __lane_mask() const noexcept
+    {
+      return __lane_mask_;
+    }
+
     template <class _MappingResult>
     _CCCL_DEVICE_API void do_sync(const _MappingResult&, const lane_synchronizer&) const noexcept
     {

@@ -116,46 +116,60 @@ __device__ cuda::std::optional<T> sum(cudax::this_cluster<Hierarchy> group, T (&
   return (cuda::gpu_thread.is_root_rank(group)) ? cuda::std::optional{result} : cuda::std::nullopt;
 }
 
-// todo(dabayer): Add support for warp and cluster levels.
 template <class Group, class T, cuda::std::size_t N>
 __device__ cuda::std::optional<T> sum(Group group, T (&array)[N])
 {
-  using Unit          = typename Group::unit_type;
-  using MappingResult = typename Group::__mapping_result_type;
-
-  constexpr auto ngroups = MappingResult::static_group_count();
-  static_assert(ngroups != cuda::std::dynamic_extent, "group count must be statically known");
-
-  __shared__ T group_sums[ngroups];
+  using Level = typename Group::level_type;
 
-  if (!Unit{}.is_part_of(group))
+  if constexpr (cuda::std::is_same_v<Level, cuda::warp_level>)
   {
-    return cuda::std::nullopt;
+    const auto result_unit = sum(cudax::this_thread{group.hierarchy()}, array);
+
+    // todo(dabayer): Implement fallback for cc < 80.
+    T result;
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+                 ({ result = __reduce_add_sync(group.__synchronizer_instance().__lane_mask(), result_unit.value()); }))
+    return (cuda::gpu_thread.is_root_rank(group)) ? cuda::std::optional{result} : cuda::std::nullopt;
   }
+  else
+  {
+    using Unit          = typename Group::unit_type;
+    using MappingResult = typename Group::__mapping_result_type;
 
-  // todo(dabayer): Replace by group.rank(level) once this query is available.
-  const auto group_rank = group.__mapping_result().group_rank();
+    constexpr auto ngroups = MappingResult::static_group_count();
+    static_assert(ngroups != cuda::std::dynamic_extent, "group count must be statically known");
 
-  if (cuda::gpu_thread.is_root_rank(group))
-  {
-    group_sums[group_rank] = 0;
-  }
+    __shared__ T group_sums[ngroups];
 
-  const auto unit_group  = cudax::make_this_group(Unit{}, group.hierarchy());
-  const auto result_unit = sum(unit_group, array);
+    if (!Unit{}.is_part_of(group))
+    {
+      return cuda::std::nullopt;
+    }
 
-  // Wait until group_sums are are filled with 0.
-  group.sync_aligned();
+    const auto group_rank = group.rank(Level{});
 
-  if (cuda::gpu_thread.is_root_rank(unit_group))
-  {
-    cuda::atomic_ref<T, cuda::thread_scope_block>{group_sums[group_rank]} += result_unit.value();
-  }
+    if (cuda::gpu_thread.is_root_rank(group))
+    {
+      group_sums[group_rank] = 0;
+    }
+
+    const auto unit_group  = cudax::make_this_group(Unit{}, group.hierarchy());
+    const auto result_unit = sum(unit_group, array);
+
+    // Wait until group_sums are are filled with 0.
+    group.sync_aligned();
 
-  // Wait until all unit_group roots add the intermediate sum to the shared memory.
-  group.sync_aligned();
+    if (cuda::gpu_thread.is_root_rank(unit_group))
+    {
+      constexpr auto min_thread_scope = cudax::__minimum_required_scope_for<Level>();
+      cuda::atomic_ref<T, min_thread_scope>{group_sums[group_rank]} += result_unit.value();
+    }
 
-  return (cuda::gpu_thread.is_root_rank(group)) ? cuda::std::optional{group_sums[group_rank]} : cuda::std::nullopt;
+    // Wait until all unit_group roots add the intermediate sum to the shared memory.
+    group.sync_aligned();
+
+    return (cuda::gpu_thread.is_root_rank(group)) ? cuda::std::optional{group_sums[group_rank]} : cuda::std::nullopt;
+  }
 }
 
 template <class Group>
@@ -190,10 +204,24 @@ struct TestKernel
     test_cooperative_algorithm(cudax::this_block{config});
     test_cooperative_algorithm(cudax::this_cluster{config});
 
+    // todo(dabayer): Enable these once cc < 75 fallback for __reduce_add is implemented.
+    NV_IF_TARGET(
+      NV_PROVIDES_SM_80, ({
+        test_cooperative_algorithm(
+          cudax::group{cuda::gpu_thread, cudax::this_warp{config}, cudax::group_by<2>{}, cudax::lane_synchronizer{}});
+        test_cooperative_algorithm(
+          cudax::group{cuda::gpu_thread, cudax::this_warp{config}, cudax::group_by<16>{}, cudax::lane_synchronizer{}});
+      }))
+
     test_cooperative_algorithm(
       cudax::group{cuda::gpu_thread, cudax::this_block{config}, cudax::group_by<2>{}, cudax::lane_synchronizer{}});
     test_cooperative_algorithm(
       cudax::group{cuda::gpu_thread, cudax::this_block{config}, cudax::group_by<16>{}, cudax::lane_synchronizer{}});
+
+    test_cooperative_algorithm(
+      cudax::group{cuda::gpu_thread, cudax::this_cluster{config}, cudax::group_by<2>{}, cudax::lane_synchronizer{}});
+    test_cooperative_algorithm(
+      cudax::group{cuda::gpu_thread, cudax::this_cluster{config}, cudax::group_by<16>{}, cudax::lane_synchronizer{}});
   }
 };
 } // namespace