[CUB] Add DeviceFind lower/upper bound for sorted values via merge-path#8780
[CUB] Add DeviceFind lower/upper bound for sorted values via merge-path#8780AneeshGidda wants to merge 3 commits intoNVIDIA:mainfrom
Conversation
f0ce49d to
2818850
Compare
2818850 to
ed7170f
Compare
fbusato
left a comment
There was a problem hiding this comment.
thanks a lot for the contribution, @AneeshGidda! and sorry for the delay. The results look amazing. I started the review and I added some comments
| const int needles_count = total_in_tile - haystack_count; | ||
|
|
||
| { | ||
| auto d_range_cm = try_make_cache_modified_iterator<LoadModifier>(d_range + range_beg); |
There was a problem hiding this comment.
| auto d_range_cm = try_make_cache_modified_iterator<LoadModifier>(d_range + range_beg); | |
| const auto d_range_cm = try_make_cache_modified_iterator<LoadModifier>(d_range + range_beg); |
| for (int i = threadIdx.x; i < haystack_count; i += BlockThreads) | ||
| { | ||
| storage.haystack[i] = d_range_cm[i]; | ||
| } |
There was a problem hiding this comment.
minor. we could rewrite this loop with a fixed number of iterations + unroll, and check if (index < haystack_count)
|
|
||
| { | ||
| auto d_values_cm = try_make_cache_modified_iterator<LoadModifier>(d_values + values_beg); | ||
| for (int i = threadIdx.x; i < needles_count; i += BlockThreads) |
|
|
||
| const auto partition_comp = Mode::make_partition_comp(compare_op); | ||
|
|
||
| const int d0_thread = |
There was a problem hiding this comment.
suggestion. Move ItemsPerThread * static_cast<int>(threadIdx.x) to a variable because it is in both branches.
There was a problem hiding this comment.
I don't think (::cuda::std::min) is still needed
| IsFullTile ? (ItemsPerThread * static_cast<int>(threadIdx.x)) | ||
| : (::cuda::std::min) (ItemsPerThread * static_cast<int>(threadIdx.x), total_in_tile); | ||
|
|
||
| const int i0 = static_cast<int>( |
There was a problem hiding this comment.
question. Is static_cast<int>( required here? cub::MergePath should return the right type
| HaystackIt d_range, | ||
| Offset range_count, | ||
| NeedlesIt d_values, | ||
| Offset values_count, | ||
| Offset num_diagonals, | ||
| Offset* range_beg_offsets, | ||
| PartitionCompOp partition_comp) |
There was a problem hiding this comment.
suggestion, use _CCCL_GRID_CONSTANT where possible
| return error; | ||
| } | ||
|
|
||
| return dispatch_compute_cap(policy_selector, cc, [&](auto policy_getter) -> cudaError_t { |
There was a problem hiding this comment.
I think this should have a fully qualified namespace
| using traits_t = policy_traits<decltype(policy_getter)>; | ||
|
|
||
| const Offset total_items = range_count + values_count; | ||
| const Offset num_tiles = ::cuda::ceil_div(total_items, static_cast<Offset>(traits_t::tile_size)); |
There was a problem hiding this comment.
| const Offset num_tiles = ::cuda::ceil_div(total_items, static_cast<Offset>(traits_t::tile_size)); | |
| const Offset num_tiles = ::cuda::ceil_div(total_items, Offset{traits_t::tile_size}); |
| // Lightweight pass; not worth exposing through the tuning system. | ||
| constexpr int threads_per_partition_block = 256; | ||
| const int partition_grid_size = | ||
| static_cast<int>(::cuda::ceil_div(num_diagonals, static_cast<Offset>(threads_per_partition_block))); |
There was a problem hiding this comment.
| static_cast<int>(::cuda::ceil_div(num_diagonals, static_cast<Offset>(threads_per_partition_block))); | |
| static_cast<int>(::cuda::ceil_div(num_diagonals, Offset{threads_per_partition_block})); |
|
|
||
| #include <thrust/system/cuda/detail/core/triple_chevron_launch.h> | ||
|
|
||
| #include <cuda/std/__algorithm/min.h> |
There was a problem hiding this comment.
missing headers
#include <cuda/__cmath/ceil_div.h>
#include <cuda/std/__type_traits/is_empty.h>
Description
closes #7964
Adds two new
cub::DeviceFindalgorithms that exploit the additional precondition that the values (needles) are sorted, in addition to the range (haystack):cub::DeviceFind::LowerBoundSortedValuescub::DeviceFind::UpperBoundSortedValuesThe implementation uses the Merge-Path algorithm to partition the combined traversal across thread blocks, achieving O(N + M) total device work versus the O(M log N) of the existing
LowerBound/UpperBound(which perform M independent binary searches).Public API
Both
[d_range, d_range + range_num_items)and[d_values, d_values + values_num_items)must be sorted consistently withcomp.Files added
cub/device/device_find.cuhcub/agent/agent_find_bound_sorted_values.cuhcub/device/dispatch/dispatch_find_bound_sorted_values.cuhcub/device/dispatch/tuning/tuning_find_bound_sorted_values.cuhcub/test/catch2_test_device_find_bound_sorted_values.cucub/benchmarks/bench/find_bound/{lower_bound,upper_bound,lower_bound_sorted_values,upper_bound_sorted_values}.cu+ sharedfind_bound_common.cuhBenchmarks
Tested on RTX 4090. Speedup =
LowerBoundtime ÷LowerBoundSortedValuestime (and analogously for upper)LowerBoundSortedValuesvsLowerBoundUpperBoundSortedValuesvsUpperBoundTakeaways
Checklist