Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 12 additions & 24 deletions include/oneapi/dpl/pstl/algorithm_fwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -452,25 +452,15 @@ std::pair<_DifferenceType, _DifferenceType>
__brick_compute_mask(_RandomAccessIterator, _DifferenceType, _IterPredicate, bool*,
/*vector=*/std::true_type) noexcept;

template <class _ForwardIterator, class _OutputIterator>
void
__brick_copy_by_mask(_ForwardIterator, _ForwardIterator, _OutputIterator, bool*,
/*vector=*/::std::false_type) noexcept;

template <class _RandomAccessIterator, class _OutputIterator>
void
__brick_copy_by_mask(_RandomAccessIterator, _RandomAccessIterator, _OutputIterator, bool*,
/*vector=*/::std::true_type) noexcept;

template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
template <bool, class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
_Bound
__brick_bounded_copy_by_mask(_RandomAccessIterator1, _Bound, _RandomAccessIterator2, _Bound, bool*, _Assigner,
/*vector=*/std::false_type) noexcept;
__brick_copy_by_mask(_RandomAccessIterator1, _Bound, _RandomAccessIterator2, _Bound, bool*, _Assigner,
/*vector=*/std::false_type) noexcept;

template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
template <bool, class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
_Bound
__brick_bounded_copy_by_mask(_RandomAccessIterator1, _Bound, _RandomAccessIterator2, _Bound, bool*, _Assigner,
/*vector=*/std::true_type) noexcept;
__brick_copy_by_mask(_RandomAccessIterator1, _Bound, _RandomAccessIterator2, _Bound, bool*, _Assigner,
/*vector=*/std::true_type) noexcept;

template <class _ForwardIterator, class _OutputIterator1, class _OutputIterator2>
void
Expand All @@ -484,9 +474,9 @@ __brick_partition_by_mask(_RandomAccessIterator, _RandomAccessIterator, _OutputI

template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _DifferenceType,
class _RandomAccessIterator2, class _IterPredicate>
_RandomAccessIterator2
std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
__parallel_selective_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&&, _RandomAccessIterator1, _DifferenceType,
_RandomAccessIterator2, _IterPredicate);
_RandomAccessIterator2, _DifferenceType, _IterPredicate);

template <class _Tag, class _ExecutionPolicy, class _ForwardIterator, class _OutputIterator, class _UnaryPredicate>
_OutputIterator
Expand All @@ -499,13 +489,11 @@ _RandomAccessIterator2
__pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&&, _RandomAccessIterator1, _RandomAccessIterator1,
_RandomAccessIterator2, _UnaryPredicate);

template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
class _UnaryPredicate>
template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _DifferenceType,
class _RandomAccessIterator2, class _UnaryPredicate>
std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
__pattern_bounded_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&&, _RandomAccessIterator1,
typename std::iterator_traits<_RandomAccessIterator1>::difference_type,
_RandomAccessIterator2,
typename std::iterator_traits<_RandomAccessIterator2>::difference_type, _UnaryPredicate);
__pattern_bounded_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&&, _RandomAccessIterator1, _DifferenceType,
_RandomAccessIterator2, _DifferenceType, _UnaryPredicate);

//------------------------------------------------------------------------
// count
Expand Down
171 changes: 52 additions & 119 deletions include/oneapi/dpl/pstl/algorithm_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1267,75 +1267,32 @@ __brick_compute_mask(_RandomAccessIterator __first, _DifferenceType __len, _Iter
return std::make_pair(__count_true, __len - __count_true);
}

template <class _ForwardIterator, class _OutputIterator, class _Assigner>
void
__brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, bool* __mask,
_Assigner __assigner, /*vector=*/::std::false_type) noexcept
{
for (; __first != __last; ++__first, (void)++__mask)
{
if (*__mask)
{
__assigner(__first, __result);
++__result;
}
}
}

template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _Assigner>
void
__brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
bool* __mask, _Assigner __assigner, /*vector=*/::std::true_type) noexcept
{
__unseq_backend::__simd_copy_by_mask(__first, __last - __first, __result, __mask, __assigner);
}

template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
template <bool __Bounded, class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
_Bound
__brick_bounded_copy_by_mask(_RandomAccessIterator1 __first, _Bound __in_len, _RandomAccessIterator2 __result,
_Bound __out_len, bool* __mask, _Assigner __assigner, /*vector=*/std::false_type) noexcept
__brick_copy_by_mask(_RandomAccessIterator1 __first, _Bound __in_len, _RandomAccessIterator2 __result, _Bound __out_len,
bool* __mask, _Assigner __assigner, /*vector=*/std::false_type) noexcept
{
_Bound __i = 0, __j = 0;
for (; __i < __in_len && __j < __out_len; ++__i, (void)++__first)
for (; __i < __in_len; ++__i)
{
if (__mask[__i])
{
__assigner(__first, __result);
++__result;
if constexpr (__Bounded)
if (__j == __out_len)
break;
__assigner(__first + __i, __result + __j);
++__j;
}
}
return __i;
}

template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
template <bool __Bounded, class _RandomAccessIterator1, class _RandomAccessIterator2, class _Bound, class _Assigner>
_Bound
__brick_bounded_copy_by_mask(_RandomAccessIterator1 __first, _Bound __in_len, _RandomAccessIterator2 __result,
_Bound __out_len, bool* __mask, _Assigner __assigner,
/*vector=*/std::true_type) noexcept
__brick_copy_by_mask(_RandomAccessIterator1 __first, _Bound __in_len, _RandomAccessIterator2 __result, _Bound __out_len,
bool* __mask, _Assigner __assigner, /*vector=*/std::true_type) noexcept
{
#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT)
_Bound __n = __in_len, __m = __out_len;
while (__m > 0 && __m < __n)
{
_Bound __copied = __unseq_backend::__simd_copy_by_mask(__first, __m, __result, __mask, __assigner);
__n -= __m;
__first += __m;
__mask += __m;
__m -= __copied;
__result += __copied;
}
// The loop above may not decrease __m or __n below 0
if (__m >= __n) // enough space left for the rest
{
__unseq_backend::__simd_copy_by_mask(__first, __n, __result, __mask, __assigner);
__n = 0;
}
return __in_len - __n;
#else
return __internal::__brick_bounded_copy_by_mask(__first, __in_len, __result, __out_len, __mask, __assigner,
std::false_type());
#endif
return __unseq_backend::__simd_copy_by_mask<__Bounded>(__first, __in_len, __result, __out_len, __mask, __assigner);
}

template <class _ForwardIterator, class _OutputIterator1, class _OutputIterator2>
Expand Down Expand Up @@ -1373,29 +1330,46 @@ __brick_partition_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1

template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _DifferenceType,
class _RandomAccessIterator2, class _IterPredicate>
_RandomAccessIterator2
std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
__parallel_selective_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
_DifferenceType __n, _RandomAccessIterator2 __result, _IterPredicate __pred)
_DifferenceType __n, _RandomAccessIterator2 __result, _DifferenceType __n_out,
_IterPredicate __pred)
{
using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
__par_backend::__buffer<bool> __mask_buf(__n);
bool* __mask = __mask_buf.get();

return __internal::__except_handler([&__exec, __n, __first, __result, __pred, __mask]() {
_DifferenceType __m{};
return __internal::__except_handler([&__exec, __n, __first, __result, __pred, __mask, __n_out]() {
_DifferenceType __stop_in{__n}, __stop_out{__n_out};
__par_backend::__parallel_strict_scan(
__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0),
[=](_DifferenceType __i, _DifferenceType __len) { // Reduce
return __internal::__brick_compute_mask(__first + __i, __len, __pred, __mask + __i, _IsVector{}).first;
},
std::plus<_DifferenceType>(), // Combine
[=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan
__internal::__brick_copy_by_mask(
__first + __i, __first + (__i + __len), __result + __initial, __mask + __i,
[](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{});
[=, &__stop_in](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan
if (__initial > __n_out) // The chunk has neither elements to write nor the stop position
return;
auto __assign = [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; };
_DifferenceType __space = __n_out - __initial;
if (__space >= __len)
{
__internal::__brick_copy_by_mask</*bounded =*/ false>(
__first + __i, __len, __result + __initial, __space, __mask + __i, __assign, _IsVector{});
}
else
{
_DifferenceType __stop = __internal::__brick_copy_by_mask</*bounded =*/ true>(
__first + __i, __len, __result + __initial, __space, __mask + __i, __assign, _IsVector{});
if (__stop != __len) // Found the position of the first element that cannot be copied
__stop_in = __i + __stop; // Since there is only one such position, there is no data race
}
},
[&__m](_DifferenceType __total) { __m = __total; }); // Apex
return __result + __m;
[&__stop_out](_DifferenceType __total) { // Apex
if (__total < __stop_out) // Output size is bigger than needed
__stop_out = __total;
});
return std::make_pair(__first + __stop_in, __result + __stop_out);
});
}

Expand All @@ -1420,64 +1394,23 @@ __pattern_copy_if(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _R
if (_DifferenceType(1) < __n)
{
return __parallel_selective_copy(
__tag, std::forward<_ExecutionPolicy>(__exec), __first, __n, __result,
[&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx) { return __pred(__it[__idx]); });
__tag, std::forward<_ExecutionPolicy>(__exec), __first, __n, __result, __n,
[&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx) { return __pred(__it[__idx]); }).second;
}
// trivial sequence - use serial algorithm
return __internal::__brick_copy_if(__first, __last, __result, __pred, _IsVector{});
}

template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
class _UnaryPredicate>
template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _DifferenceType,
class _RandomAccessIterator2, class _UnaryPredicate>
std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
__pattern_bounded_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
typename std::iterator_traits<_RandomAccessIterator1>::difference_type __n,
_RandomAccessIterator2 __result,
typename std::iterator_traits<_RandomAccessIterator2>::difference_type __n_out,
__pattern_bounded_copy_if(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
_DifferenceType __n, _RandomAccessIterator2 __result, _DifferenceType __n_out,
_UnaryPredicate __pred)
{
using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
using _DifferenceType = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;

__par_backend::__buffer<bool> __mask_buf(__n);
bool* __mask = __mask_buf.get();
auto __it_pred = [=](_RandomAccessIterator1 __it, _DifferenceType __idx) {
return std::invoke(__pred, __it[__idx]);
};
return __internal::__except_handler([&__exec, __n, __first, __result, __it_pred, __mask, __n_out]() {
_DifferenceType __res_in{__n}, __res_out{__n_out};
__par_backend::__parallel_strict_scan(
__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0),
[=](_DifferenceType __i, _DifferenceType __len) { // Reduce
return __internal::__brick_compute_mask(__first + __i, __len, __it_pred, __mask + __i, _IsVector{})
.first;
},
std::plus<_DifferenceType>(), // Combine
[=, &__res_in](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan
if (__initial > __n_out) // The chunk has neither elements to write nor the stop position
return;
bool* __mask_start = __mask + __i;
bool* __mask_end = __mask + (__i + __len);
if (__initial < __n_out)
{
_DifferenceType __checked = __internal::__brick_bounded_copy_by_mask(
__first + __i, __len, __result + __initial, __n_out - __initial, __mask_start,
[](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{});
if (__checked == __len)
return;
__mask_start += __checked;
}
bool* __stop =
__internal::__brick_find_if(__mask_start, __mask_end, oneapi::dpl::identity{}, _IsVector{});
if (__stop != __mask_end) // Found the position of the first element that cannot be copied
__res_in = __stop - __mask; // Since there is only one such position, there is no data race
},
[&__res_out](auto __total_out) { // Apex
if (__total_out < __res_out) // Output size is bigger than needed
__res_out = __total_out;
});
return std::make_pair(__first + __res_in, __result + __res_out);
});
return __parallel_selective_copy(
__tag, std::forward<_ExecutionPolicy>(__exec), __first, __n, __result, _DifferenceType{__n_out},
[&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx) { return __pred(__it[__idx]); });
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -1624,8 +1557,8 @@ __remove_elements(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA
},
::std::plus<_DifferenceType>(),
[=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) {
__internal::__brick_copy_by_mask(
__first + __i, __first + __i + __len, __result + __initial, __mask + __i,
__internal::__brick_copy_by_mask</*bounded*/ false>(
__first + __i, __len, __result + __initial, __len, __mask + __i,
[](_RandomAccessIterator __x, _Tp* __z) { ::new (std::addressof(*__z)) _Tp(std::move(*__x)); },
_IsVector{});
},
Expand Down Expand Up @@ -1712,10 +1645,10 @@ __pattern_unique_copy(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec
{
*__result++ = *__first++; // Always copy the first element
--__n;
return __parallel_selective_copy(__tag, std::forward<_ExecutionPolicy>(__exec), __first, __n, __result,
return __parallel_selective_copy(__tag, std::forward<_ExecutionPolicy>(__exec), __first, __n, __result, __n,
[&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx) {
return !__pred(__it[__idx], __it[__idx - 1]);
});
}).second;
}
// trivial sequence - use serial algorithm
return __internal::__brick_unique_copy(__first, __last, __result, __pred, _IsVector{});
Expand Down
5 changes: 3 additions & 2 deletions include/oneapi/dpl/pstl/algorithm_ranges_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,10 +544,11 @@ std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>, std::ran
__pattern_copy_if_ranges(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _InRange&& __in_r,
_OutRange&& __out_r, _Pred __pred, _Proj __proj)
{
using _Size = oneapi::dpl::__ranges::__common_size_t<_InRange, _OutRange>;
auto __first_in = std::ranges::begin(__in_r);
auto __first_out = std::ranges::begin(__out_r);
auto __sz_in = std::ranges::size(__in_r);
auto __sz_out = std::ranges::size(__out_r);
_Size __sz_in = std::ranges::size(__in_r);
_Size __sz_out = std::ranges::size(__out_r);

// TODO: test if redirecting to "regular" copy_if for sufficient output performs better
if (__sz_in > 0 && __sz_out > 0)
Expand Down
19 changes: 14 additions & 5 deletions include/oneapi/dpl/pstl/unseq_backend_simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,23 +297,32 @@ __simd_compute_mask(_Iterator __first, _DifferenceType __n, _IterPredicate __pre
return __count;
}

template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
template <bool __Bounded, class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
_DifferenceType
__simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask,
_Assigner __assigner) noexcept
__simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _DifferenceType __n_out,
bool* __mask, _Assigner __assign) noexcept
{
std::make_signed_t<_DifferenceType> __cnt = -1; // to use inclusive scan of the mask
_DifferenceType __stop = __n;
_ONEDPL_PRAGMA_SIMD_SCAN(+ : __cnt)
for (_DifferenceType __i = 0; __i < __n; ++__i)
{
__cnt += __mask[__i];
_ONEDPL_PRAGMA_SIMD_INCLUSIVE_SCAN(__cnt)
if (__mask[__i])
{
__assigner(__first + __i, __result + __cnt);
if constexpr (__Bounded)
{
if (__cnt < __n_out)
__assign(__first + __i, __result + __cnt);
if (__cnt == __n_out) // together with the mask, the conditions are true for only one index
__stop = __i;
}
else
__assign(__first + __i, __result + __cnt);
}
}
return __cnt + 1; // accounts for the initial -1
return __stop;
}

template <class _InputIterator, class _DifferenceType, class _OutputIterator1, class _OutputIterator2>
Expand Down
Loading