| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "gusev_d_double_sort_even_odd_batcher/tbb/include/ops_tbb.hpp" | ||
| 2 | |||
| 3 | #include <tbb/blocked_range.h> | ||
| 4 | #include <tbb/global_control.h> | ||
| 5 | #include <tbb/parallel_for.h> | ||
| 6 | #include <tbb/parallel_invoke.h> | ||
| 7 | #include <tbb/task_arena.h> | ||
| 8 | |||
| 9 | #include <algorithm> | ||
| 10 | #include <array> | ||
| 11 | #include <bit> | ||
| 12 | #include <cstddef> | ||
| 13 | #include <cstdint> | ||
| 14 | #include <iterator> | ||
| 15 | #include <utility> | ||
| 16 | #include <vector> | ||
| 17 | |||
| 18 | #include "gusev_d_double_sort_even_odd_batcher/tbb/include/common.hpp" | ||
| 19 | #include "util/include/util.hpp" | ||
| 20 | |||
| 21 | namespace gusev_d_double_sort_even_odd_batcher_tbb_task_threads { | ||
| 22 | namespace { | ||
| 23 | |||
| 24 | constexpr int kRadixPasses = 8; | ||
| 25 | constexpr int kBitsPerByte = 8; | ||
| 26 | constexpr size_t kRadixBuckets = 256; | ||
| 27 | constexpr uint64_t kBucketMask = 0xFFULL; | ||
| 28 | constexpr size_t kMinParallelElements = 128; | ||
| 29 | constexpr size_t kMinParallelPairs = 32; | ||
| 30 | |||
| 31 | static_assert((kRadixPasses % 2) == 0, "Radix sort expects the final data to remain in the input buffer"); | ||
| 32 | |||
| 33 | using Block = std::vector<ValueType>; | ||
| 34 | using BlockList = std::vector<Block>; | ||
| 35 | |||
| 36 | struct BlockRange { | ||
| 37 | size_t begin = 0; | ||
| 38 | size_t end = 0; | ||
| 39 | }; | ||
| 40 | |||
| 41 | uint64_t DoubleToSortableKey(ValueType value) { | ||
| 42 | const auto bits = std::bit_cast<uint64_t>(value); | ||
| 43 | const auto sign_mask = uint64_t{1} << 63; | ||
| 44 |
4/4✓ Branch 0 taken 94504 times.
✓ Branch 1 taken 81912 times.
✓ Branch 2 taken 94504 times.
✓ Branch 3 taken 81912 times.
|
352832 | return (bits & sign_mask) == 0 ? bits ^ sign_mask : ~bits; |
| 45 | } | ||
| 46 | |||
| 47 | size_t GetBucketIndex(ValueType value, int shift) { | ||
| 48 | 352832 | return static_cast<size_t>((DoubleToSortableKey(value) >> shift) & kBucketMask); | |
| 49 | } | ||
| 50 | |||
| 51 | void BuildPrefixSums(std::array<size_t, kRadixBuckets> &count) { | ||
| 52 | size_t prefix = 0; | ||
| 53 |
2/2✓ Branch 0 taken 331776 times.
✓ Branch 1 taken 1296 times.
|
333072 | for (auto &value : count) { |
| 54 | 331776 | const auto current = value; | |
| 55 | 331776 | value = prefix; | |
| 56 | 331776 | prefix += current; | |
| 57 | } | ||
| 58 | } | ||
| 59 | |||
| 60 |
2/2✓ Branch 0 taken 284 times.
✓ Branch 1 taken 162 times.
|
446 | void RadixSortDoubles(OutType &data) { |
| 61 |
2/2✓ Branch 0 taken 284 times.
✓ Branch 1 taken 162 times.
|
446 | if (data.size() < 2) { |
| 62 | 284 | return; | |
| 63 | } | ||
| 64 | |||
| 65 | 162 | OutType buffer(data.size()); | |
| 66 | auto *src = &data; | ||
| 67 | auto *dst = &buffer; | ||
| 68 | |||
| 69 |
2/2✓ Branch 0 taken 1296 times.
✓ Branch 1 taken 162 times.
|
1458 | for (int byte = 0; byte < kRadixPasses; ++byte) { |
| 70 | 1296 | std::array<size_t, kRadixBuckets> count{}; | |
| 71 | 1296 | const auto shift = byte * kBitsPerByte; | |
| 72 | |||
| 73 |
4/4✓ Branch 0 taken 94504 times.
✓ Branch 1 taken 81912 times.
✓ Branch 2 taken 176416 times.
✓ Branch 3 taken 1296 times.
|
177712 | for (ValueType value : *src) { |
| 74 | 176416 | count.at(GetBucketIndex(value, shift))++; | |
| 75 | } | ||
| 76 | BuildPrefixSums(count); | ||
| 77 | |||
| 78 |
4/4✓ Branch 0 taken 94504 times.
✓ Branch 1 taken 81912 times.
✓ Branch 2 taken 176416 times.
✓ Branch 3 taken 1296 times.
|
177712 | for (ValueType value : *src) { |
| 79 | const auto bucket = GetBucketIndex(value, shift); | ||
| 80 | 176416 | (*dst)[count.at(bucket)++] = value; | |
| 81 | } | ||
| 82 | |||
| 83 | std::swap(src, dst); | ||
| 84 | } | ||
| 85 | } | ||
| 86 | |||
| 87 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 740 times.
|
740 | void SplitByGlobalParity(const OutType &source, size_t global_offset, OutType &even_values, OutType &odd_values) { |
| 88 | even_values.clear(); | ||
| 89 | odd_values.clear(); | ||
| 90 | 740 | even_values.reserve((source.size() + 1) / 2); | |
| 91 | 740 | odd_values.reserve(source.size() / 2); | |
| 92 | |||
| 93 |
2/2✓ Branch 0 taken 27677 times.
✓ Branch 1 taken 740 times.
|
28417 | for (size_t i = 0; i < source.size(); ++i) { |
| 94 |
2/2✓ Branch 0 taken 13869 times.
✓ Branch 1 taken 13808 times.
|
27677 | if (((global_offset + i) & 1U) == 0U) { |
| 95 | even_values.push_back(source[i]); | ||
| 96 | } else { | ||
| 97 | odd_values.push_back(source[i]); | ||
| 98 | } | ||
| 99 | } | ||
| 100 | 740 | } | |
| 101 | |||
| 102 | 370 | OutType InterleaveParityGroups(size_t total_size, const OutType &even_values, const OutType &odd_values) { | |
| 103 | 370 | OutType result(total_size); | |
| 104 |
4/6✓ Branch 0 taken 18 times.
✓ Branch 1 taken 352 times.
✓ Branch 2 taken 18 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 18 times.
|
370 | if (total_size < kMinParallelElements || even_values.empty() || odd_values.empty()) { |
| 105 |
2/2✓ Branch 0 taken 1382 times.
✓ Branch 1 taken 352 times.
|
1734 | for (size_t even_index = 0; even_index < even_values.size(); ++even_index) { |
| 106 | 1382 | result[even_index * 2] = even_values[even_index]; | |
| 107 | } | ||
| 108 |
2/2✓ Branch 0 taken 1330 times.
✓ Branch 1 taken 352 times.
|
1682 | for (size_t odd_index = 0; odd_index < odd_values.size(); ++odd_index) { |
| 109 | 1330 | result[(odd_index * 2) + 1] = odd_values[odd_index]; | |
| 110 | } | ||
| 111 | return result; | ||
| 112 | } | ||
| 113 | |||
| 114 | 36 | tbb::parallel_invoke([&]() { | |
| 115 |
2/2✓ Branch 0 taken 12487 times.
✓ Branch 1 taken 18 times.
|
12505 | for (size_t even_index = 0; even_index < even_values.size(); ++even_index) { |
| 116 | 12487 | result[even_index * 2] = even_values[even_index]; | |
| 117 | } | ||
| 118 |
1/4✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
18 | }, [&]() { |
| 119 |
2/2✓ Branch 0 taken 12478 times.
✓ Branch 1 taken 18 times.
|
12496 | for (size_t odd_index = 0; odd_index < odd_values.size(); ++odd_index) { |
| 120 | 12478 | result[(odd_index * 2) + 1] = odd_values[odd_index]; | |
| 121 | } | ||
| 122 | }); | ||
| 123 | |||
| 124 | 18 | return result; | |
| 125 | } | ||
| 126 | |||
| 127 |
2/2✓ Branch 0 taken 2190 times.
✓ Branch 1 taken 25353 times.
|
27543 | void RunOddEvenPhase(OutType &data, size_t start) { |
| 128 | 27543 | const auto pair_count = (data.size() - start) / 2; | |
| 129 |
2/2✓ Branch 0 taken 2190 times.
✓ Branch 1 taken 25353 times.
|
27543 | if (pair_count < kMinParallelPairs) { |
| 130 |
2/2✓ Branch 0 taken 17088 times.
✓ Branch 1 taken 2190 times.
|
19278 | for (size_t pair_index = 0; pair_index < pair_count; ++pair_index) { |
| 131 |
2/2✓ Branch 0 taken 508 times.
✓ Branch 1 taken 16580 times.
|
17088 | const auto left_index = start + (pair_index * 2); |
| 132 |
2/2✓ Branch 0 taken 508 times.
✓ Branch 1 taken 16580 times.
|
17088 | if (data[left_index] > data[left_index + 1]) { |
| 133 | std::swap(data[left_index], data[left_index + 1]); | ||
| 134 | } | ||
| 135 | } | ||
| 136 | return; | ||
| 137 | } | ||
| 138 | |||
| 139 | 7136866 | tbb::parallel_for(tbb::blocked_range<size_t>(0, pair_count), [&](const tbb::blocked_range<size_t> &range) { | |
| 140 |
2/2✓ Branch 0 taken 35176682 times.
✓ Branch 1 taken 7111513 times.
|
42288195 | for (size_t pair_index = range.begin(); pair_index != range.end(); ++pair_index) { |
| 141 | 35176682 | const auto left_index = start + (pair_index * 2); | |
| 142 |
2/2✓ Branch 0 taken 5891 times.
✓ Branch 1 taken 35170791 times.
|
35176682 | if (data[left_index] > data[left_index + 1]) { |
| 143 | std::swap(data[left_index], data[left_index + 1]); | ||
| 144 | } | ||
| 145 | } | ||
| 146 | 7111513 | }); | |
| 147 | } | ||
| 148 | |||
| 149 | 370 | void OddEvenFinalize(OutType &data) { | |
| 150 |
2/2✓ Branch 0 taken 27677 times.
✓ Branch 1 taken 370 times.
|
28047 | for (size_t phase = 0; phase < data.size(); ++phase) { |
| 151 | 27677 | const auto start = phase & 1U; | |
| 152 |
2/2✓ Branch 0 taken 27543 times.
✓ Branch 1 taken 134 times.
|
27677 | if (data.size() > start + 1) { |
| 153 | 27543 | RunOddEvenPhase(data, start); | |
| 154 | } | ||
| 155 | } | ||
| 156 | 370 | } | |
| 157 | |||
| 158 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 370 times.
|
370 | void MergeParityGroups(const OutType &left_even, const OutType &right_even, const OutType &left_odd, |
| 159 | const OutType &right_odd, OutType &merged_even, OutType &merged_odd) { | ||
| 160 | merged_even.clear(); | ||
| 161 | merged_odd.clear(); | ||
| 162 | 370 | merged_even.reserve(left_even.size() + right_even.size()); | |
| 163 | 370 | merged_odd.reserve(left_odd.size() + right_odd.size()); | |
| 164 | |||
| 165 |
2/2✓ Branch 0 taken 352 times.
✓ Branch 1 taken 18 times.
|
370 | if ((merged_even.capacity() + merged_odd.capacity()) < kMinParallelElements) { |
| 166 | 352 | std::ranges::merge(left_even, right_even, std::back_inserter(merged_even)); | |
| 167 | 352 | std::ranges::merge(left_odd, right_odd, std::back_inserter(merged_odd)); | |
| 168 | 352 | return; | |
| 169 | } | ||
| 170 | |||
| 171 | 18 | tbb::parallel_invoke([&]() { std::ranges::merge(left_even, right_even, std::back_inserter(merged_even)); }, | |
| 172 | 36 | [&]() { std::ranges::merge(left_odd, right_odd, std::back_inserter(merged_odd)); }); | |
| 173 | } | ||
| 174 | |||
| 175 |
2/2✓ Branch 0 taken 352 times.
✓ Branch 1 taken 18 times.
|
370 | void SplitBlocksByParity(const OutType &left, const OutType &right, OutType &left_even, OutType &left_odd, |
| 176 | OutType &right_even, OutType &right_odd) { | ||
| 177 |
2/2✓ Branch 0 taken 352 times.
✓ Branch 1 taken 18 times.
|
370 | if ((left.size() + right.size()) < kMinParallelElements) { |
| 178 | 352 | SplitByGlobalParity(left, 0, left_even, left_odd); | |
| 179 | 352 | SplitByGlobalParity(right, left.size(), right_even, right_odd); | |
| 180 | 352 | return; | |
| 181 | } | ||
| 182 | |||
| 183 | 18 | tbb::parallel_invoke([&]() { SplitByGlobalParity(left, 0, left_even, left_odd); }, | |
| 184 | 36 | [&]() { SplitByGlobalParity(right, left.size(), right_even, right_odd); }); | |
| 185 | } | ||
| 186 | |||
| 187 | 370 | OutType MergeBatcherEvenOdd(const OutType &left, const OutType &right) { | |
| 188 | 370 | OutType left_even; | |
| 189 | 370 | OutType left_odd; | |
| 190 | 370 | OutType right_even; | |
| 191 | 370 | OutType right_odd; | |
| 192 | |||
| 193 |
1/2✓ Branch 1 taken 370 times.
✗ Branch 2 not taken.
|
370 | SplitBlocksByParity(left, right, left_even, left_odd, right_even, right_odd); |
| 194 | |||
| 195 | 370 | OutType merged_even; | |
| 196 | 370 | OutType merged_odd; | |
| 197 |
1/2✓ Branch 1 taken 370 times.
✗ Branch 2 not taken.
|
370 | MergeParityGroups(left_even, right_even, left_odd, right_odd, merged_even, merged_odd); |
| 198 | |||
| 199 |
1/2✓ Branch 1 taken 370 times.
✗ Branch 2 not taken.
|
370 | auto result = InterleaveParityGroups(left.size() + right.size(), merged_even, merged_odd); |
| 200 |
1/2✓ Branch 1 taken 370 times.
✗ Branch 2 not taken.
|
370 | OddEvenFinalize(result); |
| 201 | 370 | return result; | |
| 202 | } | ||
| 203 | |||
| 204 | BlockRange GetBlockRange(size_t block_index, size_t block_count, size_t total_size) { | ||
| 205 | return { | ||
| 206 | 446 | .begin = (block_index * total_size) / block_count, | |
| 207 | 446 | .end = ((block_index + 1) * total_size) / block_count, | |
| 208 | }; | ||
| 209 | } | ||
| 210 | |||
| 211 | size_t GetBlockCount(size_t input_size, size_t parallelism) { | ||
| 212 |
2/2✓ Branch 0 taken 57 times.
✓ Branch 1 taken 19 times.
|
76 | return std::max<size_t>(1, std::min(input_size, parallelism)); |
| 213 | } | ||
| 214 | |||
| 215 | 446 | void FillAndSortBlock(const InType &input, Block &block, BlockRange range) { | |
| 216 | 446 | block.assign(input.begin() + static_cast<std::ptrdiff_t>(range.begin), | |
| 217 | 446 | input.begin() + static_cast<std::ptrdiff_t>(range.end)); | |
| 218 | 446 | RadixSortDoubles(block); | |
| 219 | 446 | } | |
| 220 | |||
| 221 |
2/2✓ Branch 0 taken 64 times.
✓ Branch 1 taken 12 times.
|
76 | BlockList MakeSortedBlocks(const InType &input, size_t parallelism) { |
| 222 | 76 | const auto block_count = GetBlockCount(input.size(), parallelism); | |
| 223 | 76 | const auto total_size = input.size(); | |
| 224 | |||
| 225 | 76 | BlockList blocks(block_count); | |
| 226 |
4/4✓ Branch 0 taken 19 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 48 times.
✓ Branch 3 taken 9 times.
|
76 | if (block_count == 1 || input.size() < kMinParallelElements) { |
| 227 |
2/2✓ Branch 0 taken 419 times.
✓ Branch 1 taken 67 times.
|
486 | for (size_t block = 0; block < block_count; ++block) { |
| 228 |
1/2✓ Branch 1 taken 419 times.
✗ Branch 2 not taken.
|
419 | FillAndSortBlock(input, blocks[block], GetBlockRange(block, block_count, total_size)); |
| 229 | } | ||
| 230 | return blocks; | ||
| 231 | } | ||
| 232 | |||
| 233 |
1/2✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
|
36 | tbb::parallel_for(tbb::blocked_range<size_t>(0, block_count), [&](const tbb::blocked_range<size_t> &range) { |
| 234 |
2/2✓ Branch 0 taken 27 times.
✓ Branch 1 taken 27 times.
|
54 | for (size_t block = range.begin(); block != range.end(); ++block) { |
| 235 | 27 | FillAndSortBlock(input, blocks[block], GetBlockRange(block, block_count, total_size)); | |
| 236 | } | ||
| 237 | 27 | }); | |
| 238 | |||
| 239 | 9 | return blocks; | |
| 240 | ✗ | } | |
| 241 | |||
| 242 | 370 | void MergeBlockPair(const BlockList &blocks, BlockList &next, size_t pair_index) { | |
| 243 | 370 | next[pair_index] = MergeBatcherEvenOdd(blocks[pair_index * 2], blocks[(pair_index * 2) + 1]); | |
| 244 | 370 | } | |
| 245 | |||
| 246 | 123 | BlockList MergeBlockPairs(const BlockList &blocks) { | |
| 247 | 123 | const auto pair_count = blocks.size() / 2; | |
| 248 | 123 | BlockList next((blocks.size() + 1) / 2); | |
| 249 |
2/2✓ Branch 0 taken 119 times.
✓ Branch 1 taken 4 times.
|
123 | if (pair_count < kMinParallelPairs) { |
| 250 |
2/2✓ Branch 0 taken 242 times.
✓ Branch 1 taken 119 times.
|
361 | for (size_t pair = 0; pair < pair_count; ++pair) { |
| 251 |
1/2✓ Branch 1 taken 242 times.
✗ Branch 2 not taken.
|
242 | MergeBlockPair(blocks, next, pair); |
| 252 | } | ||
| 253 |
2/2✓ Branch 0 taken 47 times.
✓ Branch 1 taken 72 times.
|
119 | if ((blocks.size() & 1U) != 0U) { |
| 254 |
1/2✓ Branch 1 taken 47 times.
✗ Branch 2 not taken.
|
47 | next.back() = blocks.back(); |
| 255 | } | ||
| 256 | return next; | ||
| 257 | } | ||
| 258 | |||
| 259 |
2/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 4 times.
✗ Branch 4 not taken.
|
4 | tbb::parallel_for(tbb::blocked_range<size_t>(0, pair_count), [&](const tbb::blocked_range<size_t> &range) { |
| 260 |
4/4✓ Branch 0 taken 64 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 64 times.
|
256 | for (size_t pair = range.begin(); pair != range.end(); ++pair) { |
| 261 |
1/2✓ Branch 2 taken 64 times.
✗ Branch 3 not taken.
|
128 | MergeBlockPair(blocks, next, pair); |
| 262 | } | ||
| 263 | }); | ||
| 264 | |||
| 265 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if ((blocks.size() & 1U) != 0U) { |
| 266 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | next.back() = blocks.back(); |
| 267 | } | ||
| 268 | |||
| 269 | return next; | ||
| 270 | ✗ | } | |
| 271 | |||
| 272 | 76 | Block MergeBlocks(BlockList blocks) { | |
| 273 |
2/2✓ Branch 0 taken 123 times.
✓ Branch 1 taken 76 times.
|
199 | while (blocks.size() > 1) { |
| 274 | 123 | blocks = MergeBlockPairs(blocks); | |
| 275 | } | ||
| 276 | |||
| 277 | 76 | return std::move(blocks.front()); | |
| 278 | } | ||
| 279 | |||
| 280 | } // namespace | ||
| 281 | |||
| 282 |
1/2✓ Branch 1 taken 96 times.
✗ Branch 2 not taken.
|
96 | DoubleSortEvenOddBatcherTBB::DoubleSortEvenOddBatcherTBB(const InType &in) { |
| 283 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 284 |
1/2✓ Branch 1 taken 96 times.
✗ Branch 2 not taken.
|
96 | GetInput() = in; |
| 285 | GetOutput() = {}; | ||
| 286 | 96 | } | |
| 287 | |||
| 288 | 92 | bool DoubleSortEvenOddBatcherTBB::ValidationImpl() { | |
| 289 | 92 | return GetOutput().empty(); | |
| 290 | } | ||
| 291 | |||
| 292 | 84 | bool DoubleSortEvenOddBatcherTBB::PreProcessingImpl() { | |
| 293 | 84 | input_data_ = GetInput(); | |
| 294 | result_data_.clear(); | ||
| 295 | 84 | return true; | |
| 296 | } | ||
| 297 | |||
| 298 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 76 times.
|
84 | bool DoubleSortEvenOddBatcherTBB::RunImpl() { |
| 299 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 76 times.
|
84 | if (input_data_.empty()) { |
| 300 | result_data_.clear(); | ||
| 301 | 8 | return true; | |
| 302 | } | ||
| 303 | |||
| 304 | 76 | const auto parallelism = static_cast<size_t>(std::max(1, ppc::util::GetNumThreads())); | |
| 305 | 76 | const tbb::global_control control(tbb::global_control::max_allowed_parallelism, parallelism); | |
| 306 | |||
| 307 |
1/2✓ Branch 1 taken 76 times.
✗ Branch 2 not taken.
|
76 | auto blocks = MakeSortedBlocks(input_data_, parallelism); |
| 308 |
1/2✓ Branch 1 taken 76 times.
✗ Branch 2 not taken.
|
152 | result_data_ = MergeBlocks(std::move(blocks)); |
| 309 | return true; | ||
| 310 | 76 | } | |
| 311 | |||
| 312 | 80 | bool DoubleSortEvenOddBatcherTBB::PostProcessingImpl() { | |
| 313 | 80 | GetOutput() = result_data_; | |
| 314 | 80 | return true; | |
| 315 | } | ||
| 316 | |||
| 317 | } // namespace gusev_d_double_sort_even_odd_batcher_tbb_task_threads | ||
| 318 |