| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "gusev_d_double_sort_even_odd_batcher/stl/include/ops_stl.hpp" | ||
| 2 | |||
| 3 | #include <algorithm> | ||
| 4 | #include <array> | ||
| 5 | #include <bit> | ||
| 6 | #include <cstddef> | ||
| 7 | #include <cstdint> | ||
| 8 | #include <exception> | ||
| 9 | #include <mutex> | ||
| 10 | #include <thread> | ||
| 11 | #include <utility> | ||
| 12 | #include <vector> | ||
| 13 | |||
| 14 | #include "gusev_d_double_sort_even_odd_batcher/stl/include/common.hpp" | ||
| 15 | #include "util/include/util.hpp" | ||
| 16 | |||
| 17 | namespace gusev_d_double_sort_even_odd_batcher_stl_task_threads { | ||
| 18 | namespace { | ||
| 19 | |||
| 20 | constexpr int kRadixPasses = 8; | ||
| 21 | constexpr int kBitsPerByte = 8; | ||
| 22 | constexpr size_t kRadixBuckets = 256; | ||
| 23 | constexpr uint64_t kBucketMask = 0xFFULL; | ||
| 24 | constexpr size_t kMinParallelElements = 128; | ||
| 25 | constexpr size_t kMinThreadedTasks = 2; | ||
| 26 | |||
| 27 | static_assert((kRadixPasses % 2) == 0, "Radix sort expects the final data to remain in the input buffer"); | ||
| 28 | |||
| 29 | using Block = std::vector<ValueType>; | ||
| 30 | using BlockList = std::vector<Block>; | ||
| 31 | |||
| 32 | struct BlockRange { | ||
| 33 | size_t begin = 0; | ||
| 34 | size_t end = 0; | ||
| 35 | }; | ||
| 36 | |||
| 37 | 31784 | struct MergeItem { | |
| 38 | ValueType value = 0.0; | ||
| 39 | bool is_padding = true; | ||
| 40 | }; | ||
| 41 | |||
| 42 | uint64_t DoubleToSortableKey(ValueType value) { | ||
| 43 | const auto bits = std::bit_cast<uint64_t>(value); | ||
| 44 | const auto sign_mask = uint64_t{1} << 63; | ||
| 45 |
4/4✓ Branch 0 taken 89488 times.
✓ Branch 1 taken 67952 times.
✓ Branch 2 taken 89488 times.
✓ Branch 3 taken 67952 times.
|
314880 | return (bits & sign_mask) == 0 ? bits ^ sign_mask : ~bits; |
| 46 | } | ||
| 47 | |||
| 48 | size_t GetBucketIndex(ValueType value, int shift) { | ||
| 49 | 314880 | return static_cast<size_t>((DoubleToSortableKey(value) >> shift) & kBucketMask); | |
| 50 | } | ||
| 51 | |||
| 52 | void BuildPrefixSums(std::array<size_t, kRadixBuckets> &count) { | ||
| 53 | size_t prefix = 0; | ||
| 54 |
2/2✓ Branch 0 taken 712704 times.
✓ Branch 1 taken 2784 times.
|
715488 | for (auto &value : count) { |
| 55 | 712704 | const auto current = value; | |
| 56 | 712704 | value = prefix; | |
| 57 | 712704 | prefix += current; | |
| 58 | } | ||
| 59 | } | ||
| 60 | |||
| 61 |
2/2✓ Branch 0 taken 48 times.
✓ Branch 1 taken 348 times.
|
396 | void RadixSortDoubles(OutType &data) { |
| 62 |
2/2✓ Branch 0 taken 48 times.
✓ Branch 1 taken 348 times.
|
396 | if (data.size() < 2) { |
| 63 | 48 | return; | |
| 64 | } | ||
| 65 | |||
| 66 | 348 | OutType buffer(data.size()); | |
| 67 | auto *source = &data; | ||
| 68 | auto *destination = &buffer; | ||
| 69 | |||
| 70 |
2/2✓ Branch 0 taken 2784 times.
✓ Branch 1 taken 348 times.
|
3132 | for (int byte = 0; byte < kRadixPasses; ++byte) { |
| 71 | 2784 | std::array<size_t, kRadixBuckets> count{}; | |
| 72 | 2784 | const auto shift = byte * kBitsPerByte; | |
| 73 | |||
| 74 |
4/4✓ Branch 0 taken 89488 times.
✓ Branch 1 taken 67952 times.
✓ Branch 2 taken 157440 times.
✓ Branch 3 taken 2784 times.
|
160224 | for (ValueType value : *source) { |
| 75 | 157440 | count.at(GetBucketIndex(value, shift))++; | |
| 76 | } | ||
| 77 | BuildPrefixSums(count); | ||
| 78 | |||
| 79 |
4/4✓ Branch 0 taken 89488 times.
✓ Branch 1 taken 67952 times.
✓ Branch 2 taken 157440 times.
✓ Branch 3 taken 2784 times.
|
160224 | for (ValueType value : *source) { |
| 80 | const auto bucket = GetBucketIndex(value, shift); | ||
| 81 | 157440 | (*destination)[count.at(bucket)++] = value; | |
| 82 | } | ||
| 83 | |||
| 84 | std::swap(source, destination); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | |||
| 88 | size_t NextPowerOfTwo(size_t value) { | ||
| 89 | size_t result = 1; | ||
| 90 |
2/2✓ Branch 0 taken 794 times.
✓ Branch 1 taken 244 times.
|
1038 | while (result < value) { |
| 91 | 794 | result <<= 1U; | |
| 92 | } | ||
| 93 | return result; | ||
| 94 | } | ||
| 95 | |||
| 96 | bool IsGreater(const MergeItem &lhs, const MergeItem &rhs) { | ||
| 97 |
2/2✓ Branch 0 taken 126582 times.
✓ Branch 1 taken 7414 times.
|
133996 | if (lhs.is_padding != rhs.is_padding) { |
| 98 | return lhs.is_padding; | ||
| 99 | } | ||
| 100 |
4/4✓ Branch 0 taken 93860 times.
✓ Branch 1 taken 32722 times.
✓ Branch 2 taken 47106 times.
✓ Branch 3 taken 46754 times.
|
126582 | return !lhs.is_padding && lhs.value > rhs.value; |
| 101 | } | ||
| 102 | |||
| 103 |
2/2✓ Branch 0 taken 126582 times.
✓ Branch 1 taken 7414 times.
|
133996 | void CompareExchange(std::vector<MergeItem> &data, size_t left, size_t right) { |
| 104 |
2/2✓ Branch 0 taken 4640 times.
✓ Branch 1 taken 2774 times.
|
7414 | if (IsGreater(data[left], data[right])) { |
| 105 | std::swap(data[left], data[right]); | ||
| 106 | } | ||
| 107 | 133996 | } | |
| 108 | |||
| 109 | void CompareExchangeBlocks(std::vector<MergeItem> &data, size_t first, size_t distance) { | ||
| 110 |
4/4✓ Branch 0 taken 15892 times.
✓ Branch 1 taken 244 times.
✓ Branch 2 taken 118104 times.
✓ Branch 3 taken 30502 times.
|
164742 | for (size_t i = 0; i < distance; ++i) { |
| 111 | 133996 | CompareExchange(data, first + i, first + distance + i); | |
| 112 | } | ||
| 113 | } | ||
| 114 | |||
| 115 |
1/2✓ Branch 0 taken 244 times.
✗ Branch 1 not taken.
|
244 | void OddEvenMerge(std::vector<MergeItem> &data) { |
| 116 |
1/2✓ Branch 0 taken 244 times.
✗ Branch 1 not taken.
|
244 | if (data.size() < 2) { |
| 117 | return; | ||
| 118 | } | ||
| 119 | |||
| 120 | 244 | auto distance = data.size() / 2; | |
| 121 | CompareExchangeBlocks(data, 0, distance); | ||
| 122 | |||
| 123 |
2/2✓ Branch 0 taken 794 times.
✓ Branch 1 taken 244 times.
|
1038 | for (distance /= 2; distance > 0; distance /= 2) { |
| 124 | 794 | const auto step = distance * 2; | |
| 125 |
2/2✓ Branch 0 taken 30502 times.
✓ Branch 1 taken 794 times.
|
31296 | for (size_t first = distance; (first + distance) < data.size(); first += step) { |
| 126 | CompareExchangeBlocks(data, first, distance); | ||
| 127 | } | ||
| 128 | } | ||
| 129 | } | ||
| 130 | |||
| 131 | void CopyBlockToMergeItems(const Block &block, std::vector<MergeItem> &items, size_t offset) { | ||
| 132 |
4/4✓ Branch 0 taken 12390 times.
✓ Branch 1 taken 244 times.
✓ Branch 2 taken 10886 times.
✓ Branch 3 taken 244 times.
|
23764 | for (size_t i = 0; i < block.size(); ++i) { |
| 133 | 10886 | auto &item = items[offset + i]; | |
| 134 | 23276 | item.value = block[i]; | |
| 135 | 23276 | item.is_padding = false; | |
| 136 | } | ||
| 137 | } | ||
| 138 | |||
| 139 | 244 | Block ExtractMergedValues(const std::vector<MergeItem> &items, size_t result_size) { | |
| 140 | 244 | Block result; | |
| 141 |
1/2✓ Branch 1 taken 244 times.
✗ Branch 2 not taken.
|
244 | result.reserve(result_size); |
| 142 |
2/2✓ Branch 0 taken 31784 times.
✓ Branch 1 taken 244 times.
|
32028 | for (const auto &item : items) { |
| 143 |
2/2✓ Branch 0 taken 23276 times.
✓ Branch 1 taken 8508 times.
|
31784 | if (!item.is_padding) { |
| 144 |
1/2✓ Branch 0 taken 23276 times.
✗ Branch 1 not taken.
|
23276 | result.push_back(item.value); |
| 145 | } | ||
| 146 | } | ||
| 147 | 244 | return result; | |
| 148 | } | ||
| 149 | |||
| 150 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 244 times.
|
244 | Block MergeBatcherEvenOdd(const Block &left, const Block &right) { |
| 151 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 244 times.
|
244 | if (left.empty()) { |
| 152 | ✗ | return right; | |
| 153 | } | ||
| 154 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 244 times.
|
244 | if (right.empty()) { |
| 155 | ✗ | return left; | |
| 156 | } | ||
| 157 | |||
| 158 | const auto half_size = NextPowerOfTwo(std::max(left.size(), right.size())); | ||
| 159 | 244 | std::vector<MergeItem> items(half_size * 2); | |
| 160 | |||
| 161 | CopyBlockToMergeItems(left, items, 0); | ||
| 162 | CopyBlockToMergeItems(right, items, half_size); | ||
| 163 | 244 | OddEvenMerge(items); | |
| 164 |
1/2✓ Branch 1 taken 244 times.
✗ Branch 2 not taken.
|
244 | return ExtractMergedValues(items, left.size() + right.size()); |
| 165 | } | ||
| 166 | |||
| 167 | BlockRange GetBlockRange(size_t block_index, size_t block_count, size_t total_size) { | ||
| 168 | BlockRange range; | ||
| 169 | 396 | range.begin = (block_index * total_size) / block_count; | |
| 170 | 396 | range.end = ((block_index + 1) * total_size) / block_count; | |
| 171 | return range; | ||
| 172 | } | ||
| 173 | |||
| 174 | size_t GetBlockCount(size_t input_size, size_t parallelism) { | ||
| 175 |
2/2✓ Branch 0 taken 114 times.
✓ Branch 1 taken 38 times.
|
152 | return std::max<size_t>(1, std::min(input_size, parallelism)); |
| 176 | } | ||
| 177 | |||
| 178 | 152 | size_t GetSafeParallelism() { | |
| 179 | 152 | const auto requested = static_cast<size_t>(std::max(1, ppc::util::GetNumThreads())); | |
| 180 | 152 | const auto available = std::thread::hardware_concurrency(); | |
| 181 |
1/2✓ Branch 0 taken 152 times.
✗ Branch 1 not taken.
|
152 | if (available == 0) { |
| 182 | return requested; | ||
| 183 | } | ||
| 184 | 152 | return std::min(requested, static_cast<size_t>(available)); | |
| 185 | } | ||
| 186 | |||
| 187 | 396 | void FillAndSortBlock(const InType &input, Block &block, BlockRange range) { | |
| 188 | 396 | block.assign(input.begin() + static_cast<std::ptrdiff_t>(range.begin), | |
| 189 | 396 | input.begin() + static_cast<std::ptrdiff_t>(range.end)); | |
| 190 | 396 | RadixSortDoubles(block); | |
| 191 | 396 | } | |
| 192 | |||
| 193 | template <typename Function> | ||
| 194 | void RunSequentialByIndex(size_t work_count, const Function &function) { | ||
| 195 |
4/4✓ Branch 0 taken 152 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 342 times.
✓ Branch 3 taken 134 times.
|
780 | for (size_t index = 0; index < work_count; ++index) { |
| 196 | 342 | function(index); | |
| 197 | } | ||
| 198 | } | ||
| 199 | |||
| 200 | ✗ | void StoreCurrentException(std::exception_ptr &worker_exception, std::mutex &exception_mutex) noexcept { | |
| 201 | const std::scoped_lock lock(exception_mutex); | ||
| 202 | ✗ | if (worker_exception == nullptr) { | |
| 203 | ✗ | worker_exception = std::current_exception(); | |
| 204 | } | ||
| 205 | ✗ | } | |
| 206 | |||
| 207 | template <typename Function> | ||
| 208 | 292 | void RunThreadChunk(size_t thread_index, size_t work_count, size_t thread_count, const Function &function, | |
| 209 | std::exception_ptr &worker_exception, std::mutex &exception_mutex) noexcept { | ||
| 210 | try { | ||
| 211 |
2/2✓ Branch 0 taken 146 times.
✓ Branch 1 taken 146 times.
|
584 | for (size_t index = thread_index; index < work_count; index += thread_count) { |
| 212 |
1/2✓ Branch 1 taken 54 times.
✗ Branch 2 not taken.
|
108 | function(index); |
| 213 | } | ||
| 214 | ✗ | } catch (...) { | |
| 215 | ✗ | StoreCurrentException(worker_exception, exception_mutex); | |
| 216 | } | ||
| 217 | 292 | } | |
| 218 | |||
| 219 | 64 | void JoinWorkersAndRethrow(std::vector<std::thread> &workers, const std::exception_ptr &worker_exception) { | |
| 220 |
2/2✓ Branch 0 taken 146 times.
✓ Branch 1 taken 64 times.
|
210 | for (auto &worker : workers) { |
| 221 | 146 | worker.join(); | |
| 222 | } | ||
| 223 | |||
| 224 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 64 times.
|
64 | if (worker_exception != nullptr) { |
| 225 | ✗ | std::rethrow_exception(worker_exception); | |
| 226 | } | ||
| 227 | 64 | } | |
| 228 | |||
| 229 | template <typename Function> | ||
| 230 | 700 | void RunThreadedByIndex(size_t work_count, size_t max_threads, const Function &function) { | |
| 231 |
4/4✓ Branch 0 taken 160 times.
✓ Branch 1 taken 190 times.
✓ Branch 2 taken 96 times.
✓ Branch 3 taken 64 times.
|
700 | if (work_count < kMinThreadedTasks || max_threads < kMinThreadedTasks) { |
| 232 | RunSequentialByIndex(work_count, function); | ||
| 233 | 572 | return; | |
| 234 | } | ||
| 235 | |||
| 236 | 128 | const auto thread_count = std::min(work_count, max_threads); | |
| 237 | 128 | std::vector<std::thread> workers; | |
| 238 |
1/2✓ Branch 1 taken 64 times.
✗ Branch 2 not taken.
|
128 | workers.reserve(thread_count); |
| 239 | std::exception_ptr worker_exception; | ||
| 240 | 128 | std::mutex exception_mutex; | |
| 241 | try { | ||
| 242 |
2/2✓ Branch 0 taken 146 times.
✓ Branch 1 taken 64 times.
|
420 | for (size_t thread_index = 0; thread_index < thread_count; ++thread_index) { |
| 243 |
1/2✓ Branch 1 taken 146 times.
✗ Branch 2 not taken.
|
584 | workers.emplace_back([&, thread_index] { |
| 244 | 146 | RunThreadChunk(thread_index, work_count, thread_count, function, worker_exception, exception_mutex); | |
| 245 | }); | ||
| 246 | } | ||
| 247 | ✗ | } catch (...) { | |
| 248 | ✗ | StoreCurrentException(worker_exception, exception_mutex); | |
| 249 | } | ||
| 250 | |||
| 251 |
1/2✓ Branch 1 taken 64 times.
✗ Branch 2 not taken.
|
128 | JoinWorkersAndRethrow(workers, worker_exception); |
| 252 | 128 | } | |
| 253 | |||
| 254 |
2/2✓ Branch 0 taken 136 times.
✓ Branch 1 taken 16 times.
|
152 | BlockList MakeSortedBlocks(const InType &input, size_t parallelism) { |
| 255 | 152 | const auto block_count = GetBlockCount(input.size(), parallelism); | |
| 256 | 152 | const auto total_size = input.size(); | |
| 257 | 152 | BlockList blocks(block_count); | |
| 258 | |||
| 259 | 396 | const auto sort_block = [&](size_t block_index) { | |
| 260 | 396 | FillAndSortBlock(input, blocks[block_index], GetBlockRange(block_index, block_count, total_size)); | |
| 261 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 24 times.
|
152 | }; |
| 262 | |||
| 263 |
2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 24 times.
|
152 | if (input.size() < kMinParallelElements) { |
| 264 |
1/2✓ Branch 1 taken 128 times.
✗ Branch 2 not taken.
|
128 | RunThreadedByIndex(block_count, 1, sort_block); |
| 265 | } else { | ||
| 266 |
1/2✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
|
24 | RunThreadedByIndex(block_count, parallelism, sort_block); |
| 267 | } | ||
| 268 | |||
| 269 | 152 | return blocks; | |
| 270 | ✗ | } | |
| 271 | |||
| 272 | 244 | void MergeBlockPair(const BlockList &blocks, BlockList &next, size_t pair_index) { | |
| 273 | 244 | next[pair_index] = MergeBatcherEvenOdd(blocks[pair_index * 2], blocks[(pair_index * 2) + 1]); | |
| 274 | 244 | } | |
| 275 | |||
| 276 | 198 | BlockList MergeBlockPairs(const BlockList &blocks, size_t parallelism) { | |
| 277 | 198 | const auto pair_count = blocks.size() / 2; | |
| 278 | 198 | BlockList next((blocks.size() + 1) / 2); | |
| 279 | |||
| 280 |
4/6✓ Branch 1 taken 92 times.
✗ Branch 2 not taken.
✓ Branch 5 taken 198 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 38 times.
✓ Branch 8 taken 160 times.
|
442 | RunThreadedByIndex(pair_count, parallelism, [&](size_t pair_index) { MergeBlockPair(blocks, next, pair_index); }); |
| 281 | |||
| 282 |
2/2✓ Branch 0 taken 38 times.
✓ Branch 1 taken 160 times.
|
198 | if ((blocks.size() & 1U) != 0U) { |
| 283 |
1/2✓ Branch 1 taken 38 times.
✗ Branch 2 not taken.
|
38 | next.back() = blocks.back(); |
| 284 | } | ||
| 285 | |||
| 286 | 198 | return next; | |
| 287 | ✗ | } | |
| 288 | |||
| 289 | 152 | Block MergeBlocks(BlockList blocks, size_t parallelism) { | |
| 290 |
2/2✓ Branch 0 taken 198 times.
✓ Branch 1 taken 152 times.
|
350 | while (blocks.size() > 1) { |
| 291 | 198 | blocks = MergeBlockPairs(blocks, parallelism); | |
| 292 | } | ||
| 293 | |||
| 294 | 152 | return std::move(blocks.front()); | |
| 295 | } | ||
| 296 | |||
| 297 | } // namespace | ||
| 298 | |||
| 299 |
1/2✓ Branch 1 taken 192 times.
✗ Branch 2 not taken.
|
192 | DoubleSortEvenOddBatcherSTL::DoubleSortEvenOddBatcherSTL(const InType &in) : input_data_(in) { |
| 300 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 301 |
1/2✓ Branch 1 taken 192 times.
✗ Branch 2 not taken.
|
192 | GetInput() = in; |
| 302 | GetOutput() = {}; | ||
| 303 | 192 | } | |
| 304 | |||
| 305 | 184 | bool DoubleSortEvenOddBatcherSTL::ValidationImpl() { | |
| 306 | 184 | return GetOutput().empty(); | |
| 307 | } | ||
| 308 | |||
| 309 | 168 | bool DoubleSortEvenOddBatcherSTL::PreProcessingImpl() { | |
| 310 | 168 | input_data_ = GetInput(); | |
| 311 | result_data_.clear(); | ||
| 312 | 168 | return true; | |
| 313 | } | ||
| 314 | |||
| 315 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 152 times.
|
168 | bool DoubleSortEvenOddBatcherSTL::RunImpl() { |
| 316 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 152 times.
|
168 | if (input_data_.empty()) { |
| 317 | result_data_.clear(); | ||
| 318 | 16 | return true; | |
| 319 | } | ||
| 320 | |||
| 321 | 152 | const auto parallelism = GetSafeParallelism(); | |
| 322 | 152 | auto blocks = MakeSortedBlocks(input_data_, parallelism); | |
| 323 |
1/2✓ Branch 1 taken 152 times.
✗ Branch 2 not taken.
|
304 | result_data_ = MergeBlocks(std::move(blocks), parallelism); |
| 324 | return true; | ||
| 325 | 152 | } | |
| 326 | |||
| 327 | 160 | bool DoubleSortEvenOddBatcherSTL::PostProcessingImpl() { | |
| 328 | 160 | GetOutput() = result_data_; | |
| 329 | 160 | return true; | |
| 330 | } | ||
| 331 | |||
| 332 | } // namespace gusev_d_double_sort_even_odd_batcher_stl_task_threads | ||
| 333 |