| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "leonova_a_radix_merge_sort/tbb/include/ops_tbb.hpp" | ||
| 2 | |||
| 3 | #include <algorithm> | ||
| 4 | #include <cstddef> | ||
| 5 | #include <cstdint> | ||
| 6 | #include <utility> | ||
| 7 | #include <vector> | ||
| 8 | |||
| 9 | #include "leonova_a_radix_merge_sort/common/include/common.hpp" | ||
| 10 | #include "tbb/parallel_for.h" | ||
| 11 | #include "tbb/task_arena.h" | ||
| 12 | #include "util/include/util.hpp" | ||
| 13 | |||
| 14 | namespace leonova_a_radix_merge_sort { | ||
| 15 | |||
| 16 | 12 | tbb::task_arena &LeonovaARadixMergeSortTBB::GetTbbArena() { | |
| 17 |
4/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 8 times.
✓ Branch 3 taken 4 times.
✗ Branch 4 not taken.
✓ Branch 6 taken 4 times.
✗ Branch 7 not taken.
|
12 | static int num_threads = std::max(1, ppc::util::GetNumThreads()); |
| 18 |
3/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 8 times.
✓ Branch 3 taken 4 times.
✗ Branch 4 not taken.
|
12 | static tbb::task_arena arena(num_threads); |
| 19 | 12 | return arena; | |
| 20 | } | ||
| 21 | |||
| 22 | inline uint64_t LeonovaARadixMergeSortTBB::ToUnsignedValue(int64_t value) { | ||
| 23 | 1048580 | return static_cast<uint64_t>(value) ^ kSignBitMask; | |
| 24 | } | ||
| 25 | |||
| 26 | inline std::pair<size_t, size_t> LeonovaARadixMergeSortTBB::GetChunk(size_t tid, size_t num_threads, size_t size) { | ||
| 27 | 510 | const size_t chunk = (size + num_threads - 1) / num_threads; | |
| 28 | 510 | const size_t begin = tid * chunk; | |
| 29 | 510 | const size_t end = std::min(begin + chunk, size); | |
| 30 | return {begin, end}; | ||
| 31 | } | ||
| 32 | |||
| 33 | ✗ | void LeonovaARadixMergeSortTBB::FillUnsignedKeys(const std::vector<int64_t> &arr, size_t left, size_t size, | |
| 34 | std::vector<uint64_t> &keys, size_t num_threads) { | ||
| 35 | ✗ | tbb::parallel_for(static_cast<size_t>(0), num_threads, [&](size_t tid) { | |
| 36 | 30 | auto [begin, end] = GetChunk(tid, num_threads, size); | |
| 37 |
2/2✓ Branch 0 taken 1048580 times.
✓ Branch 1 taken 30 times.
|
1048610 | for (size_t index = begin; index < end; ++index) { |
| 38 | 1048580 | keys[index] = ToUnsignedValue(arr[left + index]); | |
| 39 | } | ||
| 40 | 30 | }); | |
| 41 | ✗ | } | |
| 42 | |||
| 43 | ✗ | void LeonovaARadixMergeSortTBB::CountBytesParallel(const std::vector<uint64_t> &keys, size_t size, int shift, | |
| 44 | std::vector<CounterRow> &local_counts, size_t num_threads) { | ||
| 45 | ✗ | tbb::parallel_for(static_cast<size_t>(0), num_threads, [&](size_t tid) { | |
| 46 | 240 | auto [begin, end] = GetChunk(tid, num_threads, size); | |
| 47 | |||
| 48 | 240 | auto &row = local_counts[tid]; | |
| 49 | |||
| 50 |
2/2✓ Branch 0 taken 8388640 times.
✓ Branch 1 taken 240 times.
|
8388880 | for (size_t index = begin; index < end; ++index) { |
| 51 | 8388640 | ++row[(keys[index] >> shift) & 0xFFU]; | |
| 52 | } | ||
| 53 | 240 | }); | |
| 54 | ✗ | } | |
| 55 | |||
| 56 |
1/2✓ Branch 0 taken 96 times.
✗ Branch 1 not taken.
|
96 | void LeonovaARadixMergeSortTBB::ReduceCounts(const std::vector<CounterRow> &local_counts, CounterRow &global_counts) { |
| 57 | std::ranges::fill(global_counts, 0); | ||
| 58 | |||
| 59 |
2/2✓ Branch 0 taken 240 times.
✓ Branch 1 taken 96 times.
|
336 | for (const auto &row : local_counts) { |
| 60 |
2/2✓ Branch 0 taken 61440 times.
✓ Branch 1 taken 240 times.
|
61680 | for (size_t index = 0; index < kNumCounters; ++index) { |
| 61 | 61440 | global_counts[index] += row[index]; | |
| 62 | } | ||
| 63 | } | ||
| 64 | 96 | } | |
| 65 | |||
| 66 | 96 | void LeonovaARadixMergeSortTBB::BuildOffsets(const std::vector<CounterRow> &local_counts, | |
| 67 | std::vector<CounterRow> &local_offsets, CounterRow &global_counts, | ||
| 68 | size_t num_threads) { | ||
| 69 | 96 | CounterRow bucket_totals = global_counts; | |
| 70 | |||
| 71 | size_t prefix = 0; | ||
| 72 |
2/2✓ Branch 0 taken 24576 times.
✓ Branch 1 taken 96 times.
|
24672 | for (size_t index = 0; index < kNumCounters; ++index) { |
| 73 | 24576 | size_t count = bucket_totals[index]; | |
| 74 | 24576 | bucket_totals[index] = prefix; | |
| 75 | 24576 | prefix += count; | |
| 76 | } | ||
| 77 | |||
| 78 |
2/2✓ Branch 0 taken 240 times.
✓ Branch 1 taken 96 times.
|
336 | for (size_t tndex = 0; tndex < num_threads; ++tndex) { |
| 79 | auto &offset_row = local_offsets[tndex]; | ||
| 80 | const auto &count_row = local_counts[tndex]; | ||
| 81 | |||
| 82 |
2/2✓ Branch 0 taken 61440 times.
✓ Branch 1 taken 240 times.
|
61680 | for (size_t index = 0; index < kNumCounters; ++index) { |
| 83 | 61440 | offset_row[index] = bucket_totals[index]; | |
| 84 | 61440 | bucket_totals[index] += count_row[index]; | |
| 85 | } | ||
| 86 | } | ||
| 87 | 96 | } | |
| 88 | |||
| 89 | ✗ | void LeonovaARadixMergeSortTBB::ScatterParallel(const std::vector<uint64_t> &keys, const std::vector<int64_t> &arr, | |
| 90 | size_t left, size_t size, int shift, | ||
| 91 | std::vector<CounterRow> &local_offsets, std::vector<int64_t> &temp_arr, | ||
| 92 | std::vector<uint64_t> &temp_keys, size_t num_threads) { | ||
| 93 | ✗ | tbb::parallel_for(static_cast<size_t>(0), num_threads, [&](size_t tid) { | |
| 94 | 240 | auto [begin, end] = GetChunk(tid, num_threads, size); | |
| 95 | |||
| 96 | 240 | auto &offsets = local_offsets[tid]; | |
| 97 | |||
| 98 |
2/2✓ Branch 0 taken 8388640 times.
✓ Branch 1 taken 240 times.
|
8388880 | for (size_t index = begin; index < end; ++index) { |
| 99 | 8388640 | const size_t byte = (keys[index] >> shift) & 0xFFU; | |
| 100 | 8388640 | const size_t pos = offsets[byte]++; | |
| 101 | |||
| 102 | 8388640 | temp_arr[pos] = arr[left + index]; | |
| 103 | 8388640 | temp_keys[pos] = keys[index]; | |
| 104 | } | ||
| 105 | 240 | }); | |
| 106 | ✗ | } | |
| 107 | |||
| 108 |
1/2✓ Branch 1 taken 128 times.
✗ Branch 2 not taken.
|
128 | LeonovaARadixMergeSortTBB::LeonovaARadixMergeSortTBB(const InType &in) { |
| 109 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 110 |
1/2✓ Branch 1 taken 128 times.
✗ Branch 2 not taken.
|
128 | GetInput() = in; |
| 111 |
1/2✓ Branch 1 taken 128 times.
✗ Branch 2 not taken.
|
128 | GetOutput() = std::vector<int64_t>(GetInput().size()); |
| 112 | 128 | } | |
| 113 | |||
| 114 | 256 | bool LeonovaARadixMergeSortTBB::ValidationImpl() { | |
| 115 | 256 | return !GetInput().empty(); | |
| 116 | } | ||
| 117 | |||
| 118 | 128 | bool LeonovaARadixMergeSortTBB::PreProcessingImpl() { | |
| 119 | 128 | return true; | |
| 120 | } | ||
| 121 | |||
| 122 | 128 | bool LeonovaARadixMergeSortTBB::RunImpl() { | |
| 123 |
1/2✓ Branch 1 taken 128 times.
✗ Branch 2 not taken.
|
128 | if (!ValidationImpl()) { |
| 124 | return false; | ||
| 125 | } | ||
| 126 | |||
| 127 | 128 | GetOutput() = GetInput(); | |
| 128 | |||
| 129 |
2/2✓ Branch 0 taken 120 times.
✓ Branch 1 taken 8 times.
|
128 | if (GetOutput().size() > 1) { |
| 130 | 120 | RadixMergeSort(GetOutput(), 0, GetOutput().size()); | |
| 131 | } | ||
| 132 | |||
| 133 | return true; | ||
| 134 | } | ||
| 135 | |||
| 136 | 128 | bool LeonovaARadixMergeSortTBB::PostProcessingImpl() { | |
| 137 | 128 | return true; | |
| 138 | } | ||
| 139 | |||
| 140 | 112 | void LeonovaARadixMergeSortTBB::SequentialRadixSort(std::vector<int64_t> &arr, size_t left, size_t right) { | |
| 141 | 112 | const size_t size = right - left; | |
| 142 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 112 times.
|
112 | if (size <= 1) { |
| 143 | ✗ | return; | |
| 144 | } | ||
| 145 | |||
| 146 | 112 | std::vector<uint64_t> keys(size); | |
| 147 |
1/4✓ Branch 1 taken 112 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
112 | std::vector<int64_t> temp_arr(size); |
| 148 |
1/4✓ Branch 1 taken 112 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
112 | std::vector<uint64_t> temp_keys(size); |
| 149 | |||
| 150 |
2/2✓ Branch 0 taken 3384 times.
✓ Branch 1 taken 112 times.
|
3496 | for (size_t index = 0; index < size; ++index) { |
| 151 | 3384 | keys[index] = ToUnsignedValue(arr[left + index]); | |
| 152 | } | ||
| 153 | |||
| 154 |
2/2✓ Branch 0 taken 896 times.
✓ Branch 1 taken 112 times.
|
1008 | for (int byte_pos = 0; byte_pos < kNumBytes; ++byte_pos) { |
| 155 | 896 | const int shift = byte_pos * kByteSize; | |
| 156 | |||
| 157 |
1/4✓ Branch 1 taken 896 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
896 | std::vector<size_t> counts(kNumCounters, 0); |
| 158 |
1/4✓ Branch 1 taken 896 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
896 | std::vector<size_t> offsets(kNumCounters, 0); |
| 159 | |||
| 160 |
2/2✓ Branch 0 taken 27072 times.
✓ Branch 1 taken 896 times.
|
27968 | for (size_t index = 0; index < size; ++index) { |
| 161 | 27072 | ++counts[(keys[index] >> shift) & 0xFFU]; | |
| 162 | } | ||
| 163 | |||
| 164 | size_t sum = 0; | ||
| 165 |
2/2✓ Branch 0 taken 229376 times.
✓ Branch 1 taken 896 times.
|
230272 | for (size_t index = 0; index < kNumCounters; ++index) { |
| 166 | 229376 | offsets[index] = sum; | |
| 167 | 229376 | sum += counts[index]; | |
| 168 | } | ||
| 169 | |||
| 170 |
2/2✓ Branch 0 taken 27072 times.
✓ Branch 1 taken 896 times.
|
27968 | for (size_t index = 0; index < size; ++index) { |
| 171 | 27072 | size_t byte = (keys[index] >> shift) & 0xFFU; | |
| 172 | 27072 | size_t pos = offsets[byte]++; | |
| 173 | |||
| 174 | 27072 | temp_arr[pos] = arr[left + index]; | |
| 175 | 27072 | temp_keys[pos] = keys[index]; | |
| 176 | } | ||
| 177 | |||
| 178 | std::ranges::copy(temp_arr, arr.begin() + static_cast<std::ptrdiff_t>(left)); | ||
| 179 | keys.swap(temp_keys); | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 183 | 124 | void LeonovaARadixMergeSortTBB::RadixSort(std::vector<int64_t> &arr, size_t left, size_t right) { | |
| 184 | 124 | const size_t size = right - left; | |
| 185 |
1/2✓ Branch 0 taken 124 times.
✗ Branch 1 not taken.
|
124 | if (size <= 1) { |
| 186 | 112 | return; | |
| 187 | } | ||
| 188 | |||
| 189 |
2/2✓ Branch 0 taken 112 times.
✓ Branch 1 taken 12 times.
|
124 | if (size < kMinParallelSize) { |
| 190 | 112 | SequentialRadixSort(arr, left, right); | |
| 191 | 112 | return; | |
| 192 | } | ||
| 193 | |||
| 194 | 12 | auto &arena = GetTbbArena(); | |
| 195 | |||
| 196 |
3/4✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 3 times.
|
12 | const size_t num_threads = std::max<size_t>(1, std::min<size_t>(arena.max_concurrency(), size)); |
| 197 | |||
| 198 | 12 | std::vector<uint64_t> keys(size); | |
| 199 |
1/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
12 | std::vector<uint64_t> temp_keys(size); |
| 200 |
1/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
12 | std::vector<int64_t> temp_arr(size); |
| 201 | |||
| 202 |
2/6✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 12 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
12 | std::vector<CounterRow> local_counts(num_threads, CounterRow(kNumCounters, 0)); |
| 203 |
2/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 12 times.
✗ Branch 5 not taken.
|
12 | std::vector<CounterRow> local_offsets(num_threads, CounterRow(kNumCounters, 0)); |
| 204 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | CounterRow global_counts(kNumCounters); |
| 205 | |||
| 206 |
2/6✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 12 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
|
12 | arena.execute([&] { |
| 207 | 12 | FillUnsignedKeys(arr, left, size, keys, num_threads); | |
| 208 | |||
| 209 |
2/2✓ Branch 0 taken 96 times.
✓ Branch 1 taken 12 times.
|
108 | for (int byte_pos = 0; byte_pos < kNumBytes; ++byte_pos) { |
| 210 | 96 | const int shift = byte_pos * kByteSize; | |
| 211 | |||
| 212 |
2/2✓ Branch 0 taken 240 times.
✓ Branch 1 taken 96 times.
|
336 | for (auto &row : local_counts) { |
| 213 | std::ranges::fill(row, 0); | ||
| 214 | } | ||
| 215 | |||
| 216 | 96 | CountBytesParallel(keys, size, shift, local_counts, num_threads); | |
| 217 | 96 | ReduceCounts(local_counts, global_counts); | |
| 218 | 96 | BuildOffsets(local_counts, local_offsets, global_counts, num_threads); | |
| 219 | |||
| 220 | 96 | ScatterParallel(keys, arr, left, size, shift, local_offsets, temp_arr, temp_keys, num_threads); | |
| 221 | |||
| 222 |
1/2✓ Branch 0 taken 96 times.
✗ Branch 1 not taken.
|
96 | std::ranges::copy(temp_arr, arr.begin() + static_cast<std::ptrdiff_t>(left)); |
| 223 | 96 | keys.swap(temp_keys); | |
| 224 | } | ||
| 225 | 12 | }); | |
| 226 | 12 | } | |
| 227 | |||
| 228 | 4 | void LeonovaARadixMergeSortTBB::SimpleMerge(std::vector<int64_t> &arr, size_t left, size_t mid, size_t right) { | |
| 229 | 4 | std::vector<int64_t> merged(right - left); | |
| 230 | |||
| 231 | size_t in = left; | ||
| 232 | size_t j = mid; | ||
| 233 | size_t k = 0; | ||
| 234 | |||
| 235 |
2/2✓ Branch 0 taken 262148 times.
✓ Branch 1 taken 4 times.
|
262152 | while (in < mid && j < right) { |
| 236 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 262148 times.
|
262148 | merged[k++] = (arr[in] <= arr[j]) ? arr[in++] : arr[j++]; |
| 237 | } | ||
| 238 |
2/2✓ Branch 0 taken 262144 times.
✓ Branch 1 taken 4 times.
|
262148 | while (in < mid) { |
| 239 | 262144 | merged[k++] = arr[in++]; | |
| 240 | } | ||
| 241 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | while (j < right) { |
| 242 | ✗ | merged[k++] = arr[j++]; | |
| 243 | } | ||
| 244 | |||
| 245 | std::ranges::copy(merged, arr.begin() + static_cast<std::ptrdiff_t>(left)); | ||
| 246 | 4 | } | |
| 247 | |||
| 248 | 120 | void LeonovaARadixMergeSortTBB::RadixMergeSort(std::vector<int64_t> &arr, size_t left, size_t right) { | |
| 249 | struct Task { | ||
| 250 | size_t l, r; | ||
| 251 | bool sorted; | ||
| 252 | }; | ||
| 253 | |||
| 254 | 120 | std::vector<Task> stack; | |
| 255 |
1/2✓ Branch 1 taken 120 times.
✗ Branch 2 not taken.
|
120 | stack.push_back({left, right, false}); |
| 256 | |||
| 257 |
2/2✓ Branch 0 taken 132 times.
✓ Branch 1 taken 120 times.
|
252 | while (!stack.empty()) { |
| 258 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 132 times.
|
132 | auto [l, r, sorted] = stack.back(); |
| 259 | stack.pop_back(); | ||
| 260 | |||
| 261 | 132 | size_t size = r - l; | |
| 262 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 132 times.
|
132 | if (size <= 1) { |
| 263 | ✗ | continue; | |
| 264 | } | ||
| 265 | |||
| 266 |
2/2✓ Branch 0 taken 124 times.
✓ Branch 1 taken 8 times.
|
132 | if (size <= kRadixThreshold) { |
| 267 |
1/2✓ Branch 1 taken 124 times.
✗ Branch 2 not taken.
|
124 | RadixSort(arr, l, r); |
| 268 | 124 | continue; | |
| 269 | } | ||
| 270 | |||
| 271 | 8 | size_t mid = l + (size / 2); | |
| 272 | |||
| 273 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (!sorted) { |
| 274 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | stack.push_back({l, r, true}); |
| 275 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | stack.push_back({mid, r, false}); |
| 276 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | stack.push_back({l, mid, false}); |
| 277 | } else { | ||
| 278 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | SimpleMerge(arr, l, mid, r); |
| 279 | } | ||
| 280 | } | ||
| 281 | 120 | } | |
| 282 | |||
| 283 | } // namespace leonova_a_radix_merge_sort | ||
| 284 |