| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "mityaeva_radix/stl/include/sorter_stl.hpp" | ||
| 2 | |||
| 3 | #include <algorithm> | ||
| 4 | #include <cstddef> | ||
| 5 | #include <cstdint> | ||
| 6 | #include <cstring> | ||
| 7 | #include <thread> | ||
| 8 | #include <vector> | ||
| 9 | |||
| 10 | #include "util/include/util.hpp" | ||
| 11 | |||
| 12 | namespace mityaeva_radix { | ||
| 13 | |||
| 14 | namespace { | ||
| 15 | template <typename Index, typename Functor> | ||
| 16 | 3168 | void ParallelFor(Index start, Index finish, size_t num_threads, Functor f) { | |
| 17 |
3/4✓ Branch 0 taken 1144 times.
✓ Branch 1 taken 440 times.
✓ Branch 2 taken 1188 times.
✗ Branch 3 not taken.
|
3168 | if (num_threads < 2 || (finish - start) < 150) { |
| 18 | 2992 | f(start, finish, 0); | |
| 19 | 3168 | return; | |
| 20 | } | ||
| 21 | ✗ | auto portion = (finish - start) / num_threads; | |
| 22 | ✗ | auto remainder = (finish - start) % num_threads; | |
| 23 | |||
| 24 | ✗ | std::vector<std::thread> threads; | |
| 25 | ✗ | threads.reserve(num_threads); | |
| 26 | ✗ | for (auto thread_idx = 0UZ; thread_idx < num_threads; thread_idx++) { | |
| 27 | ✗ | auto begin = start + (portion * thread_idx) + std::min(thread_idx, remainder); | |
| 28 | ✗ | auto end = start + ((thread_idx + 1) * portion) + std::min(thread_idx + 1, remainder); | |
| 29 | ✗ | threads.emplace_back(f, begin, end, thread_idx); | |
| 30 | } | ||
| 31 | |||
| 32 | ✗ | for (auto &thread : threads) { | |
| 33 | ✗ | thread.join(); | |
| 34 | } | ||
| 35 | ✗ | } | |
| 36 | } // namespace | ||
| 37 | |||
| 38 | ✗ | uint64_t SorterStl::DoubleToSortable(uint64_t x) { | |
| 39 |
2/6✗ Branch 0 not taken.
✗ Branch 1 not taken.
✓ Branch 2 taken 2176 times.
✓ Branch 3 taken 2280 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
4456 | if ((x & 0x8000000000000000ULL) != 0U) { |
| 40 | 2176 | return ~x; | |
| 41 | } | ||
| 42 | 2280 | return x | 0x8000000000000000ULL; | |
| 43 | } | ||
| 44 | |||
| 45 | ✗ | uint64_t SorterStl::SortableToDouble(uint64_t x) { | |
| 46 | ✗ | if ((x & 0x8000000000000000ULL) != 0U) { | |
| 47 | 2280 | return x & 0x7FFFFFFFFFFFFFFFULL; | |
| 48 | } | ||
| 49 | 2176 | return ~x; | |
| 50 | } | ||
| 51 | |||
| 52 | 704 | void SorterStl::CountingPass(std::vector<uint64_t> *current, std::vector<uint64_t> *next, int shift, int radix, | |
| 53 | int num_threads, size_t data_size) { | ||
| 54 |
1/2✓ Branch 2 taken 704 times.
✗ Branch 3 not taken.
|
704 | std::vector<std::vector<int>> thread_counters(num_threads, std::vector<int>(radix, 0)); |
| 55 | |||
| 56 |
1/2✓ Branch 1 taken 704 times.
✗ Branch 2 not taken.
|
704 | ParallelFor(0UZ, data_size, num_threads, [&](size_t start, size_t end, size_t thread_idx) { |
| 57 | 704 | auto &local_counters = thread_counters[thread_idx]; | |
| 58 |
2/2✓ Branch 0 taken 35648 times.
✓ Branch 1 taken 704 times.
|
36352 | for (size_t i = start; i < end; i++) { |
| 59 | 35648 | int digit = static_cast<int>(((*current)[i] >> static_cast<size_t>(shift)) & static_cast<size_t>(radix - 1)); | |
| 60 | 35648 | local_counters[digit]++; | |
| 61 | } | ||
| 62 | 704 | }); | |
| 63 | |||
| 64 |
1/2✓ Branch 1 taken 704 times.
✗ Branch 2 not taken.
|
704 | std::vector<int> prefix_sums(static_cast<size_t>(radix * num_threads), 0); |
| 65 | |||
| 66 | int total = 0; | ||
| 67 |
2/2✓ Branch 0 taken 180224 times.
✓ Branch 1 taken 704 times.
|
180928 | for (int digit = 0; digit < radix; digit++) { |
| 68 | int digit_sum = 0; | ||
| 69 |
2/2✓ Branch 0 taken 450560 times.
✓ Branch 1 taken 180224 times.
|
630784 | for (int thread_idx = 0; thread_idx < num_threads; thread_idx++) { |
| 70 | 450560 | prefix_sums[(thread_idx * radix) + digit] = total + digit_sum; | |
| 71 | 450560 | digit_sum += thread_counters[thread_idx][digit]; | |
| 72 | } | ||
| 73 | 180224 | total += digit_sum; | |
| 74 | } | ||
| 75 | |||
| 76 |
1/2✓ Branch 1 taken 704 times.
✗ Branch 2 not taken.
|
704 | ParallelFor(0UZ, data_size, num_threads, [&](size_t start, size_t end, size_t thread_idx) { |
| 77 | 704 | std::vector<int> local_pos(radix, 0); | |
| 78 |
2/2✓ Branch 0 taken 180224 times.
✓ Branch 1 taken 704 times.
|
180928 | for (int digit = 0; digit < radix; digit++) { |
| 79 | 180224 | local_pos[digit] = prefix_sums[(thread_idx * radix) + digit]; | |
| 80 | } | ||
| 81 | |||
| 82 |
2/2✓ Branch 0 taken 35648 times.
✓ Branch 1 taken 704 times.
|
36352 | for (size_t i = start; i < end; i++) { |
| 83 | 35648 | int digit = static_cast<int>(((*current)[i] >> static_cast<size_t>(shift)) & static_cast<size_t>(radix - 1)); | |
| 84 | 35648 | auto pos = static_cast<size_t>(local_pos[digit]++); | |
| 85 | 35648 | (*next)[pos] = (*current)[i]; | |
| 86 | } | ||
| 87 | 704 | }); | |
| 88 | 704 | } | |
| 89 | |||
| 90 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 88 times.
|
96 | void SorterStl::Sort(std::vector<double> &data) { |
| 91 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 88 times.
|
96 | if (data.size() <= 1) { |
| 92 | 8 | return; | |
| 93 | } | ||
| 94 | |||
| 95 | 88 | int num_threads = ppc::util::GetNumThreads(); | |
| 96 |
1/2✓ Branch 2 taken 88 times.
✗ Branch 3 not taken.
|
88 | std::vector<double> temp(data.size()); |
| 97 |
1/4✓ Branch 1 taken 88 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
88 | std::vector<uint64_t> as_uint(data.size()); |
| 98 |
1/2✓ Branch 1 taken 88 times.
✗ Branch 2 not taken.
|
88 | ParallelFor(0UZ, data.size(), num_threads, [&](size_t start, size_t end, size_t) { |
| 99 |
2/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✓ Branch 2 taken 4456 times.
✓ Branch 3 taken 88 times.
|
4544 | for (auto i = start; i < end; i++) { |
| 100 | uint64_t bits = 0; | ||
| 101 |
2/4✗ Branch 0 not taken.
✗ Branch 1 not taken.
✓ Branch 2 taken 2176 times.
✓ Branch 3 taken 2280 times.
|
4456 | std::memcpy(&bits, &data[i], sizeof(double)); |
| 102 | 4456 | as_uint[i] = DoubleToSortable(bits); | |
| 103 | } | ||
| 104 | }); | ||
| 105 | const int bits_per_pass = 8; | ||
| 106 | const int radix = 1 << bits_per_pass; | ||
| 107 | const int passes = static_cast<int>(sizeof(uint64_t) * 8 / bits_per_pass); | ||
| 108 | |||
| 109 |
1/4✓ Branch 1 taken 88 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
88 | std::vector<uint64_t> uint_temp(data.size()); |
| 110 | 88 | std::vector<uint64_t> *current = &as_uint; | |
| 111 | std::vector<uint64_t> *next = &uint_temp; | ||
| 112 | |||
| 113 |
2/2✓ Branch 0 taken 704 times.
✓ Branch 1 taken 88 times.
|
792 | for (int pass = 0; pass < passes; pass++) { |
| 114 |
1/2✓ Branch 1 taken 704 times.
✗ Branch 2 not taken.
|
704 | int shift = pass * bits_per_pass; |
| 115 |
1/2✓ Branch 1 taken 704 times.
✗ Branch 2 not taken.
|
704 | CountingPass(current, next, shift, radix, num_threads, data.size()); |
| 116 | std::swap(current, next); | ||
| 117 | } | ||
| 118 | |||
| 119 |
1/2✓ Branch 1 taken 88 times.
✗ Branch 2 not taken.
|
88 | ParallelFor(0UZ, data.size(), num_threads, [&](size_t start, size_t end, size_t) { |
| 120 |
2/2✓ Branch 0 taken 4456 times.
✓ Branch 1 taken 88 times.
|
4544 | for (auto i = start; i < end; i++) { |
| 121 | uint64_t bits = 0; | ||
| 122 |
1/2✓ Branch 0 taken 4456 times.
✗ Branch 1 not taken.
|
4456 | if (current == &as_uint) { |
| 123 |
2/2✓ Branch 0 taken 2280 times.
✓ Branch 1 taken 2176 times.
|
8912 | bits = SortableToDouble(as_uint[i]); |
| 124 | } else { | ||
| 125 | ✗ | bits = SortableToDouble(uint_temp[i]); | |
| 126 | } | ||
| 127 | 4456 | std::memcpy(&data[i], &bits, sizeof(double)); | |
| 128 | } | ||
| 129 | 88 | }); | |
| 130 | } | ||
| 131 | |||
| 132 | } // namespace mityaeva_radix | ||
| 133 |