| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "baldin_a_radix_sort/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <array> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <cstdint> | ||
| 9 | #include <iterator> | ||
| 10 | #include <vector> | ||
| 11 | |||
| 12 | #include "baldin_a_radix_sort/common/include/common.hpp" | ||
| 13 | #include "util/include/util.hpp" | ||
| 14 | |||
| 15 | namespace baldin_a_radix_sort { // comment for ci rerun | ||
| 16 | |||
| 17 |
1/2✓ Branch 1 taken 26 times.
✗ Branch 2 not taken.
|
26 | BaldinARadixSortALL::BaldinARadixSortALL(const InType &in) { |
| 18 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 19 |
1/2✓ Branch 1 taken 26 times.
✗ Branch 2 not taken.
|
26 | GetInput() = in; |
| 20 | GetOutput() = {}; | ||
| 21 | 26 | } | |
| 22 | |||
| 23 | 26 | bool BaldinARadixSortALL::ValidationImpl() { | |
| 24 | 26 | return true; | |
| 25 | } | ||
| 26 | |||
| 27 | 26 | bool BaldinARadixSortALL::PreProcessingImpl() { | |
| 28 | 26 | int rank = 0; | |
| 29 | |||
| 30 |
1/2✓ Branch 1 taken 26 times.
✗ Branch 2 not taken.
|
26 | if (ppc::util::IsUnderMpirun()) { |
| 31 | 26 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 32 | } | ||
| 33 | |||
| 34 |
2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 13 times.
|
26 | if (rank == 0) { |
| 35 | 13 | GetOutput() = GetInput(); | |
| 36 | } | ||
| 37 | |||
| 38 | 26 | return true; | |
| 39 | } | ||
| 40 | |||
| 41 | namespace { | ||
| 42 | |||
| 43 | 104 | void CountingSortStep(std::vector<int>::iterator in_begin, std::vector<int>::iterator in_end, | |
| 44 | std::vector<int>::iterator out_begin, size_t byte_index) { | ||
| 45 | 104 | size_t shift = byte_index * 8; | |
| 46 | 104 | std::array<size_t, 256> count = {0}; | |
| 47 | |||
| 48 |
2/2✓ Branch 0 taken 944 times.
✓ Branch 1 taken 104 times.
|
1048 | for (auto it = in_begin; it != in_end; it++) { |
| 49 | 944 | auto raw_val = static_cast<unsigned int>(*it); | |
| 50 | 944 | unsigned int byte_val = (raw_val >> shift) & 0xFF; | |
| 51 | |||
| 52 |
2/2✓ Branch 0 taken 236 times.
✓ Branch 1 taken 708 times.
|
944 | if (byte_index == sizeof(int) - 1) { |
| 53 | 236 | byte_val ^= 128; | |
| 54 | } | ||
| 55 | 944 | count.at(byte_val)++; | |
| 56 | } | ||
| 57 | |||
| 58 | 104 | std::array<size_t, 256> prefix{}; | |
| 59 | prefix[0] = 0; | ||
| 60 |
2/2✓ Branch 0 taken 26520 times.
✓ Branch 1 taken 104 times.
|
26624 | for (int i = 1; i < 256; i++) { |
| 61 | 26520 | prefix.at(i) = prefix.at(i - 1) + count.at(i - 1); | |
| 62 | } | ||
| 63 | |||
| 64 |
2/2✓ Branch 0 taken 944 times.
✓ Branch 1 taken 104 times.
|
1048 | for (auto it = in_begin; it != in_end; it++) { |
| 65 | 944 | auto raw_val = static_cast<unsigned int>(*it); | |
| 66 | 944 | unsigned int byte_val = (raw_val >> shift) & 0xFF; | |
| 67 | |||
| 68 |
2/2✓ Branch 0 taken 236 times.
✓ Branch 1 taken 708 times.
|
944 | if (byte_index == sizeof(int) - 1) { |
| 69 | 236 | byte_val ^= 128; | |
| 70 | } | ||
| 71 | |||
| 72 | 944 | *(out_begin + static_cast<int64_t>(prefix.at(byte_val))) = *it; | |
| 73 | 944 | prefix.at(byte_val)++; | |
| 74 | } | ||
| 75 | 104 | } | |
| 76 | |||
| 77 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 26 times.
|
46 | void RadixSortLocal(std::vector<int>::iterator begin, std::vector<int>::iterator end) { |
| 78 | 46 | size_t n = std::distance(begin, end); | |
| 79 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 26 times.
|
46 | if (n < 2) { |
| 80 | 20 | return; | |
| 81 | } | ||
| 82 | |||
| 83 | 26 | std::vector<int> temp(n); | |
| 84 | |||
| 85 |
2/2✓ Branch 0 taken 104 times.
✓ Branch 1 taken 26 times.
|
130 | for (size_t i = 0; i < sizeof(int); i++) { |
| 86 | size_t shift = i; | ||
| 87 | |||
| 88 |
2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 52 times.
|
104 | if (i % 2 == 0) { |
| 89 | 52 | CountingSortStep(begin, end, temp.begin(), shift); | |
| 90 | } else { | ||
| 91 | 52 | CountingSortStep(temp.begin(), temp.end(), begin, shift); | |
| 92 | } | ||
| 93 | } | ||
| 94 | } | ||
| 95 | |||
| 96 | } // namespace | ||
| 97 | |||
| 98 | 26 | void BaldinARadixSortALL::DistributeData(int rank, int size, int &n) { | |
| 99 | 26 | bool is_mpi = ppc::util::IsUnderMpirun(); | |
| 100 | |||
| 101 |
1/2✓ Branch 0 taken 26 times.
✗ Branch 1 not taken.
|
26 | if (is_mpi) { |
| 102 | 26 | MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 103 | } | ||
| 104 | |||
| 105 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 2 times.
|
26 | if (n == 0) { |
| 106 | return; | ||
| 107 | } | ||
| 108 | |||
| 109 | 24 | counts_.resize(size); | |
| 110 | 24 | displs_.resize(size); | |
| 111 | 24 | int chunk = n / size; | |
| 112 | 24 | int rem = n % size; | |
| 113 | int curr = 0; | ||
| 114 |
2/2✓ Branch 0 taken 48 times.
✓ Branch 1 taken 24 times.
|
72 | for (int i = 0; i < size; i++) { |
| 115 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 18 times.
|
78 | counts_[i] = chunk + (i < rem ? 1 : 0); |
| 116 | 48 | displs_[i] = curr; | |
| 117 | 48 | curr += counts_[i]; | |
| 118 | } | ||
| 119 | |||
| 120 | 24 | local_data_.resize(counts_[rank]); | |
| 121 | |||
| 122 |
1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
|
24 | if (is_mpi) { |
| 123 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
36 | MPI_Scatterv(rank == 0 ? GetOutput().data() : nullptr, counts_.data(), displs_.data(), MPI_INT, local_data_.data(), |
| 124 | counts_[rank], MPI_INT, 0, MPI_COMM_WORLD); | ||
| 125 | } else { | ||
| 126 | ✗ | local_data_ = GetOutput(); | |
| 127 | } | ||
| 128 | } | ||
| 129 | |||
| 130 | 24 | void BaldinARadixSortALL::GatherData(int rank) { | |
| 131 | 24 | bool is_mpi = ppc::util::IsUnderMpirun(); | |
| 132 | |||
| 133 |
1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
|
24 | if (is_mpi) { |
| 134 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
36 | MPI_Gatherv(local_data_.data(), counts_[rank], MPI_INT, rank == 0 ? GetOutput().data() : nullptr, counts_.data(), |
| 135 | displs_.data(), MPI_INT, 0, MPI_COMM_WORLD); | ||
| 136 | } else { | ||
| 137 | ✗ | GetOutput() = local_data_; | |
| 138 | } | ||
| 139 | 24 | } | |
| 140 | |||
| 141 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 23 times.
|
24 | void BaldinARadixSortALL::LocalSort(int num_threads) { |
| 142 | 24 | int local_n = static_cast<int>(local_data_.size()); | |
| 143 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 23 times.
|
24 | if (local_n == 0) { |
| 144 | 1 | return; | |
| 145 | } | ||
| 146 | |||
| 147 | 23 | std::vector<int> local_offsets(num_threads + 1); | |
| 148 | 23 | int l_chunk = local_n / num_threads; | |
| 149 | 23 | int l_rem = local_n % num_threads; | |
| 150 | int l_curr = 0; | ||
| 151 |
2/2✓ Branch 0 taken 46 times.
✓ Branch 1 taken 23 times.
|
69 | for (int i = 0; i < num_threads; i++) { |
| 152 |
2/2✓ Branch 0 taken 35 times.
✓ Branch 1 taken 11 times.
|
46 | local_offsets[i] = l_curr; |
| 153 |
2/2✓ Branch 0 taken 35 times.
✓ Branch 1 taken 11 times.
|
81 | l_curr += l_chunk + (i < l_rem ? 1 : 0); |
| 154 | } | ||
| 155 | 23 | local_offsets[num_threads] = local_n; | |
| 156 | |||
| 157 | 23 | auto &local_data_ref = local_data_; | |
| 158 | |||
| 159 | 23 | #pragma omp parallel num_threads(num_threads) default(none) shared(num_threads, local_offsets, local_data_ref) | |
| 160 | { | ||
| 161 | int tid = omp_get_thread_num(); | ||
| 162 | auto begin = local_data_ref.begin() + local_offsets[tid]; | ||
| 163 | auto end = local_data_ref.begin() + local_offsets[tid + 1]; | ||
| 164 | RadixSortLocal(begin, end); | ||
| 165 | } | ||
| 166 | |||
| 167 |
2/2✓ Branch 0 taken 23 times.
✓ Branch 1 taken 23 times.
|
46 | for (int step = 1; step < num_threads; step *= 2) { |
| 168 | 23 | #pragma omp parallel for num_threads(num_threads) default(none) shared(step, num_threads, local_offsets, local_data_ref) | |
| 169 | for (int i = 0; i < num_threads; i += 2 * step) { | ||
| 170 | if (i + step < num_threads) { | ||
| 171 | auto begin = local_data_ref.begin() + local_offsets[i]; | ||
| 172 | auto middle = local_data_ref.begin() + local_offsets[i + step]; | ||
| 173 | int end_idx = std::min(i + (2 * step), num_threads); | ||
| 174 | auto end = local_data_ref.begin() + local_offsets[end_idx]; | ||
| 175 | |||
| 176 | std::inplace_merge(begin, middle, end); | ||
| 177 | } | ||
| 178 | } | ||
| 179 | } | ||
| 180 | } | ||
| 181 | |||
| 182 | 12 | void BaldinARadixSortALL::GlobalMerge(int num_threads, int size, int n) { | |
| 183 | auto &out = GetOutput(); | ||
| 184 | |||
| 185 | 12 | std::vector<int> global_offsets(size + 1); | |
| 186 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 12 times.
|
36 | for (int i = 0; i < size; i++) { |
| 187 | 24 | global_offsets[i] = displs_[i]; | |
| 188 | } | ||
| 189 | 12 | global_offsets[size] = n; | |
| 190 | |||
| 191 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | for (int step = 1; step < size; step *= 2) { |
| 192 | 12 | #pragma omp parallel for num_threads(num_threads) default(none) shared(step, size, global_offsets, out) | |
| 193 | for (int i = 0; i < size; i += 2 * step) { | ||
| 194 | if (i + step < size) { | ||
| 195 | auto begin = out.begin() + global_offsets[i]; | ||
| 196 | auto middle = out.begin() + global_offsets[i + step]; | ||
| 197 | int end_idx = std::min(i + (2 * step), size); | ||
| 198 | auto end = out.begin() + global_offsets[end_idx]; | ||
| 199 | |||
| 200 | std::inplace_merge(begin, middle, end); | ||
| 201 | } | ||
| 202 | } | ||
| 203 | } | ||
| 204 | 12 | } | |
| 205 | |||
| 206 | 26 | bool BaldinARadixSortALL::RunImpl() { | |
| 207 | 26 | int rank = 0; | |
| 208 | 26 | int size = 1; | |
| 209 | 26 | bool is_mpi = ppc::util::IsUnderMpirun(); | |
| 210 | |||
| 211 |
1/2✓ Branch 0 taken 26 times.
✗ Branch 1 not taken.
|
26 | if (is_mpi) { |
| 212 | 26 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 213 | 26 | MPI_Comm_size(MPI_COMM_WORLD, &size); | |
| 214 | } | ||
| 215 | |||
| 216 | 26 | int n = 0; | |
| 217 |
2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 13 times.
|
26 | if (rank == 0) { |
| 218 | 13 | n = static_cast<int>(GetOutput().size()); | |
| 219 | } | ||
| 220 | |||
| 221 | 26 | DistributeData(rank, size, n); | |
| 222 | |||
| 223 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 2 times.
|
26 | if (n == 0) { |
| 224 | return true; | ||
| 225 | } | ||
| 226 | |||
| 227 | 24 | int num_threads = ppc::util::GetNumThreads(); | |
| 228 | |||
| 229 | 24 | LocalSort(num_threads); | |
| 230 | |||
| 231 | 24 | GatherData(rank); | |
| 232 | |||
| 233 |
3/4✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
✓ Branch 2 taken 12 times.
✗ Branch 3 not taken.
|
24 | if (rank == 0 && size > 1) { |
| 234 | 12 | GlobalMerge(num_threads, size, n); | |
| 235 | } | ||
| 236 | |||
| 237 | return true; | ||
| 238 | } | ||
| 239 | |||
| 240 | 26 | bool BaldinARadixSortALL::PostProcessingImpl() { | |
| 241 | 26 | return true; | |
| 242 | } | ||
| 243 | |||
| 244 | } // namespace baldin_a_radix_sort | ||
| 245 |