| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "boltenkov_s_gaussian_kernel/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | #include <omp.h> | ||
| 5 | |||
| 6 | #include <algorithm> | ||
| 7 | #include <array> | ||
| 8 | #include <climits> | ||
| 9 | #include <cstddef> | ||
| 10 | #include <vector> | ||
| 11 | |||
| 12 | #include "boltenkov_s_gaussian_kernel/common/include/common.hpp" | ||
| 13 | #include "util/include/util.hpp" | ||
| 14 | |||
| 15 | namespace boltenkov_s_gaussian_kernel { | ||
| 16 | |||
| 17 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | BoltenkovSGaussianKernelALL::BoltenkovSGaussianKernelALL(const InType &in) |
| 18 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | : kernel_{{{1, 2, 1}, {2, 4, 2}, {1, 2, 1}}} { |
| 19 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 20 | 4 | int rank = 0; | |
| 21 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); |
| 22 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | if (rank == 0) { |
| 23 | GetInput() = in; | ||
| 24 | } else { | ||
| 25 | 2 | GetInput() = InType(); | |
| 26 | } | ||
| 27 | 4 | GetOutput() = std::vector<std::vector<int>>(); | |
| 28 | 4 | } | |
| 29 | |||
| 30 | 4 | bool BoltenkovSGaussianKernelALL::ValidationImpl() { | |
| 31 | 4 | int rank = 0; | |
| 32 | 4 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 33 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | if (rank == 0) { |
| 34 | 2 | std::size_t n = std::get<0>(GetInput()); | |
| 35 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | std::size_t m = std::get<1>(GetInput()); |
| 36 | const auto &data = std::get<2>(GetInput()); | ||
| 37 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | if (data.size() != n) { |
| 38 | return false; | ||
| 39 | } | ||
| 40 |
2/2✓ Branch 0 taken 55 times.
✓ Branch 1 taken 2 times.
|
57 | for (std::size_t i = 0; i < n; ++i) { |
| 41 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 55 times.
|
55 | if (data[i].size() != m) { |
| 42 | return false; | ||
| 43 | } | ||
| 44 | } | ||
| 45 | return true; | ||
| 46 | } | ||
| 47 | return true; | ||
| 48 | } | ||
| 49 | |||
| 50 | 4 | bool BoltenkovSGaussianKernelALL::PreProcessingImpl() { | |
| 51 | 4 | int rank = 0; | |
| 52 | 4 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 53 | 4 | auto n_size_t = std::get<0>(GetInput()); | |
| 54 | 4 | auto m_size_t = std::get<1>(GetInput()); | |
| 55 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | if (n_size_t > INT_MAX || m_size_t > INT_MAX) { |
| 56 | return false; | ||
| 57 | } | ||
| 58 | 4 | int n_val = 0; | |
| 59 | 4 | int m_val = 0; | |
| 60 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | if (rank == 0) { |
| 61 | 2 | n_val = static_cast<int>(n_size_t); | |
| 62 | 2 | m_val = static_cast<int>(m_size_t); | |
| 63 | } | ||
| 64 | 4 | MPI_Bcast(&n_val, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 65 | 4 | MPI_Bcast(&m_val, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 66 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | if (n_val < 1e6 && n_val > 0) { |
| 67 | 4 | GetOutput().resize(static_cast<std::size_t>(n_val)); | |
| 68 | } else { | ||
| 69 | return false; | ||
| 70 | } | ||
| 71 |
2/2✓ Branch 0 taken 110 times.
✓ Branch 1 taken 4 times.
|
114 | for (int i = 0; i < n_val; ++i) { |
| 72 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 110 times.
|
110 | if (m_val < 1e6 && m_val > 0) { |
| 73 | 110 | GetOutput()[i].resize(static_cast<std::size_t>(m_val)); | |
| 74 | } else { | ||
| 75 | return false; | ||
| 76 | } | ||
| 77 | } | ||
| 78 | return true; | ||
| 79 | } | ||
| 80 | |||
| 81 | ✗ | bool BoltenkovSGaussianKernelALL::IsValidSize(int n, int m) { | |
| 82 | ✗ | return n > 0 && m > 0 && n < 1e6 && m < 1e6; | |
| 83 | } | ||
| 84 | |||
| 85 | 2 | void BoltenkovSGaussianKernelALL::ComputeScatterParams(int n, int m, int size, int rows_per_proc, | |
| 86 | std::vector<int> &send_counts, std::vector<int> &displs) { | ||
| 87 | 2 | send_counts.assign(size, 0); | |
| 88 | 2 | displs.assign(size, 0); | |
| 89 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2 times.
|
6 | for (int i = 0; i < size; ++i) { |
| 90 | 4 | int s = i * rows_per_proc; | |
| 91 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | int e = std::min(s + rows_per_proc, n) - 1; |
| 92 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | int rows = (s < n) ? (e - s + 1) : 0; |
| 93 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (rows > 0) { |
| 94 | 4 | int h_first = std::max(0, s - 1); | |
| 95 | 4 | int h_last = std::min(n - 1, e + 1); | |
| 96 | 4 | int h_rows = h_last - h_first + 1; | |
| 97 | 4 | send_counts[i] = h_rows * m; | |
| 98 | 4 | displs[i] = h_first * m; | |
| 99 | } | ||
| 100 | } | ||
| 101 | 2 | } | |
| 102 | |||
| 103 | 2 | void BoltenkovSGaussianKernelALL::ComputeGatherDispls(int m, const std::vector<int> &gather_counts, | |
| 104 | std::vector<int> &recv_counts, std::vector<int> &recv_displs) { | ||
| 105 | 2 | int size = static_cast<int>(gather_counts.size()); | |
| 106 | 2 | recv_counts.resize(size); | |
| 107 | 2 | recv_displs.resize(size); | |
| 108 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 2 times.
|
6 | for (int i = 0; i < size; ++i) { |
| 109 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | recv_counts[i] = gather_counts[i] * m; |
| 110 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | recv_displs[i] = (i == 0) ? 0 : recv_displs[i - 1] + recv_counts[i - 1]; |
| 111 | } | ||
| 112 | 2 | } | |
| 113 | |||
| 114 | 4 | std::vector<int> BoltenkovSGaussianKernelALL::ApplyGaussianFilterFlat(const std::vector<int> &local_halo_flat, | |
| 115 | int halo_rows, int local_start_row, | ||
| 116 | int local_rows, int m, | ||
| 117 | const std::array<std::array<int, 3>, 3> &kernel, | ||
| 118 | int shift) { | ||
| 119 | 4 | const int tmp_rows = local_rows + 2; | |
| 120 | 4 | const int tmp_cols = m + 2; | |
| 121 | 4 | std::vector<int> tmp(static_cast<size_t>(tmp_rows) * static_cast<size_t>(tmp_cols), 0); | |
| 122 | |||
| 123 | 4 | const int halo_first = std::max(0, local_start_row - 1); | |
| 124 | |||
| 125 |
2/2✓ Branch 0 taken 63 times.
✓ Branch 1 taken 4 times.
|
67 | for (int i = 0; i < tmp_rows; ++i) { |
| 126 | 63 | int global_row = local_start_row - 1 + i; | |
| 127 |
4/4✓ Branch 0 taken 61 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 59 times.
✓ Branch 3 taken 2 times.
|
63 | if (global_row >= halo_first && global_row < halo_first + halo_rows) { |
| 128 | 59 | const int src_offset = (global_row - halo_first) * m; | |
| 129 |
1/2✓ Branch 0 taken 59 times.
✗ Branch 1 not taken.
|
59 | int *dst_row = &tmp[(static_cast<size_t>(i) * static_cast<size_t>(tmp_cols)) + 1]; |
| 130 |
1/2✓ Branch 0 taken 59 times.
✗ Branch 1 not taken.
|
59 | std::copy_n(&local_halo_flat[src_offset], m, dst_row); |
| 131 | } | ||
| 132 | } | ||
| 133 | |||
| 134 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<int> local_res(static_cast<size_t>(local_rows) * static_cast<size_t>(m), 0); |
| 135 | |||
| 136 |
2/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 4 times.
✗ Branch 4 not taken.
|
4 | #pragma omp parallel for num_threads(ppc::util::GetNumThreads()) default(none) \ |
| 137 | shared(tmp, local_res, local_rows, m, kernel, shift, tmp_cols) | ||
| 138 | for (int i = 0; i < local_rows; ++i) { | ||
| 139 | const int *row0 = &tmp[static_cast<size_t>(i) * static_cast<size_t>(tmp_cols)]; | ||
| 140 | const int *row1 = row0 + tmp_cols; | ||
| 141 | const int *row2 = row1 + tmp_cols; | ||
| 142 | int *out_row = &local_res[static_cast<size_t>(i) * static_cast<size_t>(m)]; | ||
| 143 | |||
| 144 | const int k00 = kernel[0][0]; | ||
| 145 | const int k01 = kernel[0][1]; | ||
| 146 | const int k02 = kernel[0][2]; | ||
| 147 | const int k10 = kernel[1][0]; | ||
| 148 | const int k11 = kernel[1][1]; | ||
| 149 | const int k12 = kernel[1][2]; | ||
| 150 | const int k20 = kernel[2][0]; | ||
| 151 | const int k21 = kernel[2][1]; | ||
| 152 | const int k22 = kernel[2][2]; | ||
| 153 | |||
| 154 | for (int j = 0; j < m; ++j) { | ||
| 155 | int val = (row0[j] * k00) + (row0[j + 1] * k01) + (row0[j + 2] * k02) + (row1[j] * k10) + (row1[j + 1] * k11) + | ||
| 156 | (row1[j + 2] * k12) + (row2[j] * k20) + (row2[j + 1] * k21) + (row2[j + 2] * k22); | ||
| 157 | out_row[j] = val >> shift; | ||
| 158 | } | ||
| 159 | } | ||
| 160 | |||
| 161 | 4 | return local_res; | |
| 162 | } | ||
| 163 | |||
| 164 | 4 | bool BoltenkovSGaussianKernelALL::RunImpl() { | |
| 165 | 4 | int rank = 0; | |
| 166 | 4 | int size = 0; | |
| 167 | 4 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 168 | 4 | MPI_Comm_size(MPI_COMM_WORLD, &size); | |
| 169 | |||
| 170 | 4 | int n = static_cast<int>(GetOutput().size()); | |
| 171 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | int m = static_cast<int>(GetOutput()[0].size()); |
| 172 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (!IsValidSize(n, m)) { |
| 173 | return false; | ||
| 174 | } | ||
| 175 | |||
| 176 | 4 | MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 177 | 4 | MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 178 | |||
| 179 | 4 | std::vector<int> data_flat(static_cast<size_t>(n) * static_cast<size_t>(m)); | |
| 180 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | if (rank == 0) { |
| 181 | const auto &global_data = std::get<2>(GetInput()); | ||
| 182 |
2/2✓ Branch 0 taken 55 times.
✓ Branch 1 taken 2 times.
|
57 | for (int i = 0; i < n; ++i) { |
| 183 |
1/2✓ Branch 0 taken 55 times.
✗ Branch 1 not taken.
|
55 | std::copy_n(global_data[i].data(), m, &data_flat[static_cast<size_t>(i) * static_cast<size_t>(m)]); |
| 184 | } | ||
| 185 | } | ||
| 186 | |||
| 187 | 4 | int rows_per_proc = (n + size - 1) / size; | |
| 188 | 4 | int local_start = rank * rows_per_proc; | |
| 189 | 4 | int local_rows = 0; | |
| 190 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (local_start < n) { |
| 191 | 4 | local_rows = std::min(rows_per_proc, n - local_start); | |
| 192 | } | ||
| 193 | |||
| 194 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | int halo_first = std::max(0, local_start - 1); |
| 195 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | int halo_last = std::min(n - 1, local_start + local_rows); |
| 196 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | int halo_rows = (local_rows > 0) ? (halo_last - halo_first + 1) : 0; |
| 197 | |||
| 198 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<int> send_counts(size, 0); |
| 199 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<int> displs(size, 0); |
| 200 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | if (rank == 0) { |
| 201 |
1/2✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
|
2 | ComputeScatterParams(n, m, size, rows_per_proc, send_counts, displs); |
| 202 | } | ||
| 203 | |||
| 204 |
2/6✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 4 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
4 | std::vector<int> local_halo_flat(static_cast<size_t>(halo_rows) * static_cast<size_t>(m)); |
| 205 | |||
| 206 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | MPI_Scatterv(data_flat.data(), send_counts.data(), displs.data(), MPI_INT, local_halo_flat.data(), |
| 207 | static_cast<int>(local_halo_flat.size()), MPI_INT, 0, MPI_COMM_WORLD); | ||
| 208 | |||
| 209 | 4 | std::vector<int> local_res_flat; | |
| 210 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (local_rows > 0) { |
| 211 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
8 | local_res_flat = ApplyGaussianFilterFlat(local_halo_flat, halo_rows, local_start, local_rows, m, kernel_, shift_); |
| 212 | } | ||
| 213 | |||
| 214 |
2/6✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 4 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
4 | std::vector<int> gather_counts(size, 0); |
| 215 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | MPI_Gather(&local_rows, 1, MPI_INT, gather_counts.data(), 1, MPI_INT, 0, MPI_COMM_WORLD); |
| 216 | |||
| 217 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<int> recv_counts(size, 0); |
| 218 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<int> recv_displs(size, 0); |
| 219 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
|
4 | if (rank == 0) { |
| 220 |
1/2✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
|
2 | ComputeGatherDispls(m, gather_counts, recv_counts, recv_displs); |
| 221 | } | ||
| 222 | |||
| 223 |
2/6✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 4 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
4 | std::vector<int> out_flat(static_cast<size_t>(n) * static_cast<size_t>(m)); |
| 224 | |||
| 225 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | MPI_Gatherv(local_res_flat.data(), static_cast<int>(local_res_flat.size()), MPI_INT, out_flat.data(), |
| 226 | recv_counts.data(), recv_displs.data(), MPI_INT, 0, MPI_COMM_WORLD); | ||
| 227 | |||
| 228 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | MPI_Bcast(out_flat.data(), static_cast<int>(out_flat.size()), MPI_INT, 0, MPI_COMM_WORLD); |
| 229 | |||
| 230 | auto &output = GetOutput(); | ||
| 231 |
2/2✓ Branch 0 taken 110 times.
✓ Branch 1 taken 4 times.
|
114 | for (int i = 0; i < n; ++i) { |
| 232 |
1/2✓ Branch 0 taken 110 times.
✗ Branch 1 not taken.
|
110 | std::copy_n(&out_flat[static_cast<size_t>(i) * static_cast<size_t>(m)], m, output[i].data()); |
| 233 | } | ||
| 234 | |||
| 235 | return true; | ||
| 236 | } | ||
| 237 | |||
| 238 | 4 | bool BoltenkovSGaussianKernelALL::PostProcessingImpl() { | |
| 239 | 4 | return true; | |
| 240 | } | ||
| 241 | |||
| 242 | } // namespace boltenkov_s_gaussian_kernel | ||
| 243 |