| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "romanov_a_gauss_block/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <array> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <cstdint> | ||
| 9 | #include <thread> | ||
| 10 | #include <utility> | ||
| 11 | #include <vector> | ||
| 12 | |||
| 13 | #include "romanov_a_gauss_block/common/include/common.hpp" | ||
| 14 | #include "util/include/util.hpp" | ||
| 15 | |||
| 16 | namespace romanov_a_gauss_block { | ||
| 17 | |||
| 18 | namespace { | ||
| 19 | |||
// Edge length, in pixels, of the square tiles the local image strip is processed in.
constexpr int kBlockSize{32};
| 21 | |||
// Row-wise split of the image across MPI ranks, plus the halo layout of the rank that built it.
struct Distribution {
  std::vector<int> rows_per_proc;  // number of image rows owned by each rank
  std::vector<int> row_displs;     // index of the first owned image row for each rank
  int halo_top = 0;                // extra rows stored above the owned rows (0 or 1)
  int halo_bottom = 0;             // extra rows stored below the owned rows (0 or 1)
  int buffer_height = 0;           // owned rows + halo_top + halo_bottom
};
| 29 | |||
// Computes the un-normalised 3x3 weighted sum around pixel (row, col) for one colour channel.
//
// `img` is an interleaved 3-channel buffer of `buffer_height` rows by `width` columns.
// `row` is relative to the first owned row; `halo_top` shifts it to the buffer row index.
// Neighbours that fall outside the buffer are skipped, i.e. they contribute zero.
// Returns the raw sum; the caller is responsible for normalisation.
int ApplyKernel(const std::vector<uint8_t> &img, int row, int col, int channel, int width, int buffer_height,
                int halo_top, const std::array<std::array<int, 3>, 3> &kernel) {
  int acc = 0;
  for (int dr = -1; dr <= 1; ++dr) {
    for (int dc = -1; dc <= 1; ++dc) {
      const int src_row = row + dr + halo_top;
      const int src_col = col + dc;
      const bool row_inside = src_row >= 0 && src_row < buffer_height;
      const bool col_inside = src_col >= 0 && src_col < width;
      if (!row_inside || !col_inside) {
        continue;
      }
      const size_t offset = (((static_cast<size_t>(src_row) * width) + src_col) * 3) + channel;
      const int weight = kernel.at(static_cast<size_t>(dr + 1)).at(static_cast<size_t>(dc + 1));
      acc += static_cast<int>(img[offset]) * weight;
    }
  }
  return acc;
}
| 46 | |||
| 47 | 9 | void ProcessFullBlock(const std::vector<uint8_t> &input, std::vector<uint8_t> &output, int width, int buffer_height, | |
| 48 | int halo_top, int start_row, int start_col) { | ||
| 49 | static constexpr std::array<std::array<int, 3>, 3> kKernel = {{{1, 2, 1}, {2, 4, 2}, {1, 2, 1}}}; | ||
| 50 | |||
| 51 |
2/2✓ Branch 0 taken 288 times.
✓ Branch 1 taken 9 times.
|
297 | for (int row = start_row; row < start_row + kBlockSize; ++row) { |
| 52 |
2/2✓ Branch 0 taken 9216 times.
✓ Branch 1 taken 288 times.
|
9504 | for (int col = start_col; col < start_col + kBlockSize; ++col) { |
| 53 |
2/2✓ Branch 0 taken 27648 times.
✓ Branch 1 taken 9216 times.
|
36864 | for (int channel = 0; channel < 3; ++channel) { |
| 54 | 27648 | int sum = ApplyKernel(input, row, col, channel, width, buffer_height, halo_top, kKernel); | |
| 55 |
1/2✓ Branch 0 taken 27648 times.
✗ Branch 1 not taken.
|
27648 | int result_value = (sum + 8) / 16; |
| 56 | result_value = std::clamp(result_value, 0, 255); | ||
| 57 | 27648 | auto idx = ((static_cast<size_t>(row) * width + col) * 3) + channel; | |
| 58 | 27648 | output[idx] = static_cast<uint8_t>(result_value); | |
| 59 | } | ||
| 60 | } | ||
| 61 | } | ||
| 62 | 9 | } | |
| 63 | |||
| 64 | 14 | void ProcessPartBlock(const std::vector<uint8_t> &input, std::vector<uint8_t> &output, int width, int local_rows, | |
| 65 | int buffer_height, int halo_top, int start_row, int start_col) { | ||
| 66 | static constexpr std::array<std::array<int, 3>, 3> kKernel = {{{1, 2, 1}, {2, 4, 2}, {1, 2, 1}}}; | ||
| 67 | |||
| 68 | 14 | const int end_row = std::min(local_rows, start_row + kBlockSize); | |
| 69 | 14 | const int end_col = std::min(width, start_col + kBlockSize); | |
| 70 | |||
| 71 |
2/2✓ Branch 0 taken 125 times.
✓ Branch 1 taken 14 times.
|
139 | for (int row = start_row; row < end_row; ++row) { |
| 72 |
2/2✓ Branch 0 taken 812 times.
✓ Branch 1 taken 125 times.
|
937 | for (int col = start_col; col < end_col; ++col) { |
| 73 |
2/2✓ Branch 0 taken 2436 times.
✓ Branch 1 taken 812 times.
|
3248 | for (int channel = 0; channel < 3; ++channel) { |
| 74 | 2436 | int sum = ApplyKernel(input, row, col, channel, width, buffer_height, halo_top, kKernel); | |
| 75 |
1/2✓ Branch 0 taken 2436 times.
✗ Branch 1 not taken.
|
2436 | int result_value = (sum + 8) / 16; |
| 76 | result_value = std::clamp(result_value, 0, 255); | ||
| 77 | 2436 | auto idx = ((static_cast<size_t>(row) * width + col) * 3) + channel; | |
| 78 | 2436 | output[idx] = static_cast<uint8_t>(result_value); | |
| 79 | } | ||
| 80 | } | ||
| 81 | } | ||
| 82 | 14 | } | |
| 83 | |||
| 84 | 16 | Distribution BuildDistribution(int rank, int world_size, int height) { | |
| 85 | 16 | const int total_block_rows = height / kBlockSize; | |
| 86 | 16 | const int height_remainder = height % kBlockSize; | |
| 87 | |||
| 88 | 16 | std::vector<int> block_rows_per_proc(world_size); | |
| 89 | 16 | const int base_blocks = total_block_rows / world_size; | |
| 90 | 16 | const int extra_blocks = total_block_rows % world_size; | |
| 91 |
2/2✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
|
48 | for (int proc = 0; proc < world_size; ++proc) { |
| 92 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 2 times.
|
62 | block_rows_per_proc[proc] = base_blocks + (proc < extra_blocks ? 1 : 0); |
| 93 | } | ||
| 94 | |||
| 95 | 16 | Distribution dist; | |
| 96 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | dist.rows_per_proc.resize(world_size); |
| 97 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | dist.row_displs.resize(world_size); |
| 98 | int pixel_offset = 0; | ||
| 99 |
2/2✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
|
48 | for (int proc = 0; proc < world_size; ++proc) { |
| 100 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 16 times.
|
32 | int rows = block_rows_per_proc[proc] * kBlockSize; |
| 101 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 16 times.
|
32 | if (proc == world_size - 1) { |
| 102 | 16 | rows += height_remainder; | |
| 103 | } | ||
| 104 | 32 | dist.rows_per_proc[proc] = rows; | |
| 105 | 32 | dist.row_displs[proc] = pixel_offset; | |
| 106 | 32 | pixel_offset += rows; | |
| 107 | } | ||
| 108 | |||
| 109 | // halo для текущего ранга | ||
| 110 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 7 times.
|
16 | if (dist.rows_per_proc[rank] > 0) { |
| 111 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 1 times.
|
9 | dist.halo_top = (dist.row_displs[rank] > 0) ? 1 : 0; |
| 112 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 1 times.
|
17 | dist.halo_bottom = (dist.row_displs[rank] + dist.rows_per_proc[rank] < height) ? 1 : 0; |
| 113 | } | ||
| 114 |
1/2✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
|
16 | dist.buffer_height = dist.rows_per_proc[rank] + dist.halo_top + dist.halo_bottom; |
| 115 | |||
| 116 | 16 | return dist; | |
| 117 | ✗ | } | |
| 118 | |||
| 119 | std::pair<int, int> HaloFor(int proc, const Distribution &dist, int height) { | ||
| 120 |
2/2✓ Branch 0 taken 18 times.
✓ Branch 1 taken 14 times.
|
32 | if (dist.rows_per_proc[proc] == 0) { |
| 121 | return {0, 0}; | ||
| 122 | } | ||
| 123 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | int top = (dist.row_displs[proc] > 0) ? 1 : 0; |
| 124 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 2 times.
|
18 | int bot = (dist.row_displs[proc] + dist.rows_per_proc[proc] < height) ? 1 : 0; |
| 125 | return {top, bot}; | ||
| 126 | } | ||
| 127 | |||
| 128 | 16 | void ScatterWithHalo(int rank, int world_size, int width, int height, const Distribution &dist, | |
| 129 | const uint8_t *full_image, std::vector<uint8_t> &local_input) { | ||
| 130 | 16 | std::vector<int> scatter_counts(world_size); | |
| 131 |
1/4✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
16 | std::vector<int> scatter_displs(world_size); |
| 132 |
2/2✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
|
48 | for (int proc = 0; proc < world_size; ++proc) { |
| 133 | auto [proc_top, proc_bot] = HaloFor(proc, dist, height); | ||
| 134 | 32 | int proc_buffer_rows = dist.rows_per_proc[proc] + proc_top + proc_bot; | |
| 135 | 32 | scatter_counts[proc] = proc_buffer_rows * width * 3; | |
| 136 | 32 | scatter_displs[proc] = (dist.row_displs[proc] - proc_top) * width * 3; | |
| 137 | } | ||
| 138 | |||
| 139 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | const uint8_t *send_buf = (rank == 0) ? full_image : nullptr; |
| 140 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | MPI_Scatterv(send_buf, scatter_counts.data(), scatter_displs.data(), MPI_UNSIGNED_CHAR, local_input.data(), |
| 141 | static_cast<int>(local_input.size()), MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); | ||
| 142 | 16 | } | |
| 143 | |||
| 144 | 15 | void ProcessThreadShare(int current_part, int num_threads, int local_block_rows, int num_col_blocks, int local_rows, | |
| 145 | int height_remainder, bool is_last, bool width_has_remainder, int width, int buffer_height, | ||
| 146 | int halo_top, const std::vector<uint8_t> &local_input, std::vector<uint8_t> &local_output) { | ||
| 147 | 15 | const int start_col_tail = num_col_blocks * kBlockSize; | |
| 148 | 15 | const int bottom_row_start = local_block_rows * kBlockSize; | |
| 149 | |||
| 150 | 15 | int left_border_r = (local_block_rows * current_part) / num_threads; | |
| 151 | 15 | int right_border_r = (local_block_rows * (current_part + 1)) / num_threads; | |
| 152 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 15 times.
|
18 | for (int bi = left_border_r; bi < right_border_r; ++bi) { |
| 153 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 3 times.
|
12 | for (int bj = 0; bj < num_col_blocks; ++bj) { |
| 154 | 9 | ProcessFullBlock(local_input, local_output, width, buffer_height, halo_top, bi * kBlockSize, bj * kBlockSize); | |
| 155 | } | ||
| 156 | } | ||
| 157 | |||
| 158 |
1/2✓ Branch 0 taken 15 times.
✗ Branch 1 not taken.
|
15 | if (width_has_remainder) { |
| 159 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 15 times.
|
18 | for (int bi = left_border_r; bi < right_border_r; ++bi) { |
| 160 | 3 | ProcessPartBlock(local_input, local_output, width, local_rows, buffer_height, halo_top, bi * kBlockSize, | |
| 161 | start_col_tail); | ||
| 162 | } | ||
| 163 | } | ||
| 164 | |||
| 165 |
2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 2 times.
|
15 | if (is_last && height_remainder > 0) { |
| 166 | 13 | int left_border_l = (num_col_blocks * current_part) / num_threads; | |
| 167 | 13 | int right_border_l = (num_col_blocks * (current_part + 1)) / num_threads; | |
| 168 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 13 times.
|
16 | for (int bj = left_border_l; bj < right_border_l; ++bj) { |
| 169 | 3 | ProcessPartBlock(local_input, local_output, width, local_rows, buffer_height, halo_top, bottom_row_start, | |
| 170 | bj * kBlockSize); | ||
| 171 | } | ||
| 172 | } | ||
| 173 | 15 | } | |
| 174 | |||
| 175 | 16 | void RunLocal(int rank, int world_size, int width, int height, const Distribution &dist, | |
| 176 | const std::vector<uint8_t> &local_input, std::vector<uint8_t> &local_output) { | ||
| 177 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 9 times.
|
16 | const int local_rows = dist.rows_per_proc[rank]; |
| 178 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 9 times.
|
16 | if (local_rows == 0) { |
| 179 | 7 | return; | |
| 180 | } | ||
| 181 | |||
| 182 | 9 | const int total_block_rows = height / kBlockSize; | |
| 183 | 9 | const int height_remainder = height % kBlockSize; | |
| 184 | 9 | const int num_col_blocks = width / kBlockSize; | |
| 185 | 9 | const bool width_has_remainder = (width % kBlockSize) != 0; | |
| 186 | 9 | const int local_block_rows = | |
| 187 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 8 times.
|
9 | (rank < total_block_rows % world_size) ? ((total_block_rows / world_size) + 1) : (total_block_rows / world_size); |
| 188 | 9 | const bool is_last = (rank == world_size - 1); | |
| 189 | |||
| 190 | 9 | int num_threads = std::max(1, ppc::util::GetNumThreads()); | |
| 191 | 9 | num_threads = std::min(num_threads, local_rows); | |
| 192 | |||
| 193 | 9 | std::vector<std::thread> threads; | |
| 194 |
1/2✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
|
9 | threads.reserve(num_threads); |
| 195 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 9 times.
|
24 | for (int tid = 0; tid < num_threads; ++tid) { |
| 196 |
1/2✓ Branch 1 taken 15 times.
✗ Branch 2 not taken.
|
15 | threads.emplace_back([&, tid]() { |
| 197 | 15 | ProcessThreadShare(tid, num_threads, local_block_rows, num_col_blocks, local_rows, height_remainder, is_last, | |
| 198 | 15 | width_has_remainder, width, dist.buffer_height, dist.halo_top, local_input, local_output); | |
| 199 | 15 | }); | |
| 200 | } | ||
| 201 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 9 times.
|
24 | for (auto &th : threads) { |
| 202 |
1/2✓ Branch 1 taken 15 times.
✗ Branch 2 not taken.
|
15 | th.join(); |
| 203 | } | ||
| 204 | |||
| 205 |
3/4✓ Branch 0 taken 8 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.
|
9 | if (is_last && height_remainder > 0) { |
| 206 | 8 | ProcessPartBlock(local_input, local_output, width, local_rows, dist.buffer_height, dist.halo_top, | |
| 207 | 8 | local_block_rows * kBlockSize, num_col_blocks * kBlockSize); | |
| 208 | } | ||
| 209 | 9 | } | |
| 210 | |||
| 211 | 16 | void GatherAndBroadcast(int world_size, int width, int height, const Distribution &dist, | |
| 212 | const std::vector<uint8_t> &local_output, std::vector<uint8_t> &result) { | ||
| 213 | 16 | std::vector<int> recv_counts(world_size); | |
| 214 |
1/4✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
16 | std::vector<int> recv_displs(world_size); |
| 215 |
2/2✓ Branch 0 taken 32 times.
✓ Branch 1 taken 16 times.
|
48 | for (int proc = 0; proc < world_size; ++proc) { |
| 216 | 32 | recv_counts[proc] = dist.rows_per_proc[proc] * width * 3; | |
| 217 | 32 | recv_displs[proc] = dist.row_displs[proc] * width * 3; | |
| 218 | } | ||
| 219 |
2/6✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 16 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
16 | result.assign(static_cast<size_t>(height) * width * 3, 0); |
| 220 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | MPI_Gatherv(local_output.data(), static_cast<int>(local_output.size()), MPI_UNSIGNED_CHAR, result.data(), |
| 221 | recv_counts.data(), recv_displs.data(), MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); | ||
| 222 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | MPI_Bcast(result.data(), static_cast<int>(result.size()), MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); |
| 223 | 16 | } | |
| 224 | |||
| 225 | } // namespace | ||
| 226 | |||
| 227 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | RomanovAGaussBlockALL::RomanovAGaussBlockALL(const InType &in) { |
| 228 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 229 | 16 | int rank = 0; | |
| 230 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); |
| 231 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | if (rank == 0) { |
| 232 | GetInput() = in; | ||
| 233 | } | ||
| 234 | 16 | GetOutput() = std::vector<uint8_t>(); | |
| 235 | 16 | } | |
| 236 | |||
| 237 | 16 | bool RomanovAGaussBlockALL::ValidationImpl() { | |
| 238 | 16 | int rank = 0; | |
| 239 | 16 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 240 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | if (rank != 0) { |
| 241 | return true; | ||
| 242 | } | ||
| 243 | 8 | return std::get<0>(GetInput()) * std::get<1>(GetInput()) * 3 == static_cast<int>(std::get<2>(GetInput()).size()); | |
| 244 | } | ||
| 245 | |||
| 246 | 16 | bool RomanovAGaussBlockALL::PreProcessingImpl() { | |
| 247 | 16 | return true; | |
| 248 | } | ||
| 249 | |||
| 250 | 16 | bool RomanovAGaussBlockALL::RunImpl() { | |
| 251 | 16 | int rank = 0; | |
| 252 | 16 | int world_size = 1; | |
| 253 | 16 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 254 | 16 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); | |
| 255 | |||
| 256 | 16 | std::array<int, 2> dims{}; | |
| 257 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | if (rank == 0) { |
| 258 | 8 | dims[0] = std::get<0>(GetInput()); | |
| 259 | 8 | dims[1] = std::get<1>(GetInput()); | |
| 260 | } | ||
| 261 | 16 | MPI_Bcast(dims.data(), 2, MPI_INT, 0, MPI_COMM_WORLD); | |
| 262 | 16 | const int width = dims[0]; | |
| 263 | 16 | const int height = dims[1]; | |
| 264 | |||
| 265 | 16 | const Distribution dist = BuildDistribution(rank, world_size, height); | |
| 266 | |||
| 267 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | std::vector<uint8_t> local_input(static_cast<size_t>(dist.buffer_height) * width * 3); |
| 268 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | const uint8_t *full_image = (rank == 0) ? std::get<2>(GetInput()).data() : nullptr; |
| 269 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | ScatterWithHalo(rank, world_size, width, height, dist, full_image, local_input); |
| 270 | |||
| 271 |
1/4✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
16 | std::vector<uint8_t> local_output(static_cast<size_t>(dist.rows_per_proc[rank]) * width * 3); |
| 272 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | RunLocal(rank, world_size, width, height, dist, local_input, local_output); |
| 273 | |||
| 274 | 16 | std::vector<uint8_t> result; | |
| 275 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | GatherAndBroadcast(world_size, width, height, dist, local_output, result); |
| 276 | GetOutput() = std::move(result); | ||
| 277 | 16 | return true; | |
| 278 | 16 | } | |
| 279 | |||
| 280 | 16 | bool RomanovAGaussBlockALL::PostProcessingImpl() { | |
| 281 | 16 | return true; | |
| 282 | } | ||
| 283 | |||
| 284 | } // namespace romanov_a_gauss_block | ||
| 285 |