| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "nikolaev_d_block_linear_image_filtering/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | #include <tbb/blocked_range2d.h> | ||
| 5 | #include <tbb/parallel_for.h> | ||
| 6 | |||
| 7 | #include <algorithm> | ||
| 8 | #include <array> | ||
| 9 | #include <cstddef> | ||
| 10 | #include <cstdint> | ||
| 11 | #include <utility> | ||
| 12 | #include <vector> | ||
| 13 | |||
| 14 | #include "nikolaev_d_block_linear_image_filtering/common/include/common.hpp" | ||
| 15 | |||
| 16 | namespace nikolaev_d_block_linear_image_filtering { | ||
| 17 | |||
| 18 | namespace { | ||
| 19 | |||
| 20 | constexpr int kBlockSize = 32; | ||
| 21 | |||
| 22 | struct GridSlicing { | ||
| 23 | std::vector<int> rows_per_proc; | ||
| 24 | std::vector<int> row_displs; | ||
| 25 | int halo_top{0}; | ||
| 26 | int halo_bottom{0}; | ||
| 27 | int buffer_height{0}; | ||
| 28 | }; | ||
| 29 | |||
| 30 | 54 | int ApplyKernel(const std::vector<uint8_t> &img, int row, int col, int channel, int width, int height, int rank, | |
| 31 | const GridSlicing &dist, const std::array<std::array<int, 3>, 3> &kernel) { | ||
| 32 | int sum = 0; | ||
| 33 | 54 | int input_start_row = dist.row_displs[rank] - dist.halo_top; | |
| 34 | |||
| 35 |
2/2✓ Branch 0 taken 162 times.
✓ Branch 1 taken 54 times.
|
216 | for (size_t kr = 0; kr < 3; ++kr) { |
| 36 | 162 | int target_global_row = dist.row_displs[rank] + row + (static_cast<int>(kr) - 1); | |
| 37 | 162 | int clamped_global_row = std::clamp(target_global_row, 0, height - 1); | |
| 38 | 162 | int buffer_row = clamped_global_row - input_start_row; | |
| 39 | |||
| 40 |
2/2✓ Branch 0 taken 486 times.
✓ Branch 1 taken 162 times.
|
648 | for (size_t kc = 0; kc < 3; ++kc) { |
| 41 | 486 | int target_col = col + (static_cast<int>(kc) - 1); | |
| 42 | 486 | int clamped_col = std::clamp(target_col, 0, width - 1); | |
| 43 | |||
| 44 | 486 | size_t idx = (((static_cast<size_t>(buffer_row) * width) + clamped_col) * 3) + channel; | |
| 45 | 486 | sum += (static_cast<int>(img[idx]) * kernel.at(kr).at(kc)); | |
| 46 | } | ||
| 47 | } | ||
| 48 | 54 | return sum; | |
| 49 | } | ||
| 50 | |||
| 51 | ✗ | void ProcessFullBlock(const std::vector<uint8_t> &input, std::vector<uint8_t> &output, int width, int height, int rank, | |
| 52 | const GridSlicing &dist, int start_row, int start_col) { | ||
| 53 | static constexpr std::array<std::array<int, 3>, 3> kKernel = {{{1, 2, 1}, {2, 4, 2}, {1, 2, 1}}}; | ||
| 54 | |||
| 55 | ✗ | for (int row = start_row; row < start_row + kBlockSize; ++row) { | |
| 56 | ✗ | for (int col = start_col; col < start_col + kBlockSize; ++col) { | |
| 57 | ✗ | for (int channel = 0; channel < 3; ++channel) { | |
| 58 | ✗ | int sum = ApplyKernel(input, row, col, channel, width, height, rank, dist, kKernel); | |
| 59 | ✗ | int result_value = (sum + 8) / 16; | |
| 60 | result_value = std::clamp(result_value, 0, 255); | ||
| 61 | ✗ | auto idx = ((static_cast<size_t>(row) * width + col) * 3) + channel; | |
| 62 | ✗ | output[idx] = static_cast<uint8_t>(result_value); | |
| 63 | } | ||
| 64 | } | ||
| 65 | } | ||
| 66 | ✗ | } | |
| 67 | |||
| 68 | 5 | void ProcessPartBlock(const std::vector<uint8_t> &input, std::vector<uint8_t> &output, int width, int height, | |
| 69 | int local_rows, int rank, const GridSlicing &dist, int start_row, int start_col) { | ||
| 70 | static constexpr std::array<std::array<int, 3>, 3> kKernel = {{{1, 2, 1}, {2, 4, 2}, {1, 2, 1}}}; | ||
| 71 | |||
| 72 | 5 | const int end_row = std::min(local_rows, start_row + kBlockSize); | |
| 73 | 5 | const int end_col = std::min(width, start_col + kBlockSize); | |
| 74 | |||
| 75 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 5 times.
|
13 | for (int row = start_row; row < end_row; ++row) { |
| 76 |
2/2✓ Branch 0 taken 18 times.
✓ Branch 1 taken 8 times.
|
26 | for (int col = start_col; col < end_col; ++col) { |
| 77 |
2/2✓ Branch 0 taken 54 times.
✓ Branch 1 taken 18 times.
|
72 | for (int channel = 0; channel < 3; ++channel) { |
| 78 | 54 | int sum = ApplyKernel(input, row, col, channel, width, height, rank, dist, kKernel); | |
| 79 |
1/2✓ Branch 0 taken 54 times.
✗ Branch 1 not taken.
|
54 | int result_value = (sum + 8) / 16; |
| 80 | result_value = std::clamp(result_value, 0, 255); | ||
| 81 | 54 | auto idx = ((static_cast<size_t>(row) * width + col) * 3) + channel; | |
| 82 | 54 | output[idx] = static_cast<uint8_t>(result_value); | |
| 83 | } | ||
| 84 | } | ||
| 85 | } | ||
| 86 | 5 | } | |
| 87 | |||
| 88 | 5 | void ProcessGridRange(const tbb::blocked_range2d<int> &r, int local_block_rows, int num_col_blocks, int local_rows, | |
| 89 | int width, int height, int rank, const GridSlicing &dist, const std::vector<uint8_t> &local_input, | ||
| 90 | std::vector<uint8_t> &local_output) { | ||
| 91 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | for (int bi = r.rows().begin(); bi < r.rows().end(); ++bi) { |
| 92 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | for (int bj = r.cols().begin(); bj < r.cols().end(); ++bj) { |
| 93 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | if (bi < local_block_rows && bj < num_col_blocks) { |
| 94 | ✗ | ProcessFullBlock(local_input, local_output, width, height, rank, dist, bi * kBlockSize, bj * kBlockSize); | |
| 95 | } else { | ||
| 96 | 5 | ProcessPartBlock(local_input, local_output, width, height, local_rows, rank, dist, bi * kBlockSize, | |
| 97 | bj * kBlockSize); | ||
| 98 | } | ||
| 99 | } | ||
| 100 | } | ||
| 101 | 5 | } | |
| 102 | |||
| 103 | 10 | GridSlicing BuildGridSlicing(int rank, int world_size, int height) { | |
| 104 | 10 | const int total_block_rows = height / kBlockSize; | |
| 105 | 10 | const int height_remainder = height % kBlockSize; | |
| 106 | |||
| 107 | 10 | std::vector<int> block_rows_per_proc(world_size); | |
| 108 | 10 | const int base_blocks = total_block_rows / world_size; | |
| 109 | 10 | const int extra_blocks = total_block_rows % world_size; | |
| 110 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 10 times.
|
30 | for (int proc = 0; proc < world_size; ++proc) { |
| 111 |
1/2✓ Branch 0 taken 20 times.
✗ Branch 1 not taken.
|
40 | block_rows_per_proc[proc] = base_blocks + (proc < extra_blocks ? 1 : 0); |
| 112 | } | ||
| 113 | |||
| 114 | 10 | GridSlicing dist; | |
| 115 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | dist.rows_per_proc.resize(world_size); |
| 116 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | dist.row_displs.resize(world_size); |
| 117 | int pixel_offset = 0; | ||
| 118 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 10 times.
|
30 | for (int proc = 0; proc < world_size; ++proc) { |
| 119 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
20 | int rows = block_rows_per_proc[proc] * kBlockSize; |
| 120 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
20 | if (proc == world_size - 1) { |
| 121 | 10 | rows += height_remainder; | |
| 122 | } | ||
| 123 | 20 | dist.rows_per_proc[proc] = rows; | |
| 124 | 20 | dist.row_displs[proc] = pixel_offset; | |
| 125 | 20 | pixel_offset += rows; | |
| 126 | } | ||
| 127 | |||
| 128 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (dist.rows_per_proc[rank] > 0) { |
| 129 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | dist.halo_top = (dist.row_displs[rank] > 0) ? 1 : 0; |
| 130 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
10 | dist.halo_bottom = (dist.row_displs[rank] + dist.rows_per_proc[rank] < height) ? 1 : 0; |
| 131 | } | ||
| 132 |
1/2✓ Branch 0 taken 10 times.
✗ Branch 1 not taken.
|
10 | dist.buffer_height = dist.rows_per_proc[rank] + dist.halo_top + dist.halo_bottom; |
| 133 | |||
| 134 | 10 | return dist; | |
| 135 | ✗ | } | |
| 136 | |||
| 137 | std::pair<int, int> HaloFor(int proc, const GridSlicing &dist, int height) { | ||
| 138 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 10 times.
|
20 | if (dist.rows_per_proc[proc] == 0) { |
| 139 | return {0, 0}; | ||
| 140 | } | ||
| 141 |
1/2✓ Branch 0 taken 10 times.
✗ Branch 1 not taken.
|
10 | int top = (dist.row_displs[proc] > 0) ? 1 : 0; |
| 142 |
1/2✓ Branch 0 taken 10 times.
✗ Branch 1 not taken.
|
10 | int bot = (dist.row_displs[proc] + dist.rows_per_proc[proc] < height) ? 1 : 0; |
| 143 | return {top, bot}; | ||
| 144 | } | ||
| 145 | |||
| 146 | 10 | void ScatterWithHalo(int rank, int world_size, int width, int height, const GridSlicing &dist, | |
| 147 | const uint8_t *full_image, std::vector<uint8_t> &local_input) { | ||
| 148 | 10 | std::vector<int> scatter_counts(world_size); | |
| 149 |
1/4✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
10 | std::vector<int> scatter_displs(world_size); |
| 150 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 10 times.
|
30 | for (int proc = 0; proc < world_size; ++proc) { |
| 151 | auto [proc_top, proc_bot] = HaloFor(proc, dist, height); | ||
| 152 | 20 | int proc_buffer_rows = dist.rows_per_proc[proc] + proc_top + proc_bot; | |
| 153 | 20 | scatter_counts[proc] = proc_buffer_rows * width * 3; | |
| 154 | 20 | scatter_displs[proc] = (dist.row_displs[proc] - proc_top) * width * 3; | |
| 155 | } | ||
| 156 | |||
| 157 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | const uint8_t *send_buf = (rank == 0) ? full_image : nullptr; |
| 158 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | MPI_Scatterv(send_buf, scatter_counts.data(), scatter_displs.data(), MPI_UNSIGNED_CHAR, local_input.data(), |
| 159 | static_cast<int>(local_input.size()), MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); | ||
| 160 | 10 | } | |
| 161 | |||
| 162 | 10 | void RunLocalTBB(int rank, int world_size, int width, int height, const GridSlicing &dist, | |
| 163 | const std::vector<uint8_t> &local_input, std::vector<uint8_t> &local_output) { | ||
| 164 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | const int local_rows = dist.rows_per_proc[rank]; |
| 165 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (local_rows == 0) { |
| 166 | 5 | return; | |
| 167 | } | ||
| 168 | |||
| 169 | 5 | const int total_block_rows = height / kBlockSize; | |
| 170 | 5 | const int height_remainder = height % kBlockSize; | |
| 171 | 5 | const int num_col_blocks = width / kBlockSize; | |
| 172 | 5 | const bool width_has_remainder = (width % kBlockSize) != 0; | |
| 173 | |||
| 174 | 5 | const int local_block_rows = | |
| 175 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 5 times.
|
5 | (rank < total_block_rows % world_size) ? ((total_block_rows / world_size) + 1) : (total_block_rows / world_size); |
| 176 | 5 | const bool is_last = (rank == world_size - 1); | |
| 177 | |||
| 178 | 5 | const int total_local_bi = local_block_rows + static_cast<int>(is_last && height_remainder > 0); | |
| 179 | 5 | const int total_local_bj = num_col_blocks + static_cast<int>(width_has_remainder); | |
| 180 | |||
| 181 | 5 | tbb::parallel_for(tbb::blocked_range2d<int>(0, total_local_bi, 0, total_local_bj), | |
| 182 | 5 | [&](const tbb::blocked_range2d<int> &r) { | |
| 183 | 5 | ProcessGridRange(r, local_block_rows, num_col_blocks, local_rows, width, height, rank, dist, local_input, | |
| 184 | local_output); | ||
| 185 | 5 | }); | |
| 186 | } | ||
| 187 | |||
| 188 | 10 | void GatherAndBroadcast(int world_size, int width, int height, const GridSlicing &dist, | |
| 189 | const std::vector<uint8_t> &local_output, std::vector<uint8_t> &result) { | ||
| 190 | 10 | std::vector<int> recv_counts(world_size); | |
| 191 |
1/4✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
10 | std::vector<int> recv_displs(world_size); |
| 192 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 10 times.
|
30 | for (int proc = 0; proc < world_size; ++proc) { |
| 193 | 20 | recv_counts[proc] = dist.rows_per_proc[proc] * width * 3; | |
| 194 | 20 | recv_displs[proc] = dist.row_displs[proc] * width * 3; | |
| 195 | } | ||
| 196 |
2/6✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 10 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
10 | result.assign(static_cast<size_t>(height) * width * 3, 0); |
| 197 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | MPI_Gatherv(local_output.data(), static_cast<int>(local_output.size()), MPI_UNSIGNED_CHAR, result.data(), |
| 198 | recv_counts.data(), recv_displs.data(), MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); | ||
| 199 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | MPI_Bcast(result.data(), static_cast<int>(result.size()), MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); |
| 200 | 10 | } | |
| 201 | |||
| 202 | } // namespace | ||
| 203 | |||
| 204 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | NikolaevDBlockLinearImageFilteringALL::NikolaevDBlockLinearImageFilteringALL(const InType &in) { |
| 205 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 206 | 10 | int rank = 0; | |
| 207 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); |
| 208 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (rank == 0) { |
| 209 | GetInput() = in; | ||
| 210 | } | ||
| 211 | 10 | GetOutput() = std::vector<uint8_t>(); | |
| 212 | 10 | } | |
| 213 | |||
| 214 | 10 | bool NikolaevDBlockLinearImageFilteringALL::ValidationImpl() { | |
| 215 | 10 | int rank = 0; | |
| 216 | 10 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 217 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (rank != 0) { |
| 218 | return true; | ||
| 219 | } | ||
| 220 | 5 | return std::get<0>(GetInput()) * std::get<1>(GetInput()) * 3 == static_cast<int>(std::get<2>(GetInput()).size()); | |
| 221 | } | ||
| 222 | |||
| 223 | 10 | bool NikolaevDBlockLinearImageFilteringALL::PreProcessingImpl() { | |
| 224 | 10 | return true; | |
| 225 | } | ||
| 226 | |||
| 227 | 10 | bool NikolaevDBlockLinearImageFilteringALL::RunImpl() { | |
| 228 | 10 | int rank = 0; | |
| 229 | 10 | int world_size = 1; | |
| 230 | 10 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 231 | 10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); | |
| 232 | |||
| 233 | 10 | std::array<int, 2> dims{}; | |
| 234 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (rank == 0) { |
| 235 | 5 | dims[0] = std::get<0>(GetInput()); | |
| 236 | 5 | dims[1] = std::get<1>(GetInput()); | |
| 237 | } | ||
| 238 | 10 | MPI_Bcast(dims.data(), 2, MPI_INT, 0, MPI_COMM_WORLD); | |
| 239 | 10 | const int width = dims[0]; | |
| 240 | 10 | const int height = dims[1]; | |
| 241 | |||
| 242 | 10 | const GridSlicing dist = BuildGridSlicing(rank, world_size, height); | |
| 243 | |||
| 244 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | std::vector<uint8_t> local_input(static_cast<size_t>(dist.buffer_height) * width * 3); |
| 245 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | const uint8_t *full_image = (rank == 0) ? std::get<2>(GetInput()).data() : nullptr; |
| 246 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | ScatterWithHalo(rank, world_size, width, height, dist, full_image, local_input); |
| 247 | |||
| 248 |
1/4✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
10 | std::vector<uint8_t> local_output(static_cast<size_t>(dist.rows_per_proc[rank]) * width * 3); |
| 249 | |||
| 250 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | RunLocalTBB(rank, world_size, width, height, dist, local_input, local_output); |
| 251 | |||
| 252 | 10 | std::vector<uint8_t> result; | |
| 253 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | GatherAndBroadcast(world_size, width, height, dist, local_output, result); |
| 254 | GetOutput() = std::move(result); | ||
| 255 | 10 | return true; | |
| 256 | 10 | } | |
| 257 | |||
| 258 | 10 | bool NikolaevDBlockLinearImageFilteringALL::PostProcessingImpl() { | |
| 259 | 10 | return true; | |
| 260 | } | ||
| 261 | |||
| 262 | } // namespace nikolaev_d_block_linear_image_filtering | ||
| 263 |