| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "moskaev_v_lin_filt_block_gauss_3/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <cmath> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <cstdint> | ||
| 9 | #include <functional> | ||
| 10 | #include <thread> | ||
| 11 | #include <utility> | ||
| 12 | #include <vector> | ||
| 13 | |||
| 14 | #include "moskaev_v_lin_filt_block_gauss_3/common/include/common.hpp" | ||
| 15 | |||
| 16 | namespace moskaev_v_lin_filt_block_gauss_3 { | ||
| 17 | |||
| 18 | namespace { | ||
| 19 | |||
/// Copies one image block plus a one-pixel halo into `dst`.
///
/// `dst` is addressed as a (block_h + 2) x padded_w grid of `channels`-byte pixels. Cell (r, c)
/// receives the source pixel at (block_y + r - 1, block_x + c - 1); coordinates falling outside
/// the image are clamped, so border pixels are replicated into the halo.
///
/// @param src        interleaved source image, indexed as (row * src_width + col) * channels + ch
/// @param dst        destination buffer; must already hold (block_h + 2) * padded_w * channels bytes
/// @param src_width  image width in pixels
/// @param src_height image height in pixels
/// @param channels   bytes per pixel
/// @param block_x    column of the block's top-left pixel in the image
/// @param block_y    row of the block's top-left pixel in the image
/// @param block_w    block width in pixels (without halo)
/// @param block_h    block height in pixels (without halo)
/// @param padded_w   row stride of `dst` in pixels
void CopyBlockWithHalo(const std::vector<uint8_t> &src, std::vector<uint8_t> &dst, int src_width, int src_height,
                       int channels, int block_x, int block_y, int block_w, int block_h, int padded_w) {
  // std::clamp(v, 0, size - 1) is undefined when size < 1, and there is nothing to copy anyway.
  if (src_width <= 0 || src_height <= 0 || channels <= 0) {
    return;
  }

  const auto chan = static_cast<size_t>(channels);
  for (int row = -1; row <= block_h; ++row) {
    // Row clamping and both row offsets do not depend on the column, so compute them once per row.
    const int src_row = std::clamp(block_y + row, 0, src_height - 1);
    const size_t src_row_base = static_cast<size_t>(src_row) * static_cast<size_t>(src_width);
    const size_t dst_row_base = static_cast<size_t>(row + 1) * padded_w;

    for (int col = -1; col <= block_w; ++col) {
      const int src_col = std::clamp(block_x + col, 0, src_width - 1);
      const size_t src_px = (src_row_base + static_cast<size_t>(src_col)) * chan;
      const size_t dst_px = (dst_row_base + static_cast<size_t>(col + 1)) * chan;
      for (size_t ch = 0; ch < chan; ++ch) {
        dst[dst_px + ch] = src[src_px + ch];
      }
    }
  }
}
| 36 | |||
| 37 | 26 | void FilterPixelInBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w, | |
| 38 | int channels, int row, int col, int ch) { | ||
| 39 | float sum = 0.0F; | ||
| 40 |
2/2✓ Branch 0 taken 78 times.
✓ Branch 1 taken 26 times.
|
104 | for (int ky = -1; ky <= 1; ++ky) { |
| 41 |
2/2✓ Branch 0 taken 234 times.
✓ Branch 1 taken 78 times.
|
312 | for (int kx = -1; kx <= 1; ++kx) { |
| 42 | 234 | int ny = row + 1 + ky; | |
| 43 | 234 | int nx = col + 1 + kx; | |
| 44 | 234 | size_t idx = ((static_cast<size_t>(ny) * (block_w + 2) + nx) * channels) + ch; | |
| 45 | 234 | int kidx = ((ky + 1) * 3) + (kx + 1); | |
| 46 | 234 | sum += static_cast<float>(input_block[idx]) * kGaussianKernel[kidx]; | |
| 47 | } | ||
| 48 | } | ||
| 49 | 26 | size_t out_idx = ((static_cast<size_t>(row) * block_w + col) * channels) + ch; | |
| 50 | 26 | output_block[out_idx] = static_cast<uint8_t>(std::round(sum)); | |
| 51 | 26 | } | |
| 52 | |||
| 53 | 8 | void FilterBlockRange(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w, | |
| 54 | int channels, int start_row, int end_row) { | ||
| 55 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
|
16 | for (int row = start_row; row < end_row; ++row) { |
| 56 |
2/2✓ Branch 0 taken 18 times.
✓ Branch 1 taken 8 times.
|
26 | for (int col = 0; col < block_w; ++col) { |
| 57 |
2/2✓ Branch 0 taken 26 times.
✓ Branch 1 taken 18 times.
|
44 | for (int ch = 0; ch < channels; ++ch) { |
| 58 | 26 | FilterPixelInBlock(input_block, output_block, block_w, channels, row, col, ch); | |
| 59 | } | ||
| 60 | } | ||
| 61 | } | ||
| 62 | 8 | } | |
| 63 | |||
| 64 | 4 | void FilterBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w, int block_h, | |
| 65 | int channels) { | ||
| 66 | 4 | int num_threads = static_cast<int>(std::thread::hardware_concurrency()); | |
| 67 |
3/4✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 3 times.
|
4 | if (num_threads <= 1 || block_h < 2) { |
| 68 | 1 | FilterBlockRange(input_block, output_block, block_w, channels, 0, block_h); | |
| 69 | 1 | return; | |
| 70 | } | ||
| 71 | |||
| 72 | num_threads = std::min(num_threads, 8); | ||
| 73 | 3 | num_threads = std::min(num_threads, block_h); | |
| 74 | 3 | int rows_per_thread = (block_h + num_threads - 1) / num_threads; | |
| 75 | 3 | std::vector<std::thread> threads; | |
| 76 | |||
| 77 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 3 times.
|
10 | for (int tid = 0; tid < num_threads; ++tid) { |
| 78 | 7 | int start = tid * rows_per_thread; | |
| 79 |
1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
|
7 | int end = std::min(start + rows_per_thread, block_h); |
| 80 |
1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
|
7 | threads.emplace_back(FilterBlockRange, std::cref(input_block), std::ref(output_block), block_w, channels, start, |
| 81 | end); | ||
| 82 | } | ||
| 83 | |||
| 84 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 3 times.
|
10 | for (auto &t : threads) { |
| 85 |
1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
|
7 | t.join(); |
| 86 | } | ||
| 87 | 3 | } | |
| 88 | |||
| 89 | 4 | void ProcessOneBlock(int idx, int blocks_x, int width, int height, int channels, int block_size, | |
| 90 | const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output, int &output_offset) { | ||
| 91 | 4 | int bx = idx % blocks_x; | |
| 92 | 4 | int by = idx / blocks_x; | |
| 93 | |||
| 94 | 4 | int block_x = bx * block_size; | |
| 95 | 4 | int block_y = by * block_size; | |
| 96 | 4 | int block_w = std::min(block_size, width - block_x); | |
| 97 | 4 | int block_h = std::min(block_size, height - block_y); | |
| 98 | 4 | int padded_w = block_w + 2; | |
| 99 | |||
| 100 | 4 | size_t input_size = static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels); | |
| 101 | 4 | std::vector<uint8_t> input_block(input_size, 0); | |
| 102 | |||
| 103 | 4 | size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels); | |
| 104 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<uint8_t> output_block(output_size, 0); |
| 105 | |||
| 106 | 4 | CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w); | |
| 107 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | FilterBlock(input_block, output_block, block_w, block_h, channels); |
| 108 | |||
| 109 |
2/2✓ Branch 0 taken 26 times.
✓ Branch 1 taken 4 times.
|
30 | for (size_t i = 0; i < output_size; ++i) { |
| 110 | 26 | output[output_offset + i] = output_block[i]; | |
| 111 | } | ||
| 112 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | output_offset += static_cast<int>(output_size); |
| 113 | 4 | } | |
| 114 | |||
| 115 | 8 | void BroadcastImageData(int rank, int &width, int &height, int &channels, std::vector<uint8_t> &image_data, | |
| 116 | const InType &input) { | ||
| 117 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (rank == 0) { |
| 118 | 4 | width = std::get<0>(input); | |
| 119 | 4 | height = std::get<1>(input); | |
| 120 | 4 | channels = std::get<2>(input); | |
| 121 | 4 | image_data = std::get<4>(input); | |
| 122 | } | ||
| 123 | |||
| 124 | 8 | MPI_Bcast(&width, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 125 | 8 | MPI_Bcast(&height, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 126 | 8 | MPI_Bcast(&channels, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 127 | |||
| 128 | 8 | int data_size = static_cast<int>(image_data.size()); | |
| 129 | 8 | MPI_Bcast(&data_size, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 130 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (rank != 0) { |
| 131 | 4 | image_data.resize(data_size); | |
| 132 | } | ||
| 133 | 8 | MPI_Bcast(image_data.data(), data_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); | |
| 134 | 8 | } | |
| 135 | |||
/// Computes the contiguous range of block indices owned by `rank`.
///
/// Blocks 0..total_blocks-1 are split as evenly as possible: every rank gets
/// total_blocks / num_procs blocks and the first total_blocks % num_procs ranks get one more.
/// Ranks are served in ascending order, so rank r starts at r * per_proc + min(r, remainder).
///
/// The assignment depends only on (rank, num_procs, total_blocks), so it is derived locally
/// and no communication is performed. The previous version scattered the indices with
/// MPI_Scatterv, but ranks with no blocks returned before the call, so the collective was not
/// entered by every rank.
///
/// @param rank         rank of the calling process
/// @param num_procs    number of processes sharing the work
/// @param total_blocks number of blocks to distribute
/// @param local_blocks receives the block indices owned by `rank` (empty if none)
/// @param local_cnt    receives the number of indices stored in `local_blocks`
void ScatterBlocks(int rank, int num_procs, int total_blocks, std::vector<int> &local_blocks, int &local_cnt) {
  local_blocks.clear();
  local_cnt = 0;
  if (num_procs <= 0 || total_blocks <= 0 || rank < 0 || rank >= num_procs) {
    return;
  }

  const int per_proc = total_blocks / num_procs;
  const int rem = total_blocks % num_procs;
  local_cnt = per_proc + (rank < rem ? 1 : 0);

  const int first = (rank * per_proc) + std::min(rank, rem);
  local_blocks.resize(static_cast<size_t>(local_cnt));
  for (int i = 0; i < local_cnt; ++i) {
    local_blocks[static_cast<size_t>(i)] = first + i;
  }
}
| 165 | |||
| 166 | void ProcessBlockRange(const std::vector<int> &blocks, int start, int end, int blocks_x, int width, int height, | ||
| 167 | int channels, int block_size, const std::vector<uint8_t> &image_data, | ||
| 168 | std::vector<uint8_t> &output, int &output_offset) { | ||
| 169 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | for (int i = start; i < end; ++i) { |
| 170 | 4 | ProcessOneBlock(blocks[i], blocks_x, width, height, channels, block_size, image_data, output, output_offset); | |
| 171 | } | ||
| 172 | } | ||
| 173 | |||
| 174 | 4 | void ProcessAssignedBlocksSequential(const std::vector<int> &local_blocks, int blocks_x, int width, int height, | |
| 175 | int channels, int block_size, const std::vector<uint8_t> &image_data, | ||
| 176 | std::vector<uint8_t> &output) { | ||
| 177 | 4 | int local_cnt = static_cast<int>(local_blocks.size()); | |
| 178 | int total_bytes = 0; | ||
| 179 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | for (int i = 0; i < local_cnt; ++i) { |
| 180 | 4 | int idx = local_blocks[i]; | |
| 181 | 4 | int bx = idx % blocks_x; | |
| 182 | 4 | int by = idx / blocks_x; | |
| 183 | 4 | int block_x = bx * block_size; | |
| 184 | 4 | int block_y = by * block_size; | |
| 185 | 4 | int block_w = std::min(block_size, width - block_x); | |
| 186 | 4 | int block_h = std::min(block_size, height - block_y); | |
| 187 | 4 | total_bytes += block_w * block_h * channels; | |
| 188 | } | ||
| 189 | 4 | output.resize(total_bytes); | |
| 190 | 4 | int output_offset = 0; | |
| 191 | 4 | ProcessBlockRange(local_blocks, 0, local_cnt, blocks_x, width, height, channels, block_size, image_data, output, | |
| 192 | output_offset); | ||
| 193 | 4 | } | |
| 194 | |||
| 195 | ✗ | void ProcessBlocksInThread(int start, int blocks_in_thread, int blocks_x, int width, int height, int channels, | |
| 196 | int block_size, const std::vector<uint8_t> &image_data, const std::vector<int> &local_blocks, | ||
| 197 | std::vector<uint8_t> &local_output) { | ||
| 198 | int offset = 0; | ||
| 199 | ✗ | for (int i = start; i < start + blocks_in_thread; ++i) { | |
| 200 | ✗ | int idx = local_blocks[i]; | |
| 201 | ✗ | int bx = idx % blocks_x; | |
| 202 | ✗ | int by = idx / blocks_x; | |
| 203 | ✗ | int block_x = bx * block_size; | |
| 204 | ✗ | int block_y = by * block_size; | |
| 205 | ✗ | int block_w = std::min(block_size, width - block_x); | |
| 206 | ✗ | int block_h = std::min(block_size, height - block_y); | |
| 207 | ✗ | int padded_w = block_w + 2; | |
| 208 | |||
| 209 | ✗ | size_t input_size = | |
| 210 | ✗ | static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels); | |
| 211 | ✗ | std::vector<uint8_t> input_block(input_size, 0); | |
| 212 | ✗ | size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels); | |
| 213 | ✗ | std::vector<uint8_t> output_block(output_size, 0); | |
| 214 | |||
| 215 | ✗ | CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w); | |
| 216 | ✗ | FilterBlock(input_block, output_block, block_w, block_h, channels); | |
| 217 | |||
| 218 | ✗ | for (size_t j = 0; j < output_size; ++j) { | |
| 219 | ✗ | local_output[offset + j] = output_block[j]; | |
| 220 | } | ||
| 221 | ✗ | offset += static_cast<int>(output_size); | |
| 222 | } | ||
| 223 | ✗ | } | |
| 224 | |||
| 225 | ✗ | void ProcessAssignedBlocksParallel(const std::vector<int> &local_blocks, int blocks_x, int width, int height, | |
| 226 | int channels, int block_size, const std::vector<uint8_t> &image_data, | ||
| 227 | std::vector<uint8_t> &output) { | ||
| 228 | ✗ | int local_cnt = static_cast<int>(local_blocks.size()); | |
| 229 | ✗ | int num_threads = static_cast<int>(std::thread::hardware_concurrency()); | |
| 230 | num_threads = std::min(num_threads, 8); | ||
| 231 | ✗ | num_threads = std::min(num_threads, local_cnt); | |
| 232 | ✗ | int blocks_per_thread_base = local_cnt / num_threads; | |
| 233 | ✗ | int blocks_remainder = local_cnt % num_threads; | |
| 234 | |||
| 235 | ✗ | std::vector<std::vector<uint8_t>> thread_outputs(num_threads); | |
| 236 | ✗ | std::vector<std::thread> threads; | |
| 237 | |||
| 238 | ✗ | for (int tid = 0; tid < num_threads; ++tid) { | |
| 239 | ✗ | int blocks_in_thread = blocks_per_thread_base + (tid < blocks_remainder ? 1 : 0); | |
| 240 | ✗ | int start = (tid * blocks_per_thread_base) + std::min(tid, blocks_remainder); | |
| 241 | |||
| 242 | ✗ | threads.emplace_back([&, tid, start, blocks_in_thread]() { | |
| 243 | int bytes_in_thread = 0; | ||
| 244 | ✗ | for (int i = start; i < start + blocks_in_thread; ++i) { | |
| 245 | ✗ | int idx = local_blocks[i]; | |
| 246 | ✗ | int bx = idx % blocks_x; | |
| 247 | ✗ | int by = idx / blocks_x; | |
| 248 | ✗ | int block_x = bx * block_size; | |
| 249 | ✗ | int block_y = by * block_size; | |
| 250 | ✗ | int block_w = std::min(block_size, width - block_x); | |
| 251 | ✗ | int block_h = std::min(block_size, height - block_y); | |
| 252 | ✗ | bytes_in_thread += block_w * block_h * channels; | |
| 253 | } | ||
| 254 | |||
| 255 | ✗ | std::vector<uint8_t> local_output(bytes_in_thread); | |
| 256 | ✗ | ProcessBlocksInThread(start, blocks_in_thread, blocks_x, width, height, channels, block_size, image_data, | |
| 257 | local_blocks, local_output); | ||
| 258 | ✗ | thread_outputs[tid] = std::move(local_output); | |
| 259 | ✗ | }); | |
| 260 | } | ||
| 261 | |||
| 262 | ✗ | for (auto &t : threads) { | |
| 263 | ✗ | t.join(); | |
| 264 | } | ||
| 265 | |||
| 266 | int total_bytes = 0; | ||
| 267 | ✗ | for (const auto &to : thread_outputs) { | |
| 268 | ✗ | total_bytes += static_cast<int>(to.size()); | |
| 269 | } | ||
| 270 | ✗ | output.resize(total_bytes); | |
| 271 | int pos = 0; | ||
| 272 | ✗ | for (const auto &to : thread_outputs) { | |
| 273 | std::ranges::copy(to, output.begin() + pos); | ||
| 274 | ✗ | pos += static_cast<int>(to.size()); | |
| 275 | } | ||
| 276 | ✗ | } | |
| 277 | |||
| 278 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | void ProcessAssignedBlocks(const std::vector<int> &local_blocks, int blocks_x, int width, int height, int channels, |
| 279 | int block_size, const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output) { | ||
| 280 | 8 | int local_cnt = static_cast<int>(local_blocks.size()); | |
| 281 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (local_cnt == 0) { |
| 282 | output.clear(); | ||
| 283 | 4 | return; | |
| 284 | } | ||
| 285 | |||
| 286 | 4 | int num_threads = static_cast<int>(std::thread::hardware_concurrency()); | |
| 287 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (num_threads <= 1 || local_cnt < 2) { |
| 288 | 4 | ProcessAssignedBlocksSequential(local_blocks, blocks_x, width, height, channels, block_size, image_data, output); | |
| 289 | } else { | ||
| 290 | ✗ | ProcessAssignedBlocksParallel(local_blocks, blocks_x, width, height, channels, block_size, image_data, output); | |
| 291 | } | ||
| 292 | } | ||
| 293 | |||
| 294 | 8 | void GatherAndBroadcastResult(int rank, int num_procs, const std::vector<uint8_t> &output, OutType &out) { | |
| 295 | 8 | int send_count = static_cast<int>(output.size()); | |
| 296 | |||
| 297 |
1/2✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.
|
8 | std::vector<int> recv_counts(num_procs); |
| 298 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | MPI_Allgather(&send_count, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); |
| 299 | |||
| 300 |
1/4✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
8 | std::vector<int> displs(num_procs); |
| 301 | int total_bytes = 0; | ||
| 302 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 8 times.
|
24 | for (int i = 0; i < num_procs; ++i) { |
| 303 | 16 | displs[i] = total_bytes; | |
| 304 | 16 | total_bytes += recv_counts[i]; | |
| 305 | } | ||
| 306 | |||
| 307 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (rank == 0) { |
| 308 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | out.resize(total_bytes); |
| 309 | |||
| 310 |
1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
|
4 | if (send_count > 0) { |
| 311 | std::ranges::copy(output, out.begin()); | ||
| 312 | } | ||
| 313 | |||
| 314 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | for (int src = 1; src < num_procs; ++src) { |
| 315 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | if (recv_counts[src] > 0) { |
| 316 | ✗ | MPI_Recv(out.data() + displs[src], recv_counts[src], MPI_UNSIGNED_CHAR, src, 0, MPI_COMM_WORLD, | |
| 317 | MPI_STATUS_IGNORE); | ||
| 318 | } | ||
| 319 | } | ||
| 320 | } else { | ||
| 321 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
|
4 | if (send_count > 0) { |
| 322 | ✗ | MPI_Send(output.data(), send_count, MPI_UNSIGNED_CHAR, 0, 0, MPI_COMM_WORLD); | |
| 323 | } | ||
| 324 | } | ||
| 325 | |||
| 326 | 8 | int out_size = static_cast<int>(out.size()); | |
| 327 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | MPI_Bcast(&out_size, 1, MPI_INT, 0, MPI_COMM_WORLD); |
| 328 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (rank != 0) { |
| 329 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | out.resize(out_size); |
| 330 | } | ||
| 331 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | MPI_Bcast(out.data(), out_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); |
| 332 | 8 | } | |
| 333 | |||
| 334 | } // namespace | ||
| 335 | |||
| 336 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | MoskaevVLinFiltBlockGauss3ALL::MoskaevVLinFiltBlockGauss3ALL(const InType &in) { |
| 337 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 338 | GetInput() = in; | ||
| 339 | 8 | GetOutput() = OutType(); | |
| 340 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | MPI_Comm_rank(MPI_COMM_WORLD, &rank_); |
| 341 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | MPI_Comm_size(MPI_COMM_WORLD, &num_procs_); |
| 342 | 8 | } | |
| 343 | |||
| 344 | 8 | bool MoskaevVLinFiltBlockGauss3ALL::ValidationImpl() { | |
| 345 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | if (rank_ != 0) { |
| 346 | return true; | ||
| 347 | } | ||
| 348 | const auto &input = GetInput(); | ||
| 349 | const auto &data = std::get<4>(input); | ||
| 350 | 4 | return !data.empty(); | |
| 351 | } | ||
| 352 | |||
| 353 | 8 | bool MoskaevVLinFiltBlockGauss3ALL::PreProcessingImpl() { | |
| 354 | 8 | return true; | |
| 355 | } | ||
| 356 | |||
| 357 | 8 | bool MoskaevVLinFiltBlockGauss3ALL::PostProcessingImpl() { | |
| 358 | 8 | return !GetOutput().empty(); | |
| 359 | } | ||
| 360 | |||
| 361 | 8 | bool MoskaevVLinFiltBlockGauss3ALL::RunImpl() { | |
| 362 | 8 | int width = 0; | |
| 363 | 8 | int height = 0; | |
| 364 | 8 | int channels = 0; | |
| 365 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | std::vector<uint8_t> image_data; |
| 366 | |||
| 367 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | BroadcastImageData(rank_, width, height, channels, image_data, GetInput()); |
| 368 | |||
| 369 |
2/4✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.
|
8 | if (width == 0 || height == 0) { |
| 370 | return false; | ||
| 371 | } | ||
| 372 | |||
| 373 | 8 | int blocks_x = (width + block_size_ - 1) / block_size_; | |
| 374 | 8 | int blocks_y = (height + block_size_ - 1) / block_size_; | |
| 375 | 8 | int total_blocks = blocks_x * blocks_y; | |
| 376 | |||
| 377 |
1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
|
8 | if (total_blocks == 0) { |
| 378 | return false; | ||
| 379 | } | ||
| 380 | |||
| 381 | 8 | std::vector<int> local_blocks; | |
| 382 | 8 | int local_cnt = 0; | |
| 383 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | ScatterBlocks(rank_, num_procs_, total_blocks, local_blocks, local_cnt); |
| 384 | |||
| 385 | 8 | std::vector<uint8_t> output; | |
| 386 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | ProcessAssignedBlocks(local_blocks, blocks_x, width, height, channels, block_size_, image_data, output); |
| 387 | |||
| 388 |
1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
|
8 | GatherAndBroadcastResult(rank_, num_procs_, output, GetOutput()); |
| 389 | |||
| 390 | return true; | ||
| 391 | } | ||
| 392 | |||
| 393 | } // namespace moskaev_v_lin_filt_block_gauss_3 | ||
| 394 |