GCC Code Coverage Report


Directory: ./
File: tasks/moskaev_v_lin_filt_block_gauss_3/all/src/ops_all.cpp
Date: 2026-06-04 20:25:32
Summary (executed / total, coverage):
Lines: 168 / 225 (74.7%)
Functions: 15 / 18 (83.3%)
Branches: 89 / 174 (51.1%)

Line Branch Exec Source
1 #include "moskaev_v_lin_filt_block_gauss_3/all/include/ops_all.hpp"
2
3 #include <mpi.h>
4
5 #include <algorithm>
6 #include <cmath>
7 #include <cstddef>
8 #include <cstdint>
9 #include <functional>
10 #include <thread>
11 #include <utility>
12 #include <vector>
13
14 #include "moskaev_v_lin_filt_block_gauss_3/common/include/common.hpp"
15
16 namespace moskaev_v_lin_filt_block_gauss_3 {
17
18 namespace {
19
// Copies the block_w x block_h region whose top-left pixel is (block_x, block_y)
// from `src` into `dst`, together with a one-pixel border on every side.
//
// `src` is indexed as row-major pixels of `src_width` columns with `channels`
// interleaved bytes per pixel. `dst` is indexed the same way but with a row
// stride of `padded_w` pixels, and the block's own pixels start at
// destination row 1, column 1. The loops write rows 0..block_h+1 and columns
// 0..block_w+1 of `dst`, so `dst` must already be large enough for that;
// nothing here resizes or bounds-checks it.
20 4 void CopyBlockWithHalo(const std::vector<uint8_t> &src, std::vector<uint8_t> &dst, int src_width, int src_height,
21 int channels, int block_x, int block_y, int block_w, int block_h, int padded_w) {
22
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 4 times.
20 for (int row = -1; row <= block_h; ++row) {
23
2/2
✓ Branch 0 taken 66 times.
✓ Branch 1 taken 16 times.
82 for (int col = -1; col <= block_w; ++col) {
// Border coordinates that fall outside the image are clamped to the nearest
// valid row/column, so the edge pixel is repeated there.
24 66 int src_row = std::clamp(block_y + row, 0, src_height - 1);
25 66 int src_col = std::clamp(block_x + col, 0, src_width - 1);
// Shift by one so that row/col -1 land at destination index 0.
26 66 int dst_row = row + 1;
27 66 int dst_col = col + 1;
28
2/2
✓ Branch 0 taken 98 times.
✓ Branch 1 taken 66 times.
164 for (int ch = 0; ch < channels; ++ch) {
29 98 size_t src_idx = ((static_cast<size_t>(src_row) * src_width + src_col) * channels) + ch;
30 98 size_t dst_idx = ((static_cast<size_t>(dst_row) * padded_w + dst_col) * channels) + ch;
31 98 dst[dst_idx] = src[src_idx];
32 }
33 }
34 }
35 4 }
36
// Computes one output byte: the 3x3 weighted sum around pixel (row, col),
// channel `ch`, and stores it rounded in `output_block`.
//
// `input_block` is read with a row stride of (block_w + 2) pixels and a +1
// offset in both directions, i.e. as a block surrounded by a one-pixel border.
// `output_block` is written with a row stride of `block_w` pixels (no border).
// Weights come from kGaussianKernel, indexed row-major as (ky + 1) * 3 + (kx + 1);
// it is defined outside this file (presumably in common.hpp - confirm).
37 26 void FilterPixelInBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w,
38 int channels, int row, int col, int ch) {
39 float sum = 0.0F;
40
2/2
✓ Branch 0 taken 78 times.
✓ Branch 1 taken 26 times.
104 for (int ky = -1; ky <= 1; ++ky) {
41
2/2
✓ Branch 0 taken 234 times.
✓ Branch 1 taken 78 times.
312 for (int kx = -1; kx <= 1; ++kx) {
42 234 int ny = row + 1 + ky;
43 234 int nx = col + 1 + kx;
44 234 size_t idx = ((static_cast<size_t>(ny) * (block_w + 2) + nx) * channels) + ch;
45 234 int kidx = ((ky + 1) * 3) + (kx + 1);
46 234 sum += static_cast<float>(input_block[idx]) * kGaussianKernel[kidx];
47 }
48 }
49 26 size_t out_idx = ((static_cast<size_t>(row) * block_w + col) * channels) + ch;
// NOTE(review): the cast assumes the rounded sum fits in 0..255, which holds
// only if the kernel weights are non-negative and sum to at most 1 - confirm
// against the kGaussianKernel definition.
50 26 output_block[out_idx] = static_cast<uint8_t>(std::round(sum));
51 26 }
52
// Filters rows [start_row, end_row) of a block: calls FilterPixelInBlock for
// every column in [0, block_w) and every channel in [0, channels) of each row.
// An empty or inverted range (start_row >= end_row) does nothing.
53 8 void FilterBlockRange(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w,
54 int channels, int start_row, int end_row) {
55
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.
16 for (int row = start_row; row < end_row; ++row) {
56
2/2
✓ Branch 0 taken 18 times.
✓ Branch 1 taken 8 times.
26 for (int col = 0; col < block_w; ++col) {
57
2/2
✓ Branch 0 taken 26 times.
✓ Branch 1 taken 18 times.
44 for (int ch = 0; ch < channels; ++ch) {
58 26 FilterPixelInBlock(input_block, output_block, block_w, channels, row, col, ch);
59 }
60 }
61 }
62 8 }
63
// Filters a whole block of block_h rows, splitting the rows across
// std::thread workers when that is possible.
//
// Falls back to a single FilterBlockRange call on the calling thread when
// hardware_concurrency() reports 0 or 1, or when the block has fewer than
// two rows. Otherwise the worker count is capped at 8 and at block_h, each
// worker gets a contiguous chunk of rows, and all workers are joined before
// returning. The chunks do not overlap, so the workers write different rows
// of `output_block`.
64 4 void FilterBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w, int block_h,
65 int channels) {
66 4 int num_threads = static_cast<int>(std::thread::hardware_concurrency());
67
3/4
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 3 times.
4 if (num_threads <= 1 || block_h < 2) {
68 1 FilterBlockRange(input_block, output_block, block_w, channels, 0, block_h);
69 1 return;
70 }
71
72 num_threads = std::min(num_threads, 8);
73 3 num_threads = std::min(num_threads, block_h);
// Ceiling division: every chunk has rows_per_thread rows except possibly the
// last non-empty one.
74 3 int rows_per_thread = (block_h + num_threads - 1) / num_threads;
75 3 std::vector<std::thread> threads;
76
77
2/2
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 3 times.
10 for (int tid = 0; tid < num_threads; ++tid) {
// NOTE(review): because of the ceiling division, `start` can exceed block_h
// for trailing workers (e.g. block_h = 5 with 4 workers gives chunks of 2 and
// start = 6 for the last one). `end` is then below `start`, so that worker is
// still launched but filters no rows.
78 7 int start = tid * rows_per_thread;
79
1/2
✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
7 int end = std::min(start + rows_per_thread, block_h);
80
1/2
✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
7 threads.emplace_back(FilterBlockRange, std::cref(input_block), std::ref(output_block), block_w, channels, start,
81 end);
82 }
83
84
2/2
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 3 times.
10 for (auto &t : threads) {
85
1/2
✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
7 t.join();
86 }
87 3 }
88
// Filters the block with linear index `idx` and appends its bytes to `output`.
//
// `idx` is decoded as column idx % blocks_x and row idx / blocks_x of the
// block grid. The block is block_size x block_size pixels, clipped at the
// right and bottom image edges. The filtered bytes are written to `output`
// starting at `output_offset`, which is then advanced by the number of bytes
// written. `output` is not resized here, so the caller must have sized it.
89 4 void ProcessOneBlock(int idx, int blocks_x, int width, int height, int channels, int block_size,
90 const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output, int &output_offset) {
91 4 int bx = idx % blocks_x;
92 4 int by = idx / blocks_x;
93
94 4 int block_x = bx * block_size;
95 4 int block_y = by * block_size;
// Edge blocks may be narrower/shorter than block_size.
96 4 int block_w = std::min(block_size, width - block_x);
97 4 int block_h = std::min(block_size, height - block_y);
98 4 int padded_w = block_w + 2;
99
// Input buffer holds the block plus a one-pixel border: (block_w + 2) x (block_h + 2).
100 4 size_t input_size = static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels);
101 4 std::vector<uint8_t> input_block(input_size, 0);
102
103 4 size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels);
104
1/4
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
4 std::vector<uint8_t> output_block(output_size, 0);
105
106 4 CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w);
107
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 FilterBlock(input_block, output_block, block_w, block_h, channels);
108
109
2/2
✓ Branch 0 taken 26 times.
✓ Branch 1 taken 4 times.
30 for (size_t i = 0; i < output_size; ++i) {
110 26 output[output_offset + i] = output_block[i];
111 }
112
1/2
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
// NOTE(review): the running offset is an int while output_size is a size_t;
// the narrowing cast limits a rank's total output to what fits in int.
4 output_offset += static_cast<int>(output_size);
113 4 }
114
// Makes width, height, channels and the raw image bytes available on every
// rank of MPI_COMM_WORLD.
//
// Rank 0 reads them from `input` (tuple elements 0, 1, 2 and 4) and is the
// root of every broadcast. Other ranks receive the three integers, then the
// byte count, resize `image_data` to that count, and receive the bytes.
// Return codes of the MPI calls are not checked.
115 8 void BroadcastImageData(int rank, int &width, int &height, int &channels, std::vector<uint8_t> &image_data,
116 const InType &input) {
117
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (rank == 0) {
118 4 width = std::get<0>(input);
119 4 height = std::get<1>(input);
120 4 channels = std::get<2>(input);
121 4 image_data = std::get<4>(input);
122 }
123
124 8 MPI_Bcast(&width, 1, MPI_INT, 0, MPI_COMM_WORLD);
125 8 MPI_Bcast(&height, 1, MPI_INT, 0, MPI_COMM_WORLD);
126 8 MPI_Bcast(&channels, 1, MPI_INT, 0, MPI_COMM_WORLD);
127
// NOTE(review): the byte count is narrowed to int because it is used as an
// MPI count; an image larger than INT_MAX bytes would not be represented
// correctly here.
128 8 int data_size = static_cast<int>(image_data.size());
129 8 MPI_Bcast(&data_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
130
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (rank != 0) {
131 4 image_data.resize(data_size);
132 }
133 8 MPI_Bcast(image_data.data(), data_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
134 8 }
135
// Assigns each rank a contiguous range of block indices out of
// [0, total_blocks) and stores this rank's share in `local_blocks`.
//
// Every rank gets total_blocks / num_procs indices, and the first
// total_blocks % num_procs ranks get one extra. `local_cnt` receives the
// size of this rank's share. The indices themselves are delivered with
// MPI_Scatterv rooted at rank 0.
136 8 void ScatterBlocks(int rank, int num_procs, int total_blocks, std::vector<int> &local_blocks, int &local_cnt) {
137 8 int per_proc = total_blocks / num_procs;
138 8 int rem = total_blocks % num_procs;
139
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 local_cnt = per_proc + (rank < rem ? 1 : 0);
140
141
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (local_cnt <= 0) {
// NOTE(review): a rank with nothing to do returns here without calling
// MPI_Scatterv, while ranks with work do call it below. MPI_Scatterv is a
// collective operation, which MPI requires every rank of the communicator to
// call; skipping it on some ranks leaves the collective sequence mismatched.
// A receive count of 0 is allowed, so this early return can simply be dropped
// in the real source. Alternatively the scatter can be removed altogether:
// each rank can compute its own range as displs[rank] .. displs[rank] + local_cnt.
142 local_blocks.clear();
143 4 return;
144 }
145
// The send buffer, counts and displacements are built on every rank that
// reaches this point, although MPI_Scatterv only reads them on the root.
146 4 std::vector<int> all(total_blocks);
147
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 for (int i = 0; i < total_blocks; ++i) {
148 4 all[i] = i;
149 }
150
151
1/4
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
4 std::vector<int> counts(num_procs);
152
1/4
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
4 std::vector<int> displs(num_procs);
153 int off = 0;
154
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 4 times.
12 for (int proc = 0; proc < num_procs; ++proc) {
155
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 int cnt = per_proc + (proc < rem ? 1 : 0);
156 8 counts[proc] = cnt;
157 8 displs[proc] = off;
158 8 off += cnt;
159 }
160
161
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 local_blocks.resize(local_cnt);
162
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 MPI_Scatterv(all.data(), counts.data(), displs.data(), MPI_INT, local_blocks.data(), local_cnt, MPI_INT, 0,
163 MPI_COMM_WORLD);
164 }
165
// Runs ProcessOneBlock for blocks[start] .. blocks[end - 1] in order.
// Each call appends to `output` and advances `output_offset`, so the blocks'
// bytes end up back to back in the order they appear in `blocks`.
166 void ProcessBlockRange(const std::vector<int> &blocks, int start, int end, int blocks_x, int width, int height,
167 int channels, int block_size, const std::vector<uint8_t> &image_data,
168 std::vector<uint8_t> &output, int &output_offset) {
169
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 for (int i = start; i < end; ++i) {
170 4 ProcessOneBlock(blocks[i], blocks_x, width, height, channels, block_size, image_data, output, output_offset);
171 }
172 }
173
// Filters all of this rank's blocks one after another on the calling thread.
//
// A first pass over `local_blocks` adds up the byte size of every block
// (clipped width x clipped height x channels) so that `output` can be resized
// once. ProcessBlockRange then fills it starting at offset 0.
174 4 void ProcessAssignedBlocksSequential(const std::vector<int> &local_blocks, int blocks_x, int width, int height,
175 int channels, int block_size, const std::vector<uint8_t> &image_data,
176 std::vector<uint8_t> &output) {
177 4 int local_cnt = static_cast<int>(local_blocks.size());
178 int total_bytes = 0;
179
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 for (int i = 0; i < local_cnt; ++i) {
// Same index-to-geometry decoding that ProcessOneBlock performs.
180 4 int idx = local_blocks[i];
181 4 int bx = idx % blocks_x;
182 4 int by = idx / blocks_x;
183 4 int block_x = bx * block_size;
184 4 int block_y = by * block_size;
185 4 int block_w = std::min(block_size, width - block_x);
186 4 int block_h = std::min(block_size, height - block_y);
187 4 total_bytes += block_w * block_h * channels;
188 }
189 4 output.resize(total_bytes);
190 4 int output_offset = 0;
191 4 ProcessBlockRange(local_blocks, 0, local_cnt, blocks_x, width, height, channels, block_size, image_data, output,
192 output_offset);
193 4 }
194
// Filters local_blocks[start] .. local_blocks[start + blocks_in_thread - 1]
// and writes their bytes back to back into `local_output`, starting at 0.
// `local_output` is not resized here, so the caller must have sized it.
//
// NOTE(review): the loop body repeats the steps of ProcessOneBlock (decode
// index, clip block, copy with border, filter, append); the two could share
// one implementation. No line of this function carries an execution count in
// this report.
195 void ProcessBlocksInThread(int start, int blocks_in_thread, int blocks_x, int width, int height, int channels,
196 int block_size, const std::vector<uint8_t> &image_data, const std::vector<int> &local_blocks,
197 std::vector<uint8_t> &local_output) {
198 int offset = 0;
199 for (int i = start; i < start + blocks_in_thread; ++i) {
200 int idx = local_blocks[i];
201 int bx = idx % blocks_x;
202 int by = idx / blocks_x;
203 int block_x = bx * block_size;
204 int block_y = by * block_size;
205 int block_w = std::min(block_size, width - block_x);
206 int block_h = std::min(block_size, height - block_y);
207 int padded_w = block_w + 2;
208
209 size_t input_size =
210 static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels);
211 std::vector<uint8_t> input_block(input_size, 0);
212 size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels);
213 std::vector<uint8_t> output_block(output_size, 0);
214
215 CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w);
216 FilterBlock(input_block, output_block, block_w, block_h, channels);
217
218 for (size_t j = 0; j < output_size; ++j) {
219 local_output[offset + j] = output_block[j];
220 }
221 offset += static_cast<int>(output_size);
222 }
223 }
224
// Filters this rank's blocks with several std::thread workers.
//
// The worker count is hardware_concurrency() capped at 8 and at the number of
// blocks. Blocks are split into contiguous runs, the first `blocks_remainder`
// workers taking one extra block. Each worker sizes its own buffer, fills it
// through ProcessBlocksInThread and moves it into its slot of
// `thread_outputs`. After all workers are joined, the buffers are
// concatenated into `output` in worker order, which keeps the blocks in the
// order of `local_blocks`.
//
// NOTE(review): num_threads is used as a divisor without a zero check. The
// only caller visible in this file invokes this function after checking that
// its own hardware_concurrency() result is above 1 and that there are at least
// two blocks. Also, every worker ends up in FilterBlock, which may start up to
// 8 threads of its own. No line of this function carries an execution count
// in this report.
225 void ProcessAssignedBlocksParallel(const std::vector<int> &local_blocks, int blocks_x, int width, int height,
226 int channels, int block_size, const std::vector<uint8_t> &image_data,
227 std::vector<uint8_t> &output) {
228 int local_cnt = static_cast<int>(local_blocks.size());
229 int num_threads = static_cast<int>(std::thread::hardware_concurrency());
230 num_threads = std::min(num_threads, 8);
231 num_threads = std::min(num_threads, local_cnt);
232 int blocks_per_thread_base = local_cnt / num_threads;
233 int blocks_remainder = local_cnt % num_threads;
234
235 std::vector<std::vector<uint8_t>> thread_outputs(num_threads);
236 std::vector<std::thread> threads;
237
238 for (int tid = 0; tid < num_threads; ++tid) {
239 int blocks_in_thread = blocks_per_thread_base + (tid < blocks_remainder ? 1 : 0);
// Workers before `tid` hold blocks_per_thread_base blocks each, plus one extra
// for each of the first min(tid, blocks_remainder) of them.
240 int start = (tid * blocks_per_thread_base) + std::min(tid, blocks_remainder);
241
// The lambda captures the surrounding locals by reference; they stay alive
// because every thread is joined before this function returns.
242 threads.emplace_back([&, tid, start, blocks_in_thread]() {
243 int bytes_in_thread = 0;
244 for (int i = start; i < start + blocks_in_thread; ++i) {
245 int idx = local_blocks[i];
246 int bx = idx % blocks_x;
247 int by = idx / blocks_x;
248 int block_x = bx * block_size;
249 int block_y = by * block_size;
250 int block_w = std::min(block_size, width - block_x);
251 int block_h = std::min(block_size, height - block_y);
252 bytes_in_thread += block_w * block_h * channels;
253 }
254
255 std::vector<uint8_t> local_output(bytes_in_thread);
256 ProcessBlocksInThread(start, blocks_in_thread, blocks_x, width, height, channels, block_size, image_data,
257 local_blocks, local_output);
// Each worker writes only its own slot, indexed by tid.
258 thread_outputs[tid] = std::move(local_output);
259 });
260 }
261
262 for (auto &t : threads) {
263 t.join();
264 }
265
266 int total_bytes = 0;
267 for (const auto &to : thread_outputs) {
268 total_bytes += static_cast<int>(to.size());
269 }
270 output.resize(total_bytes);
271 int pos = 0;
272 for (const auto &to : thread_outputs) {
273 std::ranges::copy(to, output.begin() + pos);
274 pos += static_cast<int>(to.size());
275 }
276 }
277
// Filters this rank's blocks into `output`, choosing between the sequential
// and the multi-threaded implementation.
//
// With no blocks, `output` is cleared and nothing else happens. The
// sequential path is taken when hardware_concurrency() reports 0 or 1 or when
// there is only one block; otherwise the parallel path is used.
278
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 void ProcessAssignedBlocks(const std::vector<int> &local_blocks, int blocks_x, int width, int height, int channels,
279 int block_size, const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output) {
280 8 int local_cnt = static_cast<int>(local_blocks.size());
281
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (local_cnt == 0) {
282 output.clear();
283 4 return;
284 }
285
286 4 int num_threads = static_cast<int>(std::thread::hardware_concurrency());
287
1/2
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
4 if (num_threads <= 1 || local_cnt < 2) {
288 4 ProcessAssignedBlocksSequential(local_blocks, blocks_x, width, height, channels, block_size, image_data, output);
289 } else {
290 ProcessAssignedBlocksParallel(local_blocks, blocks_x, width, height, channels, block_size, image_data, output);
291 }
292 }
293
// Collects every rank's filtered bytes on rank 0 and then distributes the
// combined buffer to all ranks through `out`.
//
// Steps: (1) MPI_Allgather of the per-rank byte counts; (2) prefix sums give
// each rank's offset in the combined buffer; (3) rank 0 copies its own bytes
// to the front and receives the others with point-to-point MPI_Recv, while
// the other ranks MPI_Send theirs; ranks with zero bytes neither send nor are
// received from; (4) the total size and then the bytes are broadcast from
// rank 0, other ranks resizing `out` first.
//
// NOTE(review): the combined buffer is ordered by rank, and within a rank by
// block, each block's bytes being contiguous. It is not rearranged into
// row-major image order - confirm this is the layout consumers of `out` expect
// when the image spans more than one block.
294 8 void GatherAndBroadcastResult(int rank, int num_procs, const std::vector<uint8_t> &output, OutType &out) {
295 8 int send_count = static_cast<int>(output.size());
296
297
1/2
✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.
8 std::vector<int> recv_counts(num_procs);
298
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 MPI_Allgather(&send_count, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, MPI_COMM_WORLD);
299
300
1/4
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
8 std::vector<int> displs(num_procs);
301 int total_bytes = 0;
302
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 8 times.
24 for (int i = 0; i < num_procs; ++i) {
303 16 displs[i] = total_bytes;
304 16 total_bytes += recv_counts[i];
305 }
306
307
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (rank == 0) {
308
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 out.resize(total_bytes);
309
310
1/2
✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
4 if (send_count > 0) {
// Rank 0's own data goes at the start of `out`; displs[0] is always 0.
311 std::ranges::copy(output, out.begin());
312 }
313
314
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 for (int src = 1; src < num_procs; ++src) {
315
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
4 if (recv_counts[src] > 0) {
316 MPI_Recv(out.data() + displs[src], recv_counts[src], MPI_UNSIGNED_CHAR, src, 0, MPI_COMM_WORLD,
317 MPI_STATUS_IGNORE);
318 }
319 }
320 } else {
321
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
4 if (send_count > 0) {
322 MPI_Send(output.data(), send_count, MPI_UNSIGNED_CHAR, 0, 0, MPI_COMM_WORLD);
323 }
324 }
325
// On non-root ranks this value is overwritten by the broadcast that follows.
326 8 int out_size = static_cast<int>(out.size());
327
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 MPI_Bcast(&out_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
328
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (rank != 0) {
329
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 out.resize(out_size);
330 }
331
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 MPI_Bcast(out.data(), out_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
332 8 }
333
334 } // namespace
335
// Constructor: registers the task type, stores a copy of `in` as the task
// input, resets the output to a default-constructed OutType, and caches this
// process's rank and the process count of MPI_COMM_WORLD in rank_ and
// num_procs_.
336
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 MoskaevVLinFiltBlockGauss3ALL::MoskaevVLinFiltBlockGauss3ALL(const InType &in) {
337 SetTypeOfTask(GetStaticTypeOfTask());
338 GetInput() = in;
339 8 GetOutput() = OutType();
340
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 MPI_Comm_rank(MPI_COMM_WORLD, &rank_);
341
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 MPI_Comm_size(MPI_COMM_WORLD, &num_procs_);
342 8 }
343
// Validates the task input. Ranks other than 0 always report success; rank 0
// reports success when the byte vector stored in tuple element 4 of the input
// is non-empty.
//
// NOTE(review): the width, height and channel fields are not checked, nor is
// the byte count compared with width * height * channels, although the
// filtering code indexes the data with those dimensions.
344 8 bool MoskaevVLinFiltBlockGauss3ALL::ValidationImpl() {
345
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
8 if (rank_ != 0) {
346 return true;
347 }
348 const auto &input = GetInput();
349 const auto &data = std::get<4>(input);
350 4 return !data.empty();
351 }
352
// Pre-processing step: performs no work and always returns true.
353 8 bool MoskaevVLinFiltBlockGauss3ALL::PreProcessingImpl() {
354 8 return true;
355 }
356
// Post-processing step: returns true when the task output is non-empty.
357 8 bool MoskaevVLinFiltBlockGauss3ALL::PostProcessingImpl() {
358 8 return !GetOutput().empty();
359 }
360
// Runs the whole filter on every rank:
//   1. broadcast the image dimensions and bytes from rank 0;
//   2. divide the image into a grid of block_size_ x block_size_ blocks;
//   3. assign block indices to ranks (ScatterBlocks);
//   4. filter the local blocks (ProcessAssignedBlocks);
//   5. gather the results on rank 0 and broadcast them into the task output.
// Returns false when the broadcast width or height is 0 or when the grid has
// no blocks, and true otherwise.
361 8 bool MoskaevVLinFiltBlockGauss3ALL::RunImpl() {
362 8 int width = 0;
363 8 int height = 0;
364 8 int channels = 0;
365
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 std::vector<uint8_t> image_data;
366
367
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 BroadcastImageData(rank_, width, height, channels, image_data, GetInput());
368
369
2/4
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.
// NOTE(review): only exact zeros are rejected; negative dimensions and a zero
// channel count pass this check.
8 if (width == 0 || height == 0) {
370 return false;
371 }
372
// Ceiling division: a partial block at the right/bottom edge counts as a block.
// block_size_ is a member declared outside this file; it is assumed to be
// positive here - TODO confirm.
373 8 int blocks_x = (width + block_size_ - 1) / block_size_;
374 8 int blocks_y = (height + block_size_ - 1) / block_size_;
375 8 int total_blocks = blocks_x * blocks_y;
376
377
1/2
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
8 if (total_blocks == 0) {
378 return false;
379 }
380
381 8 std::vector<int> local_blocks;
382 8 int local_cnt = 0;
383
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 ScatterBlocks(rank_, num_procs_, total_blocks, local_blocks, local_cnt);
384
385 8 std::vector<uint8_t> output;
386
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 ProcessAssignedBlocks(local_blocks, blocks_x, width, height, channels, block_size_, image_data, output);
387
388
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 GatherAndBroadcastResult(rank_, num_procs_, output, GetOutput());
389
390 return true;
391 }
392
393 } // namespace moskaev_v_lin_filt_block_gauss_3
394