GCC Code Coverage Report

Directory:	./
File:	tasks/moskaev_v_lin_filt_block_gauss_3/all/src/ops_all.cpp
Date:	2026-06-04 20:25:32

	Exec	Total	Coverage
Lines:	168	225	74.7%
Functions:	15	18	83.3%
Branches:	89	174	51.1%

  
      Line
      Branch
      Exec
      Source
    
      #include "moskaev_v_lin_filt_block_gauss_3/all/include/ops_all.hpp"
    
      #include <mpi.h>
    
      #include <algorithm>
    
      #include <cmath>
    
      #include <cstddef>
    
      #include <cstdint>
    
      #include <functional>
    
      #include <thread>
    
      #include <utility>
    
      #include <vector>
    
      #include "moskaev_v_lin_filt_block_gauss_3/common/include/common.hpp"
    
      namespace moskaev_v_lin_filt_block_gauss_3 {
    
      namespace {
    
      4
      void CopyBlockWithHalo(const std::vector<uint8_t> &src, std::vector<uint8_t> &dst, int src_width, int src_height,
    
                             int channels, int block_x, int block_y, int block_w, int block_h, int padded_w) {
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 4 times.

      20
        for (int row = -1; row <= block_h; ++row) {
    
        2/2✓ Branch 0 taken 66 times.
✓ Branch 1 taken 16 times.

      82
          for (int col = -1; col <= block_w; ++col) {
    
      66
            int src_row = std::clamp(block_y + row, 0, src_height - 1);
    
      66
            int src_col = std::clamp(block_x + col, 0, src_width - 1);
    
      66
            int dst_row = row + 1;
    
      66
            int dst_col = col + 1;
    
        2/2✓ Branch 0 taken 98 times.
✓ Branch 1 taken 66 times.

      164
            for (int ch = 0; ch < channels; ++ch) {
    
      98
              size_t src_idx = ((static_cast<size_t>(src_row) * src_width + src_col) * channels) + ch;
    
      98
              size_t dst_idx = ((static_cast<size_t>(dst_row) * padded_w + dst_col) * channels) + ch;
    
      98
              dst[dst_idx] = src[src_idx];
    
            }
    
          }
    
        }
    
      4
      }
    
      26
      void FilterPixelInBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w,
    
                              int channels, int row, int col, int ch) {
    
        float sum = 0.0F;
    
        2/2✓ Branch 0 taken 78 times.
✓ Branch 1 taken 26 times.

      104
        for (int ky = -1; ky <= 1; ++ky) {
    
        2/2✓ Branch 0 taken 234 times.
✓ Branch 1 taken 78 times.

      312
          for (int kx = -1; kx <= 1; ++kx) {
    
      234
            int ny = row + 1 + ky;
    
      234
            int nx = col + 1 + kx;
    
      234
            size_t idx = ((static_cast<size_t>(ny) * (block_w + 2) + nx) * channels) + ch;
    
      234
            int kidx = ((ky + 1) * 3) + (kx + 1);
    
      234
            sum += static_cast<float>(input_block[idx]) * kGaussianKernel[kidx];
    
          }
    
        }
    
      26
        size_t out_idx = ((static_cast<size_t>(row) * block_w + col) * channels) + ch;
    
      26
        output_block[out_idx] = static_cast<uint8_t>(std::round(sum));
    
      26
      }
    
      8
      void FilterBlockRange(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w,
    
                            int channels, int start_row, int end_row) {
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 8 times.

      16
        for (int row = start_row; row < end_row; ++row) {
    
        2/2✓ Branch 0 taken 18 times.
✓ Branch 1 taken 8 times.

      26
          for (int col = 0; col < block_w; ++col) {
    
        2/2✓ Branch 0 taken 26 times.
✓ Branch 1 taken 18 times.

      44
            for (int ch = 0; ch < channels; ++ch) {
    
      26
              FilterPixelInBlock(input_block, output_block, block_w, channels, row, col, ch);
    
            }
    
          }
    
        }
    
      8
      }
    
      4
      void FilterBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w, int block_h,
    
                       int channels) {
    
      4
        int num_threads = static_cast<int>(std::thread::hardware_concurrency());
    
        3/4✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 3 times.

      4
        if (num_threads <= 1 || block_h < 2) {
    
      1
          FilterBlockRange(input_block, output_block, block_w, channels, 0, block_h);
    
      1
          return;
    
        }
    
        num_threads = std::min(num_threads, 8);
    
      3
        num_threads = std::min(num_threads, block_h);
    
      3
        int rows_per_thread = (block_h + num_threads - 1) / num_threads;
    
      3
        std::vector<std::thread> threads;
    
        2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 3 times.

      10
        for (int tid = 0; tid < num_threads; ++tid) {
    
      7
          int start = tid * rows_per_thread;
    
        1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.

      7
          int end = std::min(start + rows_per_thread, block_h);
    
        1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.

      7
          threads.emplace_back(FilterBlockRange, std::cref(input_block), std::ref(output_block), block_w, channels, start,
    
                               end);
    
        }
    
        2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 3 times.

      10
        for (auto &t : threads) {
    
        1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.

      7
          t.join();
    
        }
    
      3
      }
    
      4
      void ProcessOneBlock(int idx, int blocks_x, int width, int height, int channels, int block_size,
    
                           const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output, int &output_offset) {
    
      4
        int bx = idx % blocks_x;
    
      4
        int by = idx / blocks_x;
    
      4
        int block_x = bx * block_size;
    
      4
        int block_y = by * block_size;
    
      4
        int block_w = std::min(block_size, width - block_x);
    
      4
        int block_h = std::min(block_size, height - block_y);
    
      4
        int padded_w = block_w + 2;
    
      4
        size_t input_size = static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels);
    
      4
        std::vector<uint8_t> input_block(input_size, 0);
    
      4
        size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels);
    
        1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      4
        std::vector<uint8_t> output_block(output_size, 0);
    
      4
        CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w);
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        FilterBlock(input_block, output_block, block_w, block_h, channels);
    
        2/2✓ Branch 0 taken 26 times.
✓ Branch 1 taken 4 times.

      30
        for (size_t i = 0; i < output_size; ++i) {
    
      26
          output[output_offset + i] = output_block[i];
    
        }
    
        1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.

      4
        output_offset += static_cast<int>(output_size);
    
      4
      }
    
      8
      void BroadcastImageData(int rank, int &width, int &height, int &channels, std::vector<uint8_t> &image_data,
    
                              const InType &input) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        if (rank == 0) {
    
      4
          width = std::get<0>(input);
    
      4
          height = std::get<1>(input);
    
      4
          channels = std::get<2>(input);
    
      4
          image_data = std::get<4>(input);
    
        }
    
      8
        MPI_Bcast(&width, 1, MPI_INT, 0, MPI_COMM_WORLD);
    
      8
        MPI_Bcast(&height, 1, MPI_INT, 0, MPI_COMM_WORLD);
    
      8
        MPI_Bcast(&channels, 1, MPI_INT, 0, MPI_COMM_WORLD);
    
      8
        int data_size = static_cast<int>(image_data.size());
    
      8
        MPI_Bcast(&data_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        if (rank != 0) {
    
      4
          image_data.resize(data_size);
    
        }
    
      8
        MPI_Bcast(image_data.data(), data_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
    
      8
      }
    
      8
      void ScatterBlocks(int rank, int num_procs, int total_blocks, std::vector<int> &local_blocks, int &local_cnt) {
    
      8
        int per_proc = total_blocks / num_procs;
    
      8
        int rem = total_blocks % num_procs;
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        local_cnt = per_proc + (rank < rem ? 1 : 0);
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        if (local_cnt <= 0) {
    
          local_blocks.clear();
    
      4
          return;
    
        }
    
      4
        std::vector<int> all(total_blocks);
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        for (int i = 0; i < total_blocks; ++i) {
    
      4
          all[i] = i;
    
        }
    
        1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      4
        std::vector<int> counts(num_procs);
    
        1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      4
        std::vector<int> displs(num_procs);
    
        int off = 0;
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 4 times.

      12
        for (int proc = 0; proc < num_procs; ++proc) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
          int cnt = per_proc + (proc < rem ? 1 : 0);
    
      8
          counts[proc] = cnt;
    
      8
          displs[proc] = off;
    
      8
          off += cnt;
    
        }
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        local_blocks.resize(local_cnt);
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        MPI_Scatterv(all.data(), counts.data(), displs.data(), MPI_INT, local_blocks.data(), local_cnt, MPI_INT, 0,
    
                     MPI_COMM_WORLD);
    
      }
    
      void ProcessBlockRange(const std::vector<int> &blocks, int start, int end, int blocks_x, int width, int height,
    
                             int channels, int block_size, const std::vector<uint8_t> &image_data,
    
                             std::vector<uint8_t> &output, int &output_offset) {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        for (int i = start; i < end; ++i) {
    
      4
          ProcessOneBlock(blocks[i], blocks_x, width, height, channels, block_size, image_data, output, output_offset);
    
        }
    
      }
    
      4
      void ProcessAssignedBlocksSequential(const std::vector<int> &local_blocks, int blocks_x, int width, int height,
    
                                           int channels, int block_size, const std::vector<uint8_t> &image_data,
    
                                           std::vector<uint8_t> &output) {
    
      4
        int local_cnt = static_cast<int>(local_blocks.size());
    
        int total_bytes = 0;
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        for (int i = 0; i < local_cnt; ++i) {
    
      4
          int idx = local_blocks[i];
    
      4
          int bx = idx % blocks_x;
    
      4
          int by = idx / blocks_x;
    
      4
          int block_x = bx * block_size;
    
      4
          int block_y = by * block_size;
    
      4
          int block_w = std::min(block_size, width - block_x);
    
      4
          int block_h = std::min(block_size, height - block_y);
    
      4
          total_bytes += block_w * block_h * channels;
    
        }
    
      4
        output.resize(total_bytes);
    
      4
        int output_offset = 0;
    
      4
        ProcessBlockRange(local_blocks, 0, local_cnt, blocks_x, width, height, channels, block_size, image_data, output,
    
                          output_offset);
    
      4
      }
    
      ✗
      void ProcessBlocksInThread(int start, int blocks_in_thread, int blocks_x, int width, int height, int channels,
    
                                 int block_size, const std::vector<uint8_t> &image_data, const std::vector<int> &local_blocks,
    
                                 std::vector<uint8_t> &local_output) {
    
        int offset = 0;
    
      ✗
        for (int i = start; i < start + blocks_in_thread; ++i) {
    
      ✗
          int idx = local_blocks[i];
    
      ✗
          int bx = idx % blocks_x;
    
      ✗
          int by = idx / blocks_x;
    
      ✗
          int block_x = bx * block_size;
    
      ✗
          int block_y = by * block_size;
    
      ✗
          int block_w = std::min(block_size, width - block_x);
    
      ✗
          int block_h = std::min(block_size, height - block_y);
    
      ✗
          int padded_w = block_w + 2;
    
      ✗
          size_t input_size =
    
      ✗
              static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels);
    
      ✗
          std::vector<uint8_t> input_block(input_size, 0);
    
      ✗
          size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels);
    
      ✗
          std::vector<uint8_t> output_block(output_size, 0);
    
      ✗
          CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w);
    
      ✗
          FilterBlock(input_block, output_block, block_w, block_h, channels);
    
      ✗
          for (size_t j = 0; j < output_size; ++j) {
    
      ✗
            local_output[offset + j] = output_block[j];
    
          }
    
      ✗
          offset += static_cast<int>(output_size);
    
        }
    
      ✗
      }
    
      ✗
      void ProcessAssignedBlocksParallel(const std::vector<int> &local_blocks, int blocks_x, int width, int height,
    
                                         int channels, int block_size, const std::vector<uint8_t> &image_data,
    
                                         std::vector<uint8_t> &output) {
    
      ✗
        int local_cnt = static_cast<int>(local_blocks.size());
    
      ✗
        int num_threads = static_cast<int>(std::thread::hardware_concurrency());
    
        num_threads = std::min(num_threads, 8);
    
      ✗
        num_threads = std::min(num_threads, local_cnt);
    
      ✗
        int blocks_per_thread_base = local_cnt / num_threads;
    
      ✗
        int blocks_remainder = local_cnt % num_threads;
    
      ✗
        std::vector<std::vector<uint8_t>> thread_outputs(num_threads);
    
      ✗
        std::vector<std::thread> threads;
    
      ✗
        for (int tid = 0; tid < num_threads; ++tid) {
    
      ✗
          int blocks_in_thread = blocks_per_thread_base + (tid < blocks_remainder ? 1 : 0);
    
      ✗
          int start = (tid * blocks_per_thread_base) + std::min(tid, blocks_remainder);
    
      ✗
          threads.emplace_back([&, tid, start, blocks_in_thread]() {
    
            int bytes_in_thread = 0;
    
      ✗
            for (int i = start; i < start + blocks_in_thread; ++i) {
    
      ✗
              int idx = local_blocks[i];
    
      ✗
              int bx = idx % blocks_x;
    
      ✗
              int by = idx / blocks_x;
    
      ✗
              int block_x = bx * block_size;
    
      ✗
              int block_y = by * block_size;
    
      ✗
              int block_w = std::min(block_size, width - block_x);
    
      ✗
              int block_h = std::min(block_size, height - block_y);
    
      ✗
              bytes_in_thread += block_w * block_h * channels;
    
            }
    
      ✗
            std::vector<uint8_t> local_output(bytes_in_thread);
    
      ✗
            ProcessBlocksInThread(start, blocks_in_thread, blocks_x, width, height, channels, block_size, image_data,
    
                                  local_blocks, local_output);
    
      ✗
            thread_outputs[tid] = std::move(local_output);
    
      ✗
          });
    
        }
    
      ✗
        for (auto &t : threads) {
    
      ✗
          t.join();
    
        }
    
        int total_bytes = 0;
    
      ✗
        for (const auto &to : thread_outputs) {
    
      ✗
          total_bytes += static_cast<int>(to.size());
    
        }
    
      ✗
        output.resize(total_bytes);
    
        int pos = 0;
    
      ✗
        for (const auto &to : thread_outputs) {
    
          std::ranges::copy(to, output.begin() + pos);
    
      ✗
          pos += static_cast<int>(to.size());
    
        }
    
      ✗
      }
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
      void ProcessAssignedBlocks(const std::vector<int> &local_blocks, int blocks_x, int width, int height, int channels,
    
                                 int block_size, const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output) {
    
      8
        int local_cnt = static_cast<int>(local_blocks.size());
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        if (local_cnt == 0) {
    
          output.clear();
    
      4
          return;
    
        }
    
      4
        int num_threads = static_cast<int>(std::thread::hardware_concurrency());
    
        1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.

      4
        if (num_threads <= 1 || local_cnt < 2) {
    
      4
          ProcessAssignedBlocksSequential(local_blocks, blocks_x, width, height, channels, block_size, image_data, output);
    
        } else {
    
      ✗
          ProcessAssignedBlocksParallel(local_blocks, blocks_x, width, height, channels, block_size, image_data, output);
    
        }
    
      }
    
      8
      void GatherAndBroadcastResult(int rank, int num_procs, const std::vector<uint8_t> &output, OutType &out) {
    
      8
        int send_count = static_cast<int>(output.size());
    
        1/2✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.

      8
        std::vector<int> recv_counts(num_procs);
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        MPI_Allgather(&send_count, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, MPI_COMM_WORLD);
    
        1/4✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      8
        std::vector<int> displs(num_procs);
    
        int total_bytes = 0;
    
        2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 8 times.

      24
        for (int i = 0; i < num_procs; ++i) {
    
      16
          displs[i] = total_bytes;
    
      16
          total_bytes += recv_counts[i];
    
        }
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        if (rank == 0) {
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
          out.resize(total_bytes);
    
        1/2✓ Branch 0 taken 4 times.
✗ Branch 1 not taken.

      4
          if (send_count > 0) {
    
            std::ranges::copy(output, out.begin());
    
          }
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
          for (int src = 1; src < num_procs; ++src) {
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.

      4
            if (recv_counts[src] > 0) {
    
      ✗
              MPI_Recv(out.data() + displs[src], recv_counts[src], MPI_UNSIGNED_CHAR, src, 0, MPI_COMM_WORLD,
    
                       MPI_STATUS_IGNORE);
    
            }
    
          }
    
        } else {
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.

      4
          if (send_count > 0) {
    
      ✗
            MPI_Send(output.data(), send_count, MPI_UNSIGNED_CHAR, 0, 0, MPI_COMM_WORLD);
    
          }
    
        }
    
      8
        int out_size = static_cast<int>(out.size());
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        MPI_Bcast(&out_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        if (rank != 0) {
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
          out.resize(out_size);
    
        }
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        MPI_Bcast(out.data(), out_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
    
      8
      }
    
      }  // namespace
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
      MoskaevVLinFiltBlockGauss3ALL::MoskaevVLinFiltBlockGauss3ALL(const InType &in) {
    
        SetTypeOfTask(GetStaticTypeOfTask());
    
        GetInput() = in;
    
      8
        GetOutput() = OutType();
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        MPI_Comm_rank(MPI_COMM_WORLD, &rank_);
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        MPI_Comm_size(MPI_COMM_WORLD, &num_procs_);
    
      8
      }
    
      8
      bool MoskaevVLinFiltBlockGauss3ALL::ValidationImpl() {
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.

      8
        if (rank_ != 0) {
    
          return true;
    
        }
    
        const auto &input = GetInput();
    
        const auto &data = std::get<4>(input);
    
      4
        return !data.empty();
    
      }
    
      8
      bool MoskaevVLinFiltBlockGauss3ALL::PreProcessingImpl() {
    
      8
        return true;
    
      }
    
      8
      bool MoskaevVLinFiltBlockGauss3ALL::PostProcessingImpl() {
    
      8
        return !GetOutput().empty();
    
      }
    
      8
      bool MoskaevVLinFiltBlockGauss3ALL::RunImpl() {
    
      8
        int width = 0;
    
      8
        int height = 0;
    
      8
        int channels = 0;
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        std::vector<uint8_t> image_data;
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        BroadcastImageData(rank_, width, height, channels, image_data, GetInput());
    
        2/4✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 8 times.
✗ Branch 3 not taken.

      8
        if (width == 0 || height == 0) {
    
          return false;
    
        }
    
      8
        int blocks_x = (width + block_size_ - 1) / block_size_;
    
      8
        int blocks_y = (height + block_size_ - 1) / block_size_;
    
      8
        int total_blocks = blocks_x * blocks_y;
    
        1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.

      8
        if (total_blocks == 0) {
    
          return false;
    
        }
    
      8
        std::vector<int> local_blocks;
    
      8
        int local_cnt = 0;
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        ScatterBlocks(rank_, num_procs_, total_blocks, local_blocks, local_cnt);
    
      8
        std::vector<uint8_t> output;
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        ProcessAssignedBlocks(local_blocks, blocks_x, width, height, channels, block_size_, image_data, output);
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
        GatherAndBroadcastResult(rank_, num_procs_, output, GetOutput());
    
        return true;
    
      }
    
      }  // namespace moskaev_v_lin_filt_block_gauss_3

Line	Branch	Exec	Source
1			#include "moskaev_v_lin_filt_block_gauss_3/all/include/ops_all.hpp"
2
3			#include <mpi.h>
4
5			#include <algorithm>
6			#include <cmath>
7			#include <cstddef>
8			#include <cstdint>
9			#include <functional>
10			#include <thread>
11			#include <utility>
12			#include <vector>
13
14			#include "moskaev_v_lin_filt_block_gauss_3/common/include/common.hpp"
15
16			namespace moskaev_v_lin_filt_block_gauss_3 {
17
18			namespace {
19
20		4	void CopyBlockWithHalo(const std::vector<uint8_t> &src, std::vector<uint8_t> &dst, int src_width, int src_height,
21			int channels, int block_x, int block_y, int block_w, int block_h, int padded_w) {
22	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 4 times.	20	for (int row = -1; row <= block_h; ++row) {
23	2/2 ✓ Branch 0 taken 66 times. ✓ Branch 1 taken 16 times.	82	for (int col = -1; col <= block_w; ++col) {
24		66	int src_row = std::clamp(block_y + row, 0, src_height - 1);
25		66	int src_col = std::clamp(block_x + col, 0, src_width - 1);
26		66	int dst_row = row + 1;
27		66	int dst_col = col + 1;
28	2/2 ✓ Branch 0 taken 98 times. ✓ Branch 1 taken 66 times.	164	for (int ch = 0; ch < channels; ++ch) {
29		98	size_t src_idx = ((static_cast<size_t>(src_row) * src_width + src_col) * channels) + ch;
30		98	size_t dst_idx = ((static_cast<size_t>(dst_row) * padded_w + dst_col) * channels) + ch;
31		98	dst[dst_idx] = src[src_idx];
32			}
33			}
34			}
35		4	}
36
37		26	void FilterPixelInBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w,
38			int channels, int row, int col, int ch) {
39			float sum = 0.0F;
40	2/2 ✓ Branch 0 taken 78 times. ✓ Branch 1 taken 26 times.	104	for (int ky = -1; ky <= 1; ++ky) {
41	2/2 ✓ Branch 0 taken 234 times. ✓ Branch 1 taken 78 times.	312	for (int kx = -1; kx <= 1; ++kx) {
42		234	int ny = row + 1 + ky;
43		234	int nx = col + 1 + kx;
44		234	size_t idx = ((static_cast<size_t>(ny) * (block_w + 2) + nx) * channels) + ch;
45		234	int kidx = ((ky + 1) * 3) + (kx + 1);
46		234	sum += static_cast<float>(input_block[idx]) * kGaussianKernel[kidx];
47			}
48			}
49		26	size_t out_idx = ((static_cast<size_t>(row) * block_w + col) * channels) + ch;
50		26	output_block[out_idx] = static_cast<uint8_t>(std::round(sum));
51		26	}
52
53		8	void FilterBlockRange(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w,
54			int channels, int start_row, int end_row) {
55	2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 8 times.	16	for (int row = start_row; row < end_row; ++row) {
56	2/2 ✓ Branch 0 taken 18 times. ✓ Branch 1 taken 8 times.	26	for (int col = 0; col < block_w; ++col) {
57	2/2 ✓ Branch 0 taken 26 times. ✓ Branch 1 taken 18 times.	44	for (int ch = 0; ch < channels; ++ch) {
58		26	FilterPixelInBlock(input_block, output_block, block_w, channels, row, col, ch);
59			}
60			}
61			}
62		8	}
63
64		4	void FilterBlock(const std::vector<uint8_t> &input_block, std::vector<uint8_t> &output_block, int block_w, int block_h,
65			int channels) {
66		4	int num_threads = static_cast<int>(std::thread::hardware_concurrency());
67	3/4 ✓ Branch 0 taken 4 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 3 times.	4	if (num_threads <= 1 \|\| block_h < 2) {
68		1	FilterBlockRange(input_block, output_block, block_w, channels, 0, block_h);
69		1	return;
70			}
71
72			num_threads = std::min(num_threads, 8);
73		3	num_threads = std::min(num_threads, block_h);
74		3	int rows_per_thread = (block_h + num_threads - 1) / num_threads;
75		3	std::vector<std::thread> threads;
76
77	2/2 ✓ Branch 0 taken 7 times. ✓ Branch 1 taken 3 times.	10	for (int tid = 0; tid < num_threads; ++tid) {
78		7	int start = tid * rows_per_thread;
79	1/2 ✓ Branch 1 taken 7 times. ✗ Branch 2 not taken.	7	int end = std::min(start + rows_per_thread, block_h);
80	1/2 ✓ Branch 1 taken 7 times. ✗ Branch 2 not taken.	7	threads.emplace_back(FilterBlockRange, std::cref(input_block), std::ref(output_block), block_w, channels, start,
81			end);
82			}
83
84	2/2 ✓ Branch 0 taken 7 times. ✓ Branch 1 taken 3 times.	10	for (auto &t : threads) {
85	1/2 ✓ Branch 1 taken 7 times. ✗ Branch 2 not taken.	7	t.join();
86			}
87		3	}
88
89		4	void ProcessOneBlock(int idx, int blocks_x, int width, int height, int channels, int block_size,
90			const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output, int &output_offset) {
91		4	int bx = idx % blocks_x;
92		4	int by = idx / blocks_x;
93
94		4	int block_x = bx * block_size;
95		4	int block_y = by * block_size;
96		4	int block_w = std::min(block_size, width - block_x);
97		4	int block_h = std::min(block_size, height - block_y);
98		4	int padded_w = block_w + 2;
99
100		4	size_t input_size = static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels);
101		4	std::vector<uint8_t> input_block(input_size, 0);
102
103		4	size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels);
104	1/4 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	4	std::vector<uint8_t> output_block(output_size, 0);
105
106		4	CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w);
107	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	FilterBlock(input_block, output_block, block_w, block_h, channels);
108
109	2/2 ✓ Branch 0 taken 26 times. ✓ Branch 1 taken 4 times.	30	for (size_t i = 0; i < output_size; ++i) {
110		26	output[output_offset + i] = output_block[i];
111			}
112	1/2 ✓ Branch 0 taken 4 times. ✗ Branch 1 not taken.	4	output_offset += static_cast<int>(output_size);
113		4	}
114
115		8	void BroadcastImageData(int rank, int &width, int &height, int &channels, std::vector<uint8_t> &image_data,
116			const InType &input) {
117	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	if (rank == 0) {
118		4	width = std::get<0>(input);
119		4	height = std::get<1>(input);
120		4	channels = std::get<2>(input);
121		4	image_data = std::get<4>(input);
122			}
123
124		8	MPI_Bcast(&width, 1, MPI_INT, 0, MPI_COMM_WORLD);
125		8	MPI_Bcast(&height, 1, MPI_INT, 0, MPI_COMM_WORLD);
126		8	MPI_Bcast(&channels, 1, MPI_INT, 0, MPI_COMM_WORLD);
127
128		8	int data_size = static_cast<int>(image_data.size());
129		8	MPI_Bcast(&data_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
130	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	if (rank != 0) {
131		4	image_data.resize(data_size);
132			}
133		8	MPI_Bcast(image_data.data(), data_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
134		8	}
135
136		8	void ScatterBlocks(int rank, int num_procs, int total_blocks, std::vector<int> &local_blocks, int &local_cnt) {
137		8	int per_proc = total_blocks / num_procs;
138		8	int rem = total_blocks % num_procs;
139	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	local_cnt = per_proc + (rank < rem ? 1 : 0);
140
141	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	if (local_cnt <= 0) {
142			local_blocks.clear();
143		4	return;
144			}
145
146		4	std::vector<int> all(total_blocks);
147	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	for (int i = 0; i < total_blocks; ++i) {
148		4	all[i] = i;
149			}
150
151	1/4 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	4	std::vector<int> counts(num_procs);
152	1/4 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	4	std::vector<int> displs(num_procs);
153			int off = 0;
154	2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 4 times.	12	for (int proc = 0; proc < num_procs; ++proc) {
155	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	int cnt = per_proc + (proc < rem ? 1 : 0);
156		8	counts[proc] = cnt;
157		8	displs[proc] = off;
158		8	off += cnt;
159			}
160
161	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	local_blocks.resize(local_cnt);
162	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	MPI_Scatterv(all.data(), counts.data(), displs.data(), MPI_INT, local_blocks.data(), local_cnt, MPI_INT, 0,
163			MPI_COMM_WORLD);
164			}
165
166			void ProcessBlockRange(const std::vector<int> &blocks, int start, int end, int blocks_x, int width, int height,
167			int channels, int block_size, const std::vector<uint8_t> &image_data,
168			std::vector<uint8_t> &output, int &output_offset) {
169	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	for (int i = start; i < end; ++i) {
170		4	ProcessOneBlock(blocks[i], blocks_x, width, height, channels, block_size, image_data, output, output_offset);
171			}
172			}
173
174		4	void ProcessAssignedBlocksSequential(const std::vector<int> &local_blocks, int blocks_x, int width, int height,
175			int channels, int block_size, const std::vector<uint8_t> &image_data,
176			std::vector<uint8_t> &output) {
177		4	int local_cnt = static_cast<int>(local_blocks.size());
178			int total_bytes = 0;
179	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	for (int i = 0; i < local_cnt; ++i) {
180		4	int idx = local_blocks[i];
181		4	int bx = idx % blocks_x;
182		4	int by = idx / blocks_x;
183		4	int block_x = bx * block_size;
184		4	int block_y = by * block_size;
185		4	int block_w = std::min(block_size, width - block_x);
186		4	int block_h = std::min(block_size, height - block_y);
187		4	total_bytes += block_w * block_h * channels;
188			}
189		4	output.resize(total_bytes);
190		4	int output_offset = 0;
191		4	ProcessBlockRange(local_blocks, 0, local_cnt, blocks_x, width, height, channels, block_size, image_data, output,
192			output_offset);
193		4	}
194
195		✗	void ProcessBlocksInThread(int start, int blocks_in_thread, int blocks_x, int width, int height, int channels,
196			int block_size, const std::vector<uint8_t> &image_data, const std::vector<int> &local_blocks,
197			std::vector<uint8_t> &local_output) {
198			int offset = 0;
199		✗	for (int i = start; i < start + blocks_in_thread; ++i) {
200		✗	int idx = local_blocks[i];
201		✗	int bx = idx % blocks_x;
202		✗	int by = idx / blocks_x;
203		✗	int block_x = bx * block_size;
204		✗	int block_y = by * block_size;
205		✗	int block_w = std::min(block_size, width - block_x);
206		✗	int block_h = std::min(block_size, height - block_y);
207		✗	int padded_w = block_w + 2;
208
209		✗	size_t input_size =
210		✗	static_cast<size_t>(padded_w) * static_cast<size_t>(block_h + 2) * static_cast<size_t>(channels);
211		✗	std::vector<uint8_t> input_block(input_size, 0);
212		✗	size_t output_size = static_cast<size_t>(block_w) * static_cast<size_t>(block_h) * static_cast<size_t>(channels);
213		✗	std::vector<uint8_t> output_block(output_size, 0);
214
215		✗	CopyBlockWithHalo(image_data, input_block, width, height, channels, block_x, block_y, block_w, block_h, padded_w);
216		✗	FilterBlock(input_block, output_block, block_w, block_h, channels);
217
218		✗	for (size_t j = 0; j < output_size; ++j) {
219		✗	local_output[offset + j] = output_block[j];
220			}
221		✗	offset += static_cast<int>(output_size);
222			}
223		✗	}
224
225		✗	void ProcessAssignedBlocksParallel(const std::vector<int> &local_blocks, int blocks_x, int width, int height,
226			int channels, int block_size, const std::vector<uint8_t> &image_data,
227			std::vector<uint8_t> &output) {
228		✗	int local_cnt = static_cast<int>(local_blocks.size());
229		✗	int num_threads = static_cast<int>(std::thread::hardware_concurrency());
230			num_threads = std::min(num_threads, 8);
231		✗	num_threads = std::min(num_threads, local_cnt);
232		✗	int blocks_per_thread_base = local_cnt / num_threads;
233		✗	int blocks_remainder = local_cnt % num_threads;
234
235		✗	std::vector<std::vector<uint8_t>> thread_outputs(num_threads);
236		✗	std::vector<std::thread> threads;
237
238		✗	for (int tid = 0; tid < num_threads; ++tid) {
239		✗	int blocks_in_thread = blocks_per_thread_base + (tid < blocks_remainder ? 1 : 0);
240		✗	int start = (tid * blocks_per_thread_base) + std::min(tid, blocks_remainder);
241
242		✗	threads.emplace_back([&, tid, start, blocks_in_thread]() {
243			int bytes_in_thread = 0;
244		✗	for (int i = start; i < start + blocks_in_thread; ++i) {
245		✗	int idx = local_blocks[i];
246		✗	int bx = idx % blocks_x;
247		✗	int by = idx / blocks_x;
248		✗	int block_x = bx * block_size;
249		✗	int block_y = by * block_size;
250		✗	int block_w = std::min(block_size, width - block_x);
251		✗	int block_h = std::min(block_size, height - block_y);
252		✗	bytes_in_thread += block_w * block_h * channels;
253			}
254
255		✗	std::vector<uint8_t> local_output(bytes_in_thread);
256		✗	ProcessBlocksInThread(start, blocks_in_thread, blocks_x, width, height, channels, block_size, image_data,
257			local_blocks, local_output);
258		✗	thread_outputs[tid] = std::move(local_output);
259		✗	});
260			}
261
262		✗	for (auto &t : threads) {
263		✗	t.join();
264			}
265
266			int total_bytes = 0;
267		✗	for (const auto &to : thread_outputs) {
268		✗	total_bytes += static_cast<int>(to.size());
269			}
270		✗	output.resize(total_bytes);
271			int pos = 0;
272		✗	for (const auto &to : thread_outputs) {
273			std::ranges::copy(to, output.begin() + pos);
274		✗	pos += static_cast<int>(to.size());
275			}
276		✗	}
277
278	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	void ProcessAssignedBlocks(const std::vector<int> &local_blocks, int blocks_x, int width, int height, int channels,
279			int block_size, const std::vector<uint8_t> &image_data, std::vector<uint8_t> &output) {
280		8	int local_cnt = static_cast<int>(local_blocks.size());
281	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	if (local_cnt == 0) {
282			output.clear();
283		4	return;
284			}
285
286		4	int num_threads = static_cast<int>(std::thread::hardware_concurrency());
287	1/2 ✓ Branch 0 taken 4 times. ✗ Branch 1 not taken.	4	if (num_threads <= 1 \|\| local_cnt < 2) {
288		4	ProcessAssignedBlocksSequential(local_blocks, blocks_x, width, height, channels, block_size, image_data, output);
289			} else {
290		✗	ProcessAssignedBlocksParallel(local_blocks, blocks_x, width, height, channels, block_size, image_data, output);
291			}
292			}
293
294		8	void GatherAndBroadcastResult(int rank, int num_procs, const std::vector<uint8_t> &output, OutType &out) {
295		8	int send_count = static_cast<int>(output.size());
296
297	1/2 ✓ Branch 2 taken 8 times. ✗ Branch 3 not taken.	8	std::vector<int> recv_counts(num_procs);
298	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	MPI_Allgather(&send_count, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, MPI_COMM_WORLD);
299
300	1/4 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	8	std::vector<int> displs(num_procs);
301			int total_bytes = 0;
302	2/2 ✓ Branch 0 taken 16 times. ✓ Branch 1 taken 8 times.	24	for (int i = 0; i < num_procs; ++i) {
303		16	displs[i] = total_bytes;
304		16	total_bytes += recv_counts[i];
305			}
306
307	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	if (rank == 0) {
308	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	out.resize(total_bytes);
309
310	1/2 ✓ Branch 0 taken 4 times. ✗ Branch 1 not taken.	4	if (send_count > 0) {
311			std::ranges::copy(output, out.begin());
312			}
313
314	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	for (int src = 1; src < num_procs; ++src) {
315	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 4 times.	4	if (recv_counts[src] > 0) {
316		✗	MPI_Recv(out.data() + displs[src], recv_counts[src], MPI_UNSIGNED_CHAR, src, 0, MPI_COMM_WORLD,
317			MPI_STATUS_IGNORE);
318			}
319			}
320			} else {
321	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 4 times.	4	if (send_count > 0) {
322		✗	MPI_Send(output.data(), send_count, MPI_UNSIGNED_CHAR, 0, 0, MPI_COMM_WORLD);
323			}
324			}
325
326		8	int out_size = static_cast<int>(out.size());
327	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	MPI_Bcast(&out_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
328	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	if (rank != 0) {
329	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	out.resize(out_size);
330			}
331	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	MPI_Bcast(out.data(), out_size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
332		8	}
333
334			} // namespace
335
336	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	MoskaevVLinFiltBlockGauss3ALL::MoskaevVLinFiltBlockGauss3ALL(const InType &in) {
337			SetTypeOfTask(GetStaticTypeOfTask());
338			GetInput() = in;
339		8	GetOutput() = OutType();
340	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	MPI_Comm_rank(MPI_COMM_WORLD, &rank_);
341	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	MPI_Comm_size(MPI_COMM_WORLD, &num_procs_);
342		8	}
343
344		8	bool MoskaevVLinFiltBlockGauss3ALL::ValidationImpl() {
345	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times.	8	if (rank_ != 0) {
346			return true;
347			}
348			const auto &input = GetInput();
349			const auto &data = std::get<4>(input);
350		4	return !data.empty();
351			}
352
353		8	bool MoskaevVLinFiltBlockGauss3ALL::PreProcessingImpl() {
354		8	return true;
355			}
356
357		8	bool MoskaevVLinFiltBlockGauss3ALL::PostProcessingImpl() {
358		8	return !GetOutput().empty();
359			}
360
361		8	bool MoskaevVLinFiltBlockGauss3ALL::RunImpl() {
362		8	int width = 0;
363		8	int height = 0;
364		8	int channels = 0;
365	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	std::vector<uint8_t> image_data;
366
367	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	BroadcastImageData(rank_, width, height, channels, image_data, GetInput());
368
369	2/4 ✓ Branch 0 taken 8 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 8 times. ✗ Branch 3 not taken.	8	if (width == 0 \|\| height == 0) {
370			return false;
371			}
372
373		8	int blocks_x = (width + block_size_ - 1) / block_size_;
374		8	int blocks_y = (height + block_size_ - 1) / block_size_;
375		8	int total_blocks = blocks_x * blocks_y;
376
377	1/2 ✓ Branch 0 taken 8 times. ✗ Branch 1 not taken.	8	if (total_blocks == 0) {
378			return false;
379			}
380
381		8	std::vector<int> local_blocks;
382		8	int local_cnt = 0;
383	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	ScatterBlocks(rank_, num_procs_, total_blocks, local_blocks, local_cnt);
384
385		8	std::vector<uint8_t> output;
386	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	ProcessAssignedBlocks(local_blocks, blocks_x, width, height, channels, block_size_, image_data, output);
387
388	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	GatherAndBroadcastResult(rank_, num_procs_, output, GetOutput());
389
390			return true;
391			}
392
393			} // namespace moskaev_v_lin_filt_block_gauss_3
394