| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "batushin_i_striped_matrix_multiplication/mpi/include/ops_mpi.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | |||
| 5 | #include <cstddef> | ||
| 6 | #include <cstdint> | ||
| 7 | #include <tuple> | ||
| 8 | #include <utility> | ||
| 9 | #include <vector> | ||
| 10 | |||
| 11 | #include "batushin_i_striped_matrix_multiplication/common/include/common.hpp" | ||
| 12 | |||
| 13 | namespace batushin_i_striped_matrix_multiplication { | ||
| 14 | |||
| 15 | 1/2 ✓ Branch 1 taken 30 times. ✗ Branch 2 not taken. | 30 | BatushinIStripedMatrixMultiplicationMPI::BatushinIStripedMatrixMultiplicationMPI(const InType &in) { |
| 16 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 17 | GetInput() = in; | ||
| 18 | 30 | } | |
| 19 | |||
| 20 | 60 | bool BatushinIStripedMatrixMultiplicationMPI::ValidationImpl() { | |
| 21 | const auto &input = GetInput(); | ||
| 22 | |||
| 23 | 60 | const size_t rows_a = std::get<0>(input); | |
| 24 | 60 | const size_t columns_a = std::get<1>(input); | |
| 25 | const auto &matrix_a = std::get<2>(input); | ||
| 26 | |||
| 27 | 60 | const size_t rows_b = std::get<3>(input); | |
| 28 | 60 | const size_t columns_b = std::get<4>(input); | |
| 29 | const auto &matrix_b = std::get<5>(input); | ||
| 30 | |||
| 31 | 2/4 ✓ Branch 0 taken 60 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 60 times. ✗ Branch 3 not taken. | 60 | if (rows_a == 0 \|\| columns_a == 0 \|\| rows_b == 0 \|\| columns_b == 0) { |
| 32 | return false; | ||
| 33 | } | ||
| 34 | |||
| 35 | 1/2 ✓ Branch 0 taken 60 times. ✗ Branch 1 not taken. | 60 | if (columns_a != rows_b) { |
| 36 | return false; | ||
| 37 | } | ||
| 38 | |||
| 39 | 1/2 ✓ Branch 0 taken 60 times. ✗ Branch 1 not taken. | 60 | if (matrix_a.size() != rows_a * columns_a) { |
| 40 | return false; | ||
| 41 | } | ||
| 42 | |||
| 43 | 1/2 ✓ Branch 0 taken 60 times. ✗ Branch 1 not taken. | 60 | if (matrix_b.size() != rows_b * columns_b) { |
| 44 | return false; | ||
| 45 | } | ||
| 46 | |||
| 47 | 60 | return GetOutput().empty(); | |
| 48 | } | ||
| 49 | |||
| 50 | 30 | bool BatushinIStripedMatrixMultiplicationMPI::PreProcessingImpl() { | |
| 51 | 30 | return ValidationImpl(); | |
| 52 | } | ||
| 53 | |||
| 54 | namespace { | ||
| 55 | |||
| 56 | enum class MPITag : std::uint8_t { kMatrixB = 101 }; | ||
| 57 | |||
| 58 | ✗ | std::vector<int> ComputeBlockSizes(int total, int num_procs) { | |
| 59 | ✗ | std::vector<int> sizes(num_procs, 0); | |
| 60 | ✗ | int base = total / num_procs; | |
| 61 | ✗ | int extra = total % num_procs; | |
| 62 | ✗ | for (int i = 0; i < num_procs; ++i) { | |
| 63 | ✗ | sizes[i] = base + (i < extra ? 1 : 0); | |
| 64 | } | ||
| 65 | ✗ | return sizes; | |
| 66 | } | ||
| 67 | |||
| 68 | ✗ | std::vector<int> ComputeBlockOffsets(const std::vector<int> &sizes) { | |
| 69 | ✗ | std::vector<int> offsets(sizes.size(), 0); | |
| 70 | ✗ | for (size_t i = 1; i < sizes.size(); ++i) { | |
| 71 | ✗ | offsets[i] = offsets[i - 1] + sizes[i - 1]; | |
| 72 | } | ||
| 73 | ✗ | return offsets; | |
| 74 | } | ||
| 75 | |||
| 76 | 30 | bool RunSequentialFallback(int rank, size_t rows_a, size_t cols_a, size_t cols_b, const std::vector<double> &matrix_a, | |
| 77 | const std::vector<double> &matrix_b, std::vector<double> &output) { | ||
| 78 | 2/2 ✓ Branch 0 taken 15 times. ✓ Branch 1 taken 15 times. | 30 | if (rank == 0) { |
| 79 | 15 | output.resize(rows_a * cols_b, 0.0); | |
| 80 | 2/2 ✓ Branch 0 taken 38 times. ✓ Branch 1 taken 15 times. | 53 | for (size_t i = 0; i < rows_a; ++i) { |
| 81 | 2/2 ✓ Branch 0 taken 94 times. ✓ Branch 1 taken 38 times. | 132 | for (size_t j = 0; j < cols_b; ++j) { |
| 82 | double sum = 0.0; | ||
| 83 | 2/2 ✓ Branch 0 taken 274 times. ✓ Branch 1 taken 94 times. | 368 | for (size_t k = 0; k < cols_a; ++k) { |
| 84 | 274 | sum += matrix_a[(i * cols_a) + k] * matrix_b[(k * cols_b) + j]; | |
| 85 | } | ||
| 86 | 94 | output[(i * cols_b) + j] = sum; | |
| 87 | } | ||
| 88 | } | ||
| 89 | } | ||
| 90 | |||
| 91 | 15 | int total_size = (rank == 0) ? static_cast<int>(output.size()) : 0; | |
| 92 | 30 | MPI_Bcast(&total_size, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 93 | 2/2 ✓ Branch 0 taken 15 times. ✓ Branch 1 taken 15 times. | 30 | if (rank != 0) { |
| 94 | 15 | output.resize(total_size); | |
| 95 | } | ||
| 96 | 1/2 ✓ Branch 0 taken 30 times. ✗ Branch 1 not taken. | 30 | if (total_size > 0) { |
| 97 | 30 | MPI_Bcast(output.data(), total_size, MPI_DOUBLE, 0, MPI_COMM_WORLD); | |
| 98 | } | ||
| 99 | 30 | return true; | |
| 100 | } | ||
| 101 | |||
| 102 | ✗ | void DistributeMatrixAFromRoot(int size, int m, const std::vector<double> &matrix_a, const std::vector<int> &row_counts, | |
| 103 | const std::vector<int> &row_displs, std::vector<double> &local_a) { | ||
| 104 | ✗ | std::vector<int> sendcounts_a(size); | |
| 105 | ✗ | std::vector<int> displs_a(size); | |
| 106 | ✗ | for (int i = 0; i < size; ++i) { | |
| 107 | ✗ | sendcounts_a[i] = row_counts[i] * m; | |
| 108 | ✗ | displs_a[i] = row_displs[i] * m; | |
| 109 | } | ||
| 110 | |||
| 111 | ✗ | int my_rows = (local_a.empty()) ? 0 : static_cast<int>(local_a.size()) / m; | |
| 112 | ✗ | if (my_rows > 0) { | |
| 113 | ✗ | MPI_Scatterv(matrix_a.data(), sendcounts_a.data(), displs_a.data(), MPI_DOUBLE, local_a.data(), my_rows * m, | |
| 114 | MPI_DOUBLE, 0, MPI_COMM_WORLD); | ||
| 115 | } else { | ||
| 116 | ✗ | MPI_Scatterv(matrix_a.data(), sendcounts_a.data(), displs_a.data(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, | |
| 117 | MPI_COMM_WORLD); | ||
| 118 | } | ||
| 119 | ✗ | } | |
| 120 | |||
| 121 | ✗ | std::tuple<std::vector<int>, std::vector<int>, std::vector<double>> DistributeMatrixA( | |
| 122 | int rank, int size, int n, int m, const std::vector<double> &matrix_a) { | ||
| 123 | ✗ | auto row_counts = ComputeBlockSizes(n, size); | |
| 124 | ✗ | auto row_displs = ComputeBlockOffsets(row_counts); | |
| 125 | ✗ | int my_rows = row_counts[rank]; | |
| 126 | |||
| 127 | ✗ | std::vector<double> local_a; | |
| 128 | ✗ | if (my_rows > 0) { | |
| 129 | ✗ | local_a.resize(static_cast<size_t>(my_rows) * static_cast<size_t>(m)); | |
| 130 | } | ||
| 131 | |||
| 132 | ✗ | DistributeMatrixAFromRoot(size, m, matrix_a, row_counts, row_displs, local_a); | |
| 133 | ✗ | return {row_counts, row_displs, local_a}; | |
| 134 | } | ||
| 135 | |||
| 136 | ✗ | std::vector<double> ExtractColumnBlock(int m, int p, int dest_col_start, int dest_col_count, | |
| 137 | const std::vector<double> &matrix_b) { | ||
| 138 | ✗ | std::vector<double> buf(static_cast<size_t>(m) * static_cast<size_t>(dest_col_count)); | |
| 139 | ✗ | for (int row = 0; row < m; ++row) { | |
| 140 | ✗ | for (int col = 0; col < dest_col_count; ++col) { | |
| 141 | ✗ | int global_col = dest_col_start + col; | |
| 142 | ✗ | buf[(static_cast<size_t>(row) * static_cast<size_t>(dest_col_count)) + static_cast<size_t>(col)] = | |
| 143 | ✗ | matrix_b[(static_cast<size_t>(row) * static_cast<size_t>(p)) + static_cast<size_t>(global_col)]; | |
| 144 | } | ||
| 145 | } | ||
| 146 | ✗ | return buf; | |
| 147 | } | ||
| 148 | |||
| 149 | ✗ | void DistributeMatrixBFromRoot(int size, int m, int p, const std::vector<double> &matrix_b, | |
| 150 | const std::vector<int> &col_counts, const std::vector<int> &col_displs, | ||
| 151 | std::vector<double> ¤t_b, int ¤t_cols) { | ||
| 152 | ✗ | for (int dest = 0; dest < size; ++dest) { | |
| 153 | ✗ | if (col_counts[dest] > 0) { | |
| 154 | ✗ | std::vector<double> buf = ExtractColumnBlock(m, p, col_displs[dest], col_counts[dest], matrix_b); | |
| 155 | ✗ | if (dest == 0) { | |
| 156 | current_b = std::move(buf); | ||
| 157 | ✗ | current_cols = col_counts[0]; | |
| 158 | } else { | ||
| 159 | ✗ | MPI_Send(buf.data(), static_cast<int>(buf.size()), MPI_DOUBLE, dest, 100, MPI_COMM_WORLD); | |
| 160 | } | ||
| 161 | } else { | ||
| 162 | ✗ | if (dest == 0) { | |
| 163 | ✗ | current_cols = 0; | |
| 164 | } else { | ||
| 165 | ✗ | MPI_Send(nullptr, 0, MPI_DOUBLE, dest, 100, MPI_COMM_WORLD); | |
| 166 | } | ||
| 167 | } | ||
| 168 | } | ||
| 169 | ✗ | } | |
| 170 | |||
| 171 | ✗ | std::tuple<std::vector<int>, std::vector<int>, std::vector<double>, int> DistributeMatrixB( | |
| 172 | int rank, int size, int m, int p, const std::vector<double> &matrix_b) { | ||
| 173 | ✗ | auto col_counts = ComputeBlockSizes(p, size); | |
| 174 | ✗ | auto col_displs = ComputeBlockOffsets(col_counts); | |
| 175 | |||
| 176 | ✗ | std::vector<double> current_b; | |
| 177 | ✗ | int current_cols = 0; | |
| 178 | |||
| 179 | ✗ | if (rank == 0) { | |
| 180 | ✗ | DistributeMatrixBFromRoot(size, m, p, matrix_b, col_counts, col_displs, current_b, current_cols); | |
| 181 | } else { | ||
| 182 | ✗ | if (col_counts[rank] > 0) { | |
| 183 | ✗ | current_b.resize(static_cast<size_t>(m) * static_cast<size_t>(col_counts[rank])); | |
| 184 | ✗ | MPI_Recv(current_b.data(), static_cast<int>(current_b.size()), MPI_DOUBLE, 0, 100, MPI_COMM_WORLD, | |
| 185 | MPI_STATUS_IGNORE); | ||
| 186 | ✗ | current_cols = col_counts[rank]; | |
| 187 | } else { | ||
| 188 | ✗ | MPI_Recv(nullptr, 0, MPI_DOUBLE, 0, 100, MPI_COMM_WORLD, MPI_STATUS_IGNORE); | |
| 189 | current_cols = 0; | ||
| 190 | } | ||
| 191 | } | ||
| 192 | |||
| 193 | ✗ | return {col_counts, col_displs, current_b, current_cols}; | |
| 194 | } | ||
| 195 | |||
| 196 | ✗ | void PerformLocalComputation(int my_rows, int current_cols, int m, int p, int stripe_offset, | |
| 197 | const std::vector<double> &local_a, const std::vector<double> ¤t_b, | ||
| 198 | std::vector<double> &local_c) { | ||
| 199 | ✗ | for (int i = 0; i < my_rows; ++i) { | |
| 200 | ✗ | for (int j = 0; j < current_cols; ++j) { | |
| 201 | double sum = 0.0; | ||
| 202 | ✗ | for (int k = 0; k < m; ++k) { | |
| 203 | ✗ | sum += local_a[(static_cast<size_t>(i) * static_cast<size_t>(m)) + static_cast<size_t>(k)] * | |
| 204 | ✗ | current_b[static_cast<size_t>(k) + (static_cast<size_t>(j) * static_cast<size_t>(m))]; | |
| 205 | } | ||
| 206 | ✗ | local_c[(static_cast<size_t>(i) * static_cast<size_t>(p)) + static_cast<size_t>(stripe_offset + j)] = sum; | |
| 207 | } | ||
| 208 | } | ||
| 209 | ✗ | } | |
| 210 | |||
| 211 | ✗ | std::pair<std::vector<double>, int> ShiftMatrixB(int rank, int size, int m, std::vector<double> ¤t_b, | |
| 212 | int current_cols) { | ||
| 213 | ✗ | int next = (rank + 1) % size; | |
| 214 | ✗ | int prev = (rank - 1 + size) % size; | |
| 215 | |||
| 216 | ✗ | int send_cols = current_cols; | |
| 217 | ✗ | int recv_cols = 0; | |
| 218 | ✗ | MPI_Sendrecv(&send_cols, 1, MPI_INT, next, 200, &recv_cols, 1, MPI_INT, prev, 200, MPI_COMM_WORLD, MPI_STATUS_IGNORE); | |
| 219 | |||
| 220 | ✗ | std::vector<double> recv_buffer; | |
| 221 | ✗ | if (recv_cols > 0) { | |
| 222 | ✗ | recv_buffer.resize(static_cast<size_t>(m) * static_cast<size_t>(recv_cols)); | |
| 223 | } | ||
| 224 | |||
| 225 | ✗ | int send_count = send_cols * m; | |
| 226 | ✗ | int recv_count = recv_cols * m; | |
| 227 | ✗ | const double *send_ptr = (send_count > 0 && !current_b.empty()) ? current_b.data() : nullptr; | |
| 228 | ✗ | double *recv_ptr = (recv_count > 0) ? recv_buffer.data() : nullptr; | |
| 229 | |||
| 230 | ✗ | MPI_Sendrecv(send_ptr, send_count, MPI_DOUBLE, next, 201, recv_ptr, recv_count, MPI_DOUBLE, prev, 201, MPI_COMM_WORLD, | |
| 231 | MPI_STATUS_IGNORE); | ||
| 232 | |||
| 233 | ✗ | if (recv_cols > 0) { | |
| 234 | return {std::move(recv_buffer), recv_cols}; | ||
| 235 | } | ||
| 236 | return {{}, 0}; | ||
| 237 | } | ||
| 238 | |||
| 239 | ✗ | std::vector<double> ComputeWithCyclicShift(int rank, int size, int m, int p, const std::vector<double> &local_a, | |
| 240 | std::vector<double> current_b, int current_cols, | ||
| 241 | const std::vector<int> &col_displs) { | ||
| 242 | ✗ | int my_rows = (local_a.empty()) ? 0 : static_cast<int>(local_a.size()) / m; | |
| 243 | ✗ | std::vector<double> local_c; | |
| 244 | ✗ | if (my_rows > 0) { | |
| 245 | ✗ | local_c.resize(static_cast<size_t>(my_rows) * static_cast<size_t>(p), 0.0); | |
| 246 | } | ||
| 247 | |||
| 248 | int stripe_owner = rank; | ||
| 249 | ✗ | for (int step = 0; step < size; ++step) { | |
| 250 | ✗ | if (my_rows > 0 && current_cols > 0 && !current_b.empty()) { | |
| 251 | ✗ | int stripe_offset = (static_cast<size_t>(stripe_owner) < col_displs.size()) ? col_displs[stripe_owner] : 0; | |
| 252 | ✗ | PerformLocalComputation(my_rows, current_cols, m, p, stripe_offset, local_a, current_b, local_c); | |
| 253 | } | ||
| 254 | |||
| 255 | ✗ | if (step == size - 1) { | |
| 256 | break; | ||
| 257 | } | ||
| 258 | |||
| 259 | ✗ | auto [new_b, new_cols] = ShiftMatrixB(rank, size, m, current_b, current_cols); | |
| 260 | current_b = std::move(new_b); | ||
| 261 | ✗ | current_cols = new_cols; | |
| 262 | ✗ | stripe_owner = (stripe_owner - 1 + size) % size; | |
| 263 | } | ||
| 264 | |||
| 265 | ✗ | return local_c; | |
| 266 | } | ||
| 267 | |||
| 268 | ✗ | void BroadcastFinalResult(int rank, int size, int n, int p, const std::vector<int> &row_counts, | |
| 269 | const std::vector<int> &row_displs, const std::vector<double> &local_c, | ||
| 270 | std::vector<double> &result) { | ||
| 271 | ✗ | std::vector<int> result_counts(size); | |
| 272 | ✗ | std::vector<int> result_displs(size); | |
| 273 | ✗ | for (int i = 0; i < size; ++i) { | |
| 274 | ✗ | result_counts[i] = row_counts[i] * p; | |
| 275 | ✗ | result_displs[i] = row_displs[i] * p; | |
| 276 | } | ||
| 277 | |||
| 278 | ✗ | if (rank == 0) { | |
| 279 | ✗ | result.resize(static_cast<size_t>(n) * static_cast<size_t>(p)); | |
| 280 | } | ||
| 281 | |||
| 282 | ✗ | int my_rows = (local_c.empty()) ? 0 : static_cast<int>(local_c.size()) / p; | |
| 283 | ✗ | int local_result_elements = my_rows * p; | |
| 284 | ✗ | const double *local_result_ptr = (my_rows > 0) ? local_c.data() : nullptr; | |
| 285 | |||
| 286 | ✗ | MPI_Gatherv(local_result_ptr, local_result_elements, MPI_DOUBLE, result.data(), result_counts.data(), | |
| 287 | result_displs.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD); | ||
| 288 | |||
| 289 | ✗ | int total_size = n * p; | |
| 290 | ✗ | if (rank != 0) { | |
| 291 | ✗ | result.resize(static_cast<size_t>(total_size)); | |
| 292 | } | ||
| 293 | ✗ | MPI_Bcast(result.data(), total_size, MPI_DOUBLE, 0, MPI_COMM_WORLD); | |
| 294 | ✗ | } | |
| 295 | |||
| 296 | ✗ | bool RunStripedScheme(int rank, int size, size_t rows_a, size_t cols_a, size_t cols_b, | |
| 297 | const std::vector<double> &matrix_a, const std::vector<double> &matrix_b, | ||
| 298 | std::vector<double> &output) { | ||
| 299 | ✗ | const int n = static_cast<int>(rows_a); | |
| 300 | ✗ | const int m = static_cast<int>(cols_a); | |
| 301 | ✗ | const int p = static_cast<int>(cols_b); | |
| 302 | |||
| 303 | ✗ | auto [row_counts, row_displs, local_a] = DistributeMatrixA(rank, size, n, m, matrix_a); | |
| 304 | ✗ | auto [col_counts, col_displs, current_b, current_cols] = DistributeMatrixB(rank, size, m, p, matrix_b); | |
| 305 | ✗ | auto local_c = ComputeWithCyclicShift(rank, size, m, p, local_a, current_b, current_cols, col_displs); | |
| 306 | |||
| 307 | ✗ | BroadcastFinalResult(rank, size, n, p, row_counts, row_displs, local_c, output); | |
| 308 | ✗ | return true; | |
| 309 | } | ||
| 310 | |||
| 311 | } // namespace | ||
| 312 | |||
| 313 | 30 | bool BatushinIStripedMatrixMultiplicationMPI::RunImpl() { | |
| 314 | 30 | int rank = 0; | |
| 315 | 30 | int size = 0; | |
| 316 | 30 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 317 | 30 | MPI_Comm_size(MPI_COMM_WORLD, &size); | |
| 318 | |||
| 319 | const auto &input = GetInput(); | ||
| 320 | 30 | const size_t rows_a = std::get<0>(input); | |
| 321 | 1/2 ✓ Branch 0 taken 30 times. ✗ Branch 1 not taken. | 30 | const size_t cols_a = std::get<1>(input); |
| 322 | const auto &matrix_a = std::get<2>(input); | ||
| 323 | 30 | const size_t cols_b = std::get<4>(input); | |
| 324 | const auto &matrix_b = std::get<5>(input); | ||
| 325 | |||
| 326 | 30 | std::vector<double> output; | |
| 327 | |||
| 328 | 2/4 ✓ Branch 0 taken 30 times. ✗ Branch 1 not taken. ✗ Branch 2 not taken. ✓ Branch 3 taken 26 times. | 56 | if (std::cmp_greater(size, rows_a) \|\| std::cmp_greater(size, cols_b) \|\| size <= 4) { |
| 329 | 1/2 ✓ Branch 1 taken 30 times. ✗ Branch 2 not taken. | 30 | RunSequentialFallback(rank, rows_a, cols_a, cols_b, matrix_a, matrix_b, output); |
| 330 | } else { | ||
| 331 | ✗ | RunStripedScheme(rank, size, rows_a, cols_a, cols_b, matrix_a, matrix_b, output); | |
| 332 | } | ||
| 333 | |||
| 334 | GetOutput() = std::move(output); | ||
| 335 | 30 | return true; | |
| 336 | } | ||
| 337 | |||
| 338 | 30 | bool BatushinIStripedMatrixMultiplicationMPI::PostProcessingImpl() { | |
| 339 | 30 | return !GetOutput().empty(); | |
| 340 | } | ||
| 341 | |||
| 342 | } // namespace batushin_i_striped_matrix_multiplication | ||
| 343 |
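
The uncovered striped scheme in this report partitions the rows of A and the columns of B across MPI ranks using `ComputeBlockSizes` and `ComputeBlockOffsets`. A minimal standalone sketch of that partitioning logic is shown below; the two helper functions mirror the covered source above, while the `main()` driver and the example sizes are illustrative assumptions only and are not part of the instrumented code.

```cpp
// Illustrative sketch: block partitioning as done by ComputeBlockSizes /
// ComputeBlockOffsets in the coverage report above. No MPI required here;
// the main() driver and example values are hypothetical.
#include <cstdio>
#include <vector>

std::vector<int> ComputeBlockSizes(int total, int num_procs) {
  std::vector<int> sizes(num_procs, 0);
  int base = total / num_procs;
  int extra = total % num_procs;  // the first `extra` ranks receive one extra element
  for (int i = 0; i < num_procs; ++i) {
    sizes[i] = base + (i < extra ? 1 : 0);
  }
  return sizes;
}

std::vector<int> ComputeBlockOffsets(const std::vector<int> &sizes) {
  std::vector<int> offsets(sizes.size(), 0);
  for (size_t i = 1; i < sizes.size(); ++i) {
    offsets[i] = offsets[i - 1] + sizes[i - 1];  // exclusive prefix sum of block sizes
  }
  return offsets;
}

int main() {
  // Example: 10 rows of A split across 4 ranks -> sizes {3, 3, 2, 2}, offsets {0, 3, 6, 8}.
  auto sizes = ComputeBlockSizes(10, 4);
  auto offsets = ComputeBlockOffsets(sizes);
  for (size_t r = 0; r < sizes.size(); ++r) {
    std::printf("rank %zu: rows [%d, %d)\n", r, offsets[r], offsets[r] + sizes[r]);
  }
  return 0;
}
```

The same pair of helpers drives both `DistributeMatrixA` (row stripes scattered with `MPI_Scatterv`) and `DistributeMatrixB` (column stripes sent per rank and rotated by `ShiftMatrixB`), which is why the sizes and offsets appear twice in the report.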