| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | | | #include "lukin_i_cannon_algorithm/mpi/include/ops_mpi.hpp" |
| 2 | | | |
| 3 | | | #include <mpi.h> |
| 4 | | | |
| 5 | | | #include <cmath> |
| 6 | | | #include <cstddef> |
| 7 | | | #include <tuple> |
| 8 | | | #include <utility> |
| 9 | | | #include <vector> |
| 10 | | | |
| 11 | | | #include "lukin_i_cannon_algorithm/common/include/common.hpp" |
| 12 | | | |
| 13 | | | namespace lukin_i_cannon_algorithm { |
| 14 | | | |
| 15 | 1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken. | 8 | LukinICannonAlgorithmMPI::LukinICannonAlgorithmMPI(const InType &in) { |
| 16 | | | SetTypeOfTask(GetStaticTypeOfTask()); |
| 17 | | | |
| 18 | | 8 | int rank = 0; |
| 19 | 1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken. | 8 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); |
| 20 | 2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times. | 8 | if (rank == 0) { |
| 21 | | | GetInput() = in; |
| 22 | | | } |
| 23 | | 8 | GetOutput() = OutType(); |
| 24 | | 8 | } |
| 25 | | | |
| 26 | | 8 | bool LukinICannonAlgorithmMPI::ValidationImpl() { |
| 27 | | 8 | int rank = 0; |
| 28 | | 8 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); |
| 29 | 2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times. | 8 | if (rank == 0) { |
| 30 | | 4 | int proc_count = 0; |
| 31 | | 4 | MPI_Comm_size(MPI_COMM_WORLD, &proc_count); |
| 32 | 1/2 ✓ Branch 0 taken 4 times. ✗ Branch 1 not taken. | 4 | int grid_size = static_cast<int>(std::floor(std::sqrt(proc_count))); |
| 33 | | 4 | int rsize_a = static_cast<int>(std::get<0>(GetInput()).size()); |
| 34 | | 4 | int rsize_b = static_cast<int>(std::get<1>(GetInput()).size()); |
| 35 | | 4 | size_ = std::get<2>(GetInput()); |
| 36 | 3/6 ✓ Branch 0 taken 4 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 4 times. ✗ Branch 3 not taken. ✓ Branch 4 taken 4 times. ✗ Branch 5 not taken. | 4 | return (rsize_a > 0) && (rsize_b > 0) && (rsize_a == size_ * size_) && (rsize_a == rsize_b) && |
| 37 | 1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 4 times. | 4 | (rsize_a % grid_size == 0); |
| 38 | | | } |
| 39 | | | return true; |
| 40 | | | } |
| 41 | | | |
| 42 | | 8 | bool LukinICannonAlgorithmMPI::PreProcessingImpl() { |
| 43 | | 8 | return true; |
| 44 | | | } |
| 45 | | | |
| 46 | | 8 | bool LukinICannonAlgorithmMPI::RunImpl() { |
| 47 | | 8 | int global_rank = -1; |
| 48 | | 8 | int proc_count = 0; |
| 49 | | 8 | MPI_Comm_rank(MPI_COMM_WORLD, &global_rank); |
| 50 | | 8 | MPI_Comm_size(MPI_COMM_WORLD, &proc_count); |
| 51 | | | |
| 52 | | 8 | MPI_Bcast(&size_, 1, MPI_INT, 0, MPI_COMM_WORLD); |
| 53 | | | |
| 54 | | | // case of a 1x1 process grid - just run the sequential version |
| 55 | 1/2 ✓ Branch 0 taken 8 times. ✗ Branch 1 not taken. | 8 | if (proc_count < 4) { |
| 56 | | 8 | return RunSeq(global_rank); |
| 57 | | | } |
| 58 | | | |
| 59 | | | // a separate communicator is created for the processes directly involved in the computation |
| 60 | | ✗ | return RunCannon(global_rank, proc_count); |
| 61 | | | } |
| 62 | | | |
| 63 | | 8 | bool LukinICannonAlgorithmMPI::PostProcessingImpl() { |
| 64 | | 8 | return true; |
| 65 | | | } |
| 66 | | | |
| 67 | | 4 | void LukinICannonAlgorithmMPI::MulNSum(const double *a, const double *b, double *c, int size) { |
| 68 | 2/2 ✓ Branch 0 taken 26 times. ✓ Branch 1 taken 4 times. | 30 | for (int i = 0; i < size; i++) { |
| 69 | 2/2 ✓ Branch 0 taken 228 times. ✓ Branch 1 taken 26 times. | 254 | for (int k = 0; k < size; k++) { |
| 70 | | 228 | double fixed = a[(i * size) + k]; |
| 71 | 2/2 ✓ Branch 0 taken 2312 times. ✓ Branch 1 taken 228 times. | 2540 | for (int j = 0; j < size; j++) { |
| 72 | | 2312 | c[(i * size) + j] += fixed * b[(k * size) + j]; |
| 73 | | | } |
| 74 | | | } |
| 75 | | | } |
| 76 | | 4 | } |
| 77 | | | |
| 78 | | 8 | bool LukinICannonAlgorithmMPI::RunSeq(int global_rank) { |
| 79 | | 8 | std::vector<double> c(static_cast<size_t>(size_ * size_)); |
| 80 | 2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 4 times. | 8 | if (global_rank == 0) { |
| 81 | | | double *a = std::get<0>(GetInput()).data(); |
| 82 | | | double *b = std::get<1>(GetInput()).data(); |
| 83 | | 4 | LukinICannonAlgorithmMPI::MulNSum(a, b, c.data(), size_); |
| 84 | | | } |
| 85 | 1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken. | 8 | MPI_Bcast(c.data(), size_ * size_, MPI_DOUBLE, 0, MPI_COMM_WORLD); |
| 86 | | | GetOutput() = std::move(c); |
| 87 | | 8 | return true; |
| 88 | | | } |
| 89 | | | |
| 90 | | ✗ | bool LukinICannonAlgorithmMPI::RunCannon(int global_rank, int proc_count) { |
| 91 | | ✗ | int grid_size = static_cast<int>(std::floor(std::sqrt(proc_count))); |
| 92 | | ✗ | int working_proc_count = grid_size * grid_size; |
| 93 | | ✗ | MPI_Comm mpi_comm_cannon = MPI_COMM_NULL; |
| 94 | | ✗ | int color = (global_rank < working_proc_count) ? 0 : MPI_UNDEFINED; |
| 95 | | ✗ | MPI_Comm_split(MPI_COMM_WORLD, color, global_rank, &mpi_comm_cannon); |
| 96 | | | |
| 97 | | ✗ | if (mpi_comm_cannon != MPI_COMM_NULL) { |
| 98 | | ✗ | CannonWorkers(mpi_comm_cannon, grid_size, working_proc_count); |
| 99 | | ✗ | MPI_Comm_free(&mpi_comm_cannon); |
| 100 | | | } else { |
| 101 | | ✗ | CannonNonWorkers(); |
| 102 | | | } |
| 103 | | | |
| 104 | | ✗ | return true; |
| 105 | | | } |
| 106 | | | |
| 107 | | ✗ | void LukinICannonAlgorithmMPI::CannonWorkers(MPI_Comm comm, int grid_size, int working_proc_count) { |
| 108 | | ✗ | int cannon_rank = -1; |
| 109 | | ✗ | MPI_Comm_rank(comm, &cannon_rank); |
| 110 | | | |
| 111 | | ✗ | int block_size = size_ / grid_size; |
| 112 | | ✗ | int block_elems = block_size * block_size; |
| 113 | | ✗ | std::vector<double> a_block(block_elems); |
| 114 | | ✗ | std::vector<double> b_block(block_elems); |
| 115 | | ✗ | std::vector<double> c_block(block_elems, 0); |
| 116 | | | |
| 117 | | ✗ | std::vector<double> a_blocks; |
| 118 | | ✗ | std::vector<double> b_blocks; |
| 119 | | | |
| 120 | | | // manual packing |
| 121 | | ✗ | if (cannon_rank == 0) { |
| 122 | | ✗ | a_blocks.resize(static_cast<size_t>(working_proc_count) * static_cast<size_t>(block_elems)); |
| 123 | | ✗ | b_blocks.resize(static_cast<size_t>(working_proc_count) * static_cast<size_t>(block_elems)); |
| 124 | | | |
| 125 | | ✗ | MatrixPack(a_blocks.data(), b_blocks.data(), working_proc_count, block_elems, block_size, grid_size); |
| 126 | | | } |
| 127 | | | |
| 128 | | ✗ | MPI_Scatter(a_blocks.data(), block_elems, MPI_DOUBLE, a_block.data(), block_elems, MPI_DOUBLE, 0, comm); |
| 129 | | | |
| 130 | | ✗ | MPI_Scatter(b_blocks.data(), block_elems, MPI_DOUBLE, b_block.data(), block_elems, MPI_DOUBLE, 0, comm); |
| 131 | | | |
| 132 | | ✗ | int row = cannon_rank / grid_size; |
| 133 | | ✗ | int col = cannon_rank % grid_size; |
| 134 | | | |
| 135 | | | // initial alignment shift |
| 136 | | ✗ | int left = (row * grid_size) + ((col - row + grid_size) % grid_size); |
| 137 | | ✗ | int right = (row * grid_size) + ((col + row) % grid_size); |
| 138 | | | |
| 139 | | ✗ | MPI_Sendrecv_replace(a_block.data(), block_elems, MPI_DOUBLE, left, 0, right, 0, comm, MPI_STATUS_IGNORE); |
| 140 | | | |
| 141 | | ✗ | int up = (((row - col + grid_size) % grid_size) * grid_size) + col; |
| 142 | | ✗ | int down = (((row + col) % grid_size) * grid_size) + col; |
| 143 | | | |
| 144 | | ✗ | MPI_Sendrecv_replace(b_block.data(), block_elems, MPI_DOUBLE, up, 0, down, 0, comm, MPI_STATUS_IGNORE); |
| 145 | | | |
| 146 | | | // multiply-and-shift loop |
| 147 | | ✗ | for (int iter = 0; iter < grid_size; iter++) { |
| 148 | | ✗ | LukinICannonAlgorithmMPI::MulNSum(a_block.data(), b_block.data(), c_block.data(), block_size); |
| 149 | | | |
| 150 | | ✗ | if (iter < grid_size - 1) { |
| 151 | | ✗ | left = (row * grid_size) + ((col - 1 + grid_size) % grid_size); |
| 152 | | ✗ | right = (row * grid_size) + ((col + 1) % grid_size); |
| 153 | | | |
| 154 | | ✗ | MPI_Sendrecv_replace(a_block.data(), block_elems, MPI_DOUBLE, left, 0, right, 0, comm, MPI_STATUS_IGNORE); |
| 155 | | | |
| 156 | | ✗ | up = (((row - 1 + grid_size) % grid_size) * grid_size) + col; |
| 157 | | ✗ | down = (((row + 1) % grid_size) * grid_size) + col; |
| 158 | | | |
| 159 | | ✗ | MPI_Sendrecv_replace(b_block.data(), block_elems, MPI_DOUBLE, up, 0, down, 0, comm, MPI_STATUS_IGNORE); |
| 160 | | | } |
| 161 | | | } |
| 162 | | | |
| 163 | | | // pack the data into the result matrix |
| 164 | | ✗ | std::vector<double> c_blocks(static_cast<size_t>(size_ * size_)); |
| 165 | | ✗ | MPI_Gather(c_block.data(), block_elems, MPI_DOUBLE, c_blocks.data(), block_elems, MPI_DOUBLE, 0, comm); |
| 166 | | | |
| 167 | | ✗ | std::vector<double> c(static_cast<size_t>(size_ * size_)); |
| 168 | | ✗ | if (cannon_rank == 0) { |
| 169 | | ✗ | MatrixUnpack(c.data(), c_blocks.data(), working_proc_count, block_elems, block_size, grid_size); |
| 170 | | | } |
| 171 | | | |
| 172 | | ✗ | MPI_Bcast(c.data(), size_ * size_, MPI_DOUBLE, 0, MPI_COMM_WORLD); |
| 173 | | | GetOutput() = std::move(c); |
| 174 | | ✗ | } |
| 175 | | | |
| 176 | | ✗ | void LukinICannonAlgorithmMPI::CannonNonWorkers() { |
| 177 | | ✗ | std::vector<double> c(static_cast<size_t>(size_ * size_)); |
| 178 | | ✗ | MPI_Bcast(c.data(), size_ * size_, MPI_DOUBLE, 0, MPI_COMM_WORLD); |
| 179 | | | GetOutput() = std::move(c); |
| 180 | | ✗ | } |
| 181 | | | |
| 182 | | ✗ | void LukinICannonAlgorithmMPI::MatrixPack(double *a_blocks, double *b_blocks, int working_proc_count, int block_elems, |
| 183 | | | int block_size, int grid_size) { |
| 184 | | | double *a = std::get<0>(GetInput()).data(); |
| 185 | | | double *b = std::get<1>(GetInput()).data(); |
| 186 | | | |
| 187 | | ✗ | for (int proc = 0; proc < working_proc_count; proc++) { |
| 188 | | ✗ | int proc_i = proc / grid_size; |
| 189 | | ✗ | int proc_j = proc % grid_size; |
| 190 | | ✗ | int buf_offset = proc * block_elems; |
| 191 | | | |
| 192 | | ✗ | for (int i = 0; i < block_size; i++) { |
| 193 | | ✗ | for (int j = 0; j < block_size; j++) { |
| 194 | | ✗ | int global_i = (proc_i * block_size) + i; |
| 195 | | ✗ | int global_j = (proc_j * block_size) + j; |
| 196 | | ✗ | int global_idx = (global_i * size_) + global_j; |
| 197 | | ✗ | int buf_idx = buf_offset + (i * block_size) + j; |
| 198 | | | |
| 199 | | ✗ | a_blocks[buf_idx] = a[global_idx]; |
| 200 | | ✗ | b_blocks[buf_idx] = b[global_idx]; |
| 201 | | | } |
| 202 | | | } |
| 203 | | | } |
| 204 | | ✗ | } |
| 205 | | | |
| 206 | | ✗ | void LukinICannonAlgorithmMPI::MatrixUnpack(double *c, const double *c_blocks, int working_proc_count, int block_elems, |
| 207 | | | int block_size, int grid_size) const { |
| 208 | | ✗ | for (int proc = 0; proc < working_proc_count; proc++) { |
| 209 | | ✗ | int proc_i = proc / grid_size; |
| 210 | | ✗ | int proc_j = proc % grid_size; |
| 211 | | ✗ | int buf_offset = proc * block_elems; |
| 212 | | | |
| 213 | | ✗ | for (int i = 0; i < block_size; i++) { |
| 214 | | ✗ | for (int j = 0; j < block_size; j++) { |
| 215 | | ✗ | int global_i = (proc_i * block_size) + i; |
| 216 | | ✗ | int global_j = (proc_j * block_size) + j; |
| 217 | | ✗ | int global_idx = (global_i * size_) + global_j; |
| 218 | | ✗ | int buf_idx = buf_offset + (i * block_size) + j; |
| 219 | | | |
| 220 | | ✗ | c[global_idx] = c_blocks[buf_idx]; |
| 221 | | | } |
| 222 | | | } |
| 223 | | | } |
| 224 | | ✗ | } |
| 225 | | | |
| 226 | | | } // namespace lukin_i_cannon_algorithm |
| 227 | | | |
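
The `CannonWorkers` path (source lines 90-174) was not executed in this run, so the Cannon alignment and shift arithmetic it relies on is uncovered. The following is a minimal standalone sketch of that same neighbor-index arithmetic; it is not part of the module, and the `grid_size` value, the loop over ranks, and the printed labels are illustrative assumptions only. It evaluates, without MPI, the destination rank of each worker's A and B blocks for the initial alignment and for one multiply-and-shift step.

```cpp
#include <cstdio>

int main() {
  // Hypothetical 2x2 worker grid: the smallest configuration (proc_count >= 4)
  // in which RunCannon would be taken instead of the sequential fallback.
  const int grid_size = 2;
  for (int rank = 0; rank < grid_size * grid_size; rank++) {
    int row = rank / grid_size;
    int col = rank % grid_size;
    // Initial alignment: block row i of A shifts i positions left,
    // block column j of B shifts j positions up (same formulas as lines 136 and 141).
    int a_dest = (row * grid_size) + ((col - row + grid_size) % grid_size);
    int b_dest = (((row - col + grid_size) % grid_size) * grid_size) + col;
    // Per-iteration step: A moves one position left, B one position up
    // (same formulas as lines 151 and 156).
    int a_step = (row * grid_size) + ((col - 1 + grid_size) % grid_size);
    int b_step = (((row - 1 + grid_size) % grid_size) * grid_size) + col;
    std::printf("rank %d: align A->%d, B->%d; step A->%d, B->%d\n",
                rank, a_dest, b_dest, a_step, b_step);
  }
  return 0;
}
```

For the 2x2 grid above, rank 3 (row 1, col 1) sends its A block to rank 2 and its B block to rank 1 during alignment, which matches the standard Cannon skew.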