GCC Code Coverage Report


Directory: ./
File: tasks/batushin_i_striped_matrix_multiplication/mpi/src/ops_mpi.cpp
Date: 2026-01-27 01:59:34
            Exec   Total   Coverage
Lines:        43     171      25.1%
Functions:     6      18      33.3%
Branches:     21     166      12.7%

Line Branch Exec Source
1 #include "batushin_i_striped_matrix_multiplication/mpi/include/ops_mpi.hpp"
2
3 #include <mpi.h>
4
5 #include <cstddef>
6 #include <cstdint>
7 #include <tuple>
8 #include <utility>
9 #include <vector>
10
11 #include "batushin_i_striped_matrix_multiplication/common/include/common.hpp"
12
13 namespace batushin_i_striped_matrix_multiplication {
14
15
1/2
✓ Branch 1 taken 30 times.
✗ Branch 2 not taken.
30 BatushinIStripedMatrixMultiplicationMPI::BatushinIStripedMatrixMultiplicationMPI(const InType &in) {
16 SetTypeOfTask(GetStaticTypeOfTask());
17 GetInput() = in;
18 30 }
19
20 60 bool BatushinIStripedMatrixMultiplicationMPI::ValidationImpl() {
21 const auto &input = GetInput();
22
23 60 const size_t rows_a = std::get<0>(input);
24 60 const size_t columns_a = std::get<1>(input);
25 const auto &matrix_a = std::get<2>(input);
26
27 60 const size_t rows_b = std::get<3>(input);
28 60 const size_t columns_b = std::get<4>(input);
29 const auto &matrix_b = std::get<5>(input);
30
31
2/4
✓ Branch 0 taken 60 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 60 times.
✗ Branch 3 not taken.
60 if (rows_a == 0 || columns_a == 0 || rows_b == 0 || columns_b == 0) {
32 return false;
33 }
34
35
1/2
✓ Branch 0 taken 60 times.
✗ Branch 1 not taken.
60 if (columns_a != rows_b) {
36 return false;
37 }
38
39
1/2
✓ Branch 0 taken 60 times.
✗ Branch 1 not taken.
60 if (matrix_a.size() != rows_a * columns_a) {
40 return false;
41 }
42
43
1/2
✓ Branch 0 taken 60 times.
✗ Branch 1 not taken.
60 if (matrix_b.size() != rows_b * columns_b) {
44 return false;
45 }
46
47 60 return GetOutput().empty();
48 }
49
50 30 bool BatushinIStripedMatrixMultiplicationMPI::PreProcessingImpl() {
51 30 return ValidationImpl();
52 }
53
54 namespace {
55
56 enum class MPITag : std::uint8_t { kMatrixB = 101 };
57
58 std::vector<int> ComputeBlockSizes(int total, int num_procs) {
59 std::vector<int> sizes(num_procs, 0);
60 int base = total / num_procs;
61 int extra = total % num_procs;
62 for (int i = 0; i < num_procs; ++i) {
63 sizes[i] = base + (i < extra ? 1 : 0);
64 }
65 return sizes;
66 }
67
68 std::vector<int> ComputeBlockOffsets(const std::vector<int> &sizes) {
69 std::vector<int> offsets(sizes.size(), 0);
70 for (size_t i = 1; i < sizes.size(); ++i) {
71 offsets[i] = offsets[i - 1] + sizes[i - 1];
72 }
73 return offsets;
74 }
75
76 30 bool RunSequentialFallback(int rank, size_t rows_a, size_t cols_a, size_t cols_b, const std::vector<double> &matrix_a,
77 const std::vector<double> &matrix_b, std::vector<double> &output) {
78
2/2
✓ Branch 0 taken 15 times.
✓ Branch 1 taken 15 times.
30 if (rank == 0) {
79 15 output.resize(rows_a * cols_b, 0.0);
80
2/2
✓ Branch 0 taken 38 times.
✓ Branch 1 taken 15 times.
53 for (size_t i = 0; i < rows_a; ++i) {
81
2/2
✓ Branch 0 taken 94 times.
✓ Branch 1 taken 38 times.
132 for (size_t j = 0; j < cols_b; ++j) {
82 double sum = 0.0;
83
2/2
✓ Branch 0 taken 274 times.
✓ Branch 1 taken 94 times.
368 for (size_t k = 0; k < cols_a; ++k) {
84 274 sum += matrix_a[(i * cols_a) + k] * matrix_b[(k * cols_b) + j];
85 }
86 94 output[(i * cols_b) + j] = sum;
87 }
88 }
89 }
90
91 15 int total_size = (rank == 0) ? static_cast<int>(output.size()) : 0;
92 30 MPI_Bcast(&total_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
93
2/2
✓ Branch 0 taken 15 times.
✓ Branch 1 taken 15 times.
30 if (rank != 0) {
94 15 output.resize(total_size);
95 }
96
1/2
✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
30 if (total_size > 0) {
97 30 MPI_Bcast(output.data(), total_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
98 }
99 30 return true;
100 }
101
102 void DistributeMatrixAFromRoot(int size, int m, const std::vector<double> &matrix_a, const std::vector<int> &row_counts,
103 const std::vector<int> &row_displs, std::vector<double> &local_a) {
104 std::vector<int> sendcounts_a(size);
105 std::vector<int> displs_a(size);
106 for (int i = 0; i < size; ++i) {
107 sendcounts_a[i] = row_counts[i] * m;
108 displs_a[i] = row_displs[i] * m;
109 }
110
111 int my_rows = (local_a.empty()) ? 0 : static_cast<int>(local_a.size()) / m;
112 if (my_rows > 0) {
113 MPI_Scatterv(matrix_a.data(), sendcounts_a.data(), displs_a.data(), MPI_DOUBLE, local_a.data(), my_rows * m,
114 MPI_DOUBLE, 0, MPI_COMM_WORLD);
115 } else {
116 MPI_Scatterv(matrix_a.data(), sendcounts_a.data(), displs_a.data(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0,
117 MPI_COMM_WORLD);
118 }
119 }
120
121 std::tuple<std::vector<int>, std::vector<int>, std::vector<double>> DistributeMatrixA(
122 int rank, int size, int n, int m, const std::vector<double> &matrix_a) {
123 auto row_counts = ComputeBlockSizes(n, size);
124 auto row_displs = ComputeBlockOffsets(row_counts);
125 int my_rows = row_counts[rank];
126
127 std::vector<double> local_a;
128 if (my_rows > 0) {
129 local_a.resize(static_cast<size_t>(my_rows) * static_cast<size_t>(m));
130 }
131
132 DistributeMatrixAFromRoot(size, m, matrix_a, row_counts, row_displs, local_a);
133 return {row_counts, row_displs, local_a};
134 }
135
136 std::vector<double> ExtractColumnBlock(int m, int p, int dest_col_start, int dest_col_count,
137 const std::vector<double> &matrix_b) {
138 std::vector<double> buf(static_cast<size_t>(m) * static_cast<size_t>(dest_col_count));
139 for (int row = 0; row < m; ++row) {
140 for (int col = 0; col < dest_col_count; ++col) {
141 int global_col = dest_col_start + col;
142 buf[(static_cast<size_t>(row) * static_cast<size_t>(dest_col_count)) + static_cast<size_t>(col)] =
143 matrix_b[(static_cast<size_t>(row) * static_cast<size_t>(p)) + static_cast<size_t>(global_col)];
144 }
145 }
146 return buf;
147 }
148
149 void DistributeMatrixBFromRoot(int size, int m, int p, const std::vector<double> &matrix_b,
150 const std::vector<int> &col_counts, const std::vector<int> &col_displs,
151 std::vector<double> &current_b, int &current_cols) {
152 for (int dest = 0; dest < size; ++dest) {
153 if (col_counts[dest] > 0) {
154 std::vector<double> buf = ExtractColumnBlock(m, p, col_displs[dest], col_counts[dest], matrix_b);
155 if (dest == 0) {
156 current_b = std::move(buf);
157 current_cols = col_counts[0];
158 } else {
159 MPI_Send(buf.data(), static_cast<int>(buf.size()), MPI_DOUBLE, dest, 100, MPI_COMM_WORLD);
160 }
161 } else {
162 if (dest == 0) {
163 current_cols = 0;
164 } else {
165 MPI_Send(nullptr, 0, MPI_DOUBLE, dest, 100, MPI_COMM_WORLD);
166 }
167 }
168 }
169 }
170
171 std::tuple<std::vector<int>, std::vector<int>, std::vector<double>, int> DistributeMatrixB(
172 int rank, int size, int m, int p, const std::vector<double> &matrix_b) {
173 auto col_counts = ComputeBlockSizes(p, size);
174 auto col_displs = ComputeBlockOffsets(col_counts);
175
176 std::vector<double> current_b;
177 int current_cols = 0;
178
179 if (rank == 0) {
180 DistributeMatrixBFromRoot(size, m, p, matrix_b, col_counts, col_displs, current_b, current_cols);
181 } else {
182 if (col_counts[rank] > 0) {
183 current_b.resize(static_cast<size_t>(m) * static_cast<size_t>(col_counts[rank]));
184 MPI_Recv(current_b.data(), static_cast<int>(current_b.size()), MPI_DOUBLE, 0, 100, MPI_COMM_WORLD,
185 MPI_STATUS_IGNORE);
186 current_cols = col_counts[rank];
187 } else {
188 MPI_Recv(nullptr, 0, MPI_DOUBLE, 0, 100, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
189 current_cols = 0;
190 }
191 }
192
193 return {col_counts, col_displs, current_b, current_cols};
194 }
195
196 void PerformLocalComputation(int my_rows, int current_cols, int m, int p, int stripe_offset,
197 const std::vector<double> &local_a, const std::vector<double> &current_b,
198 std::vector<double> &local_c) {
199 for (int i = 0; i < my_rows; ++i) {
200 for (int j = 0; j < current_cols; ++j) {
201 double sum = 0.0;
202 for (int k = 0; k < m; ++k) {
203 sum += local_a[(static_cast<size_t>(i) * static_cast<size_t>(m)) + static_cast<size_t>(k)] *
204                    current_b[(static_cast<size_t>(k) * static_cast<size_t>(current_cols)) + static_cast<size_t>(j)];
205 }
206 local_c[(static_cast<size_t>(i) * static_cast<size_t>(p)) + static_cast<size_t>(stripe_offset + j)] = sum;
207 }
208 }
209 }
210
211 std::pair<std::vector<double>, int> ShiftMatrixB(int rank, int size, int m, std::vector<double> &current_b,
212 int current_cols) {
213 int next = (rank + 1) % size;
214 int prev = (rank - 1 + size) % size;
215
216 int send_cols = current_cols;
217 int recv_cols = 0;
218 MPI_Sendrecv(&send_cols, 1, MPI_INT, next, 200, &recv_cols, 1, MPI_INT, prev, 200, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
219
220 std::vector<double> recv_buffer;
221 if (recv_cols > 0) {
222 recv_buffer.resize(static_cast<size_t>(m) * static_cast<size_t>(recv_cols));
223 }
224
225 int send_count = send_cols * m;
226 int recv_count = recv_cols * m;
227 const double *send_ptr = (send_count > 0 && !current_b.empty()) ? current_b.data() : nullptr;
228 double *recv_ptr = (recv_count > 0) ? recv_buffer.data() : nullptr;
229
230 MPI_Sendrecv(send_ptr, send_count, MPI_DOUBLE, next, 201, recv_ptr, recv_count, MPI_DOUBLE, prev, 201, MPI_COMM_WORLD,
231 MPI_STATUS_IGNORE);
232
233 if (recv_cols > 0) {
234 return {std::move(recv_buffer), recv_cols};
235 }
236 return {{}, 0};
237 }
238
239 std::vector<double> ComputeWithCyclicShift(int rank, int size, int m, int p, const std::vector<double> &local_a,
240 std::vector<double> current_b, int current_cols,
241 const std::vector<int> &col_displs) {
242 int my_rows = (local_a.empty()) ? 0 : static_cast<int>(local_a.size()) / m;
243 std::vector<double> local_c;
244 if (my_rows > 0) {
245 local_c.resize(static_cast<size_t>(my_rows) * static_cast<size_t>(p), 0.0);
246 }
247
248 int stripe_owner = rank;
249 for (int step = 0; step < size; ++step) {
250 if (my_rows > 0 && current_cols > 0 && !current_b.empty()) {
251 int stripe_offset = (static_cast<size_t>(stripe_owner) < col_displs.size()) ? col_displs[stripe_owner] : 0;
252 PerformLocalComputation(my_rows, current_cols, m, p, stripe_offset, local_a, current_b, local_c);
253 }
254
255 if (step == size - 1) {
256 break;
257 }
258
259 auto [new_b, new_cols] = ShiftMatrixB(rank, size, m, current_b, current_cols);
260 current_b = std::move(new_b);
261 current_cols = new_cols;
262 stripe_owner = (stripe_owner - 1 + size) % size;
263 }
264
265 return local_c;
266 }
267
268 void BroadcastFinalResult(int rank, int size, int n, int p, const std::vector<int> &row_counts,
269 const std::vector<int> &row_displs, const std::vector<double> &local_c,
270 std::vector<double> &result) {
271 std::vector<int> result_counts(size);
272 std::vector<int> result_displs(size);
273 for (int i = 0; i < size; ++i) {
274 result_counts[i] = row_counts[i] * p;
275 result_displs[i] = row_displs[i] * p;
276 }
277
278 if (rank == 0) {
279 result.resize(static_cast<size_t>(n) * static_cast<size_t>(p));
280 }
281
282 int my_rows = (local_c.empty()) ? 0 : static_cast<int>(local_c.size()) / p;
283 int local_result_elements = my_rows * p;
284 const double *local_result_ptr = (my_rows > 0) ? local_c.data() : nullptr;
285
286 MPI_Gatherv(local_result_ptr, local_result_elements, MPI_DOUBLE, result.data(), result_counts.data(),
287 result_displs.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
288
289 int total_size = n * p;
290 if (rank != 0) {
291 result.resize(static_cast<size_t>(total_size));
292 }
293 MPI_Bcast(result.data(), total_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
294 }
295
296 bool RunStripedScheme(int rank, int size, size_t rows_a, size_t cols_a, size_t cols_b,
297 const std::vector<double> &matrix_a, const std::vector<double> &matrix_b,
298 std::vector<double> &output) {
299 const int n = static_cast<int>(rows_a);
300 const int m = static_cast<int>(cols_a);
301 const int p = static_cast<int>(cols_b);
302
303 auto [row_counts, row_displs, local_a] = DistributeMatrixA(rank, size, n, m, matrix_a);
304 auto [col_counts, col_displs, current_b, current_cols] = DistributeMatrixB(rank, size, m, p, matrix_b);
305 auto local_c = ComputeWithCyclicShift(rank, size, m, p, local_a, current_b, current_cols, col_displs);
306
307 BroadcastFinalResult(rank, size, n, p, row_counts, row_displs, local_c, output);
308 return true;
309 }
310
311 } // namespace
312
313 30 bool BatushinIStripedMatrixMultiplicationMPI::RunImpl() {
314 30 int rank = 0;
315 30 int size = 0;
316 30 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
317 30 MPI_Comm_size(MPI_COMM_WORLD, &size);
318
319 const auto &input = GetInput();
320 30 const size_t rows_a = std::get<0>(input);
321
1/2
✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
30 const size_t cols_a = std::get<1>(input);
322 const auto &matrix_a = std::get<2>(input);
323 30 const size_t cols_b = std::get<4>(input);
324 const auto &matrix_b = std::get<5>(input);
325
326 30 std::vector<double> output;
327
328
2/4
✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 26 times.
56 if (std::cmp_greater(size, rows_a) || std::cmp_greater(size, cols_b) || size <= 4) {
329
1/2
✓ Branch 1 taken 30 times.
✗ Branch 2 not taken.
30 RunSequentialFallback(rank, rows_a, cols_a, cols_b, matrix_a, matrix_b, output);
330 } else {
331 RunStripedScheme(rank, size, rows_a, cols_a, cols_b, matrix_a, matrix_b, output);
332 }
333
334 GetOutput() = std::move(output);
335 30 return true;
336 }
337
338 30 bool BatushinIStripedMatrixMultiplicationMPI::PostProcessingImpl() {
339 30 return !GetOutput().empty();
340 }
341
342 } // namespace batushin_i_striped_matrix_multiplication
343