GCC Code Coverage Report


Directory: ./
File: tasks/akhmetov_daniil_strassen_dense_double/all/src/ops_all.cpp
Date: 2026-06-04 20:25:32
Exec Total Coverage
Lines: 157 159 98.7%
Functions: 13 13 100.0%
Branches: 54 98 55.1%

Line Branch Exec Source
1 #include "akhmetov_daniil_strassen_dense_double/all/include/ops_all.hpp"
2
3 #include <mpi.h>
4
5 #include <atomic>
6 #include <cstddef>
7 #include <thread>
8 #include <utility>
9 #include <vector>
10
11 #include "akhmetov_daniil_strassen_dense_double/common/include/common.hpp"
12 #include "oneapi/tbb/parallel_for.h"
13 #include "util/include/util.hpp"
14
15 namespace akhmetov_daniil_strassen_dense_double {
16
17
1/2
✓ Branch 1 taken 6 times.
✗ Branch 2 not taken.
6 AkhmetovDStrassenDenseDoubleALL::AkhmetovDStrassenDenseDoubleALL(const InType &in) {
18 SetTypeOfTask(GetStaticTypeOfTask());
19
1/2
✓ Branch 1 taken 6 times.
✗ Branch 2 not taken.
6 GetInput() = in;
20 6 }
21
22
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 bool AkhmetovDStrassenDenseDoubleALL::ValidationImpl() {
23 const auto &input = GetInput();
24
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 if (input.empty()) {
25 return false;
26 }
27 6 const size_t n = format::GetN(input);
28
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 if (n == 0) {
29 return false;
30 }
31 6 const size_t expected_size = 1 + (2 * n * n);
32 6 return input.size() == expected_size;
33 }
34
35 6 bool AkhmetovDStrassenDenseDoubleALL::PreProcessingImpl() {
36 const auto &input = GetInput();
37 6 const size_t n = format::GetN(input);
38 6 GetOutput().assign(n * n, 0.0);
39 6 return true;
40 }
41
42 namespace {
43
44 constexpr size_t kThreshold = 64;
45 [[maybe_unused]] constexpr size_t kParallelThreshold = 256;
46
47 inline size_t NextPow2(size_t n) {
48 size_t p = 1;
49
2/2
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 4 times.
34 while (p < n) {
50 30 p <<= 1;
51 }
52 return p;
53 }
54
55 114 Matrix StandardMultiply(const Matrix &a, const Matrix &b, size_t size) {
56 114 Matrix c(size * size, 0.0);
57 114 #pragma omp parallel for default(none) shared(a, b, c, size) schedule(static) if (size >= kParallelThreshold)
58 for (size_t i = 0; i < size; ++i) {
59 for (size_t k = 0; k < size; ++k) {
60 const double aik = a.at((i * size) + k);
61 const size_t bk = k * size;
62 const size_t ci = i * size;
63 for (size_t j = 0; j < size; ++j) {
64 c.at(ci + j) += aik * b.at(bk + j);
65 }
66 }
67 }
68 114 return c;
69 }
70
71 36 void Split(const Matrix &src, Matrix &a11, Matrix &a12, Matrix &a21, Matrix &a22, size_t size) {
72 36 const size_t half = size / 2;
73 36 const size_t block = half * half;
74 36 a11.assign(block, 0.0);
75 36 a12.assign(block, 0.0);
76 36 a21.assign(block, 0.0);
77 36 a22.assign(block, 0.0);
78
79 36 #pragma omp parallel for default(none) shared(src, a11, a12, a21, a22, size, half) \
80 schedule(static) if (size >= kParallelThreshold)
81 for (size_t i = 0; i < half; ++i) {
82 const size_t is = i * size;
83 const size_t ih = i * half;
84 const size_t is2 = (i + half) * size;
85 for (size_t j = 0; j < half; ++j) {
86 a11.at(ih + j) = src.at(is + j);
87 a12.at(ih + j) = src.at(is + j + half);
88 a21.at(ih + j) = src.at(is2 + j);
89 a22.at(ih + j) = src.at(is2 + j + half);
90 }
91 }
92 36 }
93
94 void Merge(Matrix &dst, const Matrix &c11, const Matrix &c12, const Matrix &c21, const Matrix &c22, size_t size) {
95 18 const size_t half = size / 2;
96 18 #pragma omp parallel for default(none) shared(dst, c11, c12, c21, c22, size, half) \
97 schedule(static) if (size >= kParallelThreshold)
98 for (size_t i = 0; i < half; ++i) {
99 const size_t is = i * size;
100 const size_t ih = i * half;
101 const size_t is2 = (i + half) * size;
102 for (size_t j = 0; j < half; ++j) {
103 dst.at(is + j) = c11.at(ih + j);
104 dst.at(is + j + half) = c12.at(ih + j);
105 dst.at(is2 + j) = c21.at(ih + j);
106 dst.at(is2 + j + half) = c22.at(ih + j);
107 }
108 }
109 }
110
111 108 inline void AddInto(const Matrix &a, const Matrix &b, Matrix &c) {
112 const size_t n = a.size();
113 108 #pragma omp parallel for default(none) shared(a, b, c, n) \
114 schedule(static) if (n >= (kParallelThreshold * kParallelThreshold))
115 for (size_t i = 0; i < n; ++i) {
116 c.at(i) = a.at(i) + b.at(i);
117 }
118 108 }
119
120 72 inline void SubInto(const Matrix &a, const Matrix &b, Matrix &c) {
121 const size_t n = a.size();
122 72 #pragma omp parallel for default(none) shared(a, b, c, n) \
123 schedule(static) if (n >= (kParallelThreshold * kParallelThreshold))
124 for (size_t i = 0; i < n; ++i) {
125 c.at(i) = a.at(i) - b.at(i);
126 }
127 72 }
128
129 struct Frame {
130 Matrix a;
131 Matrix b;
132 Matrix result;
133 size_t size;
134 size_t stage{0};
135
136 Matrix a11;
137 Matrix a12;
138 Matrix a21;
139 Matrix a22;
140 Matrix b11;
141 Matrix b12;
142 Matrix b21;
143 Matrix b22;
144
145 Matrix m1;
146 Matrix m2;
147 Matrix m3;
148 Matrix m4;
149 Matrix m5;
150 Matrix m6;
151 Matrix m7;
152
153 Matrix temp_a;
154 Matrix temp_b;
155
156 130 Frame(Matrix aa, Matrix bb, size_t s) : a(std::move(aa)), b(std::move(bb)), size(s) {}
157 };
158
159
1/2
✓ Branch 0 taken 274 times.
✗ Branch 1 not taken.
274 void ProcessTopFrame(std::vector<Frame> &stack, Matrix &final_result) {
160
1/2
✓ Branch 0 taken 274 times.
✗ Branch 1 not taken.
274 if (stack.empty()) {
161 return;
162 }
163
164
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 274 times.
274 const size_t current_index = stack.size() - 1;
165 Frame &frame = stack.at(current_index);
166
167
2/2
✓ Branch 0 taken 112 times.
✓ Branch 1 taken 162 times.
274 if (frame.size <= kThreshold) {
168 112 Matrix base = StandardMultiply(frame.a, frame.b, frame.size);
169 stack.pop_back();
170
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 112 times.
112 if (stack.empty()) {
171 final_result = std::move(base);
172 } else {
173 112 stack.back().temp_a = std::move(base);
174 }
175 return;
176 }
177
178 162 const size_t half = frame.size / 2;
179 162 const size_t block_size = half * half;
180
181
9/10
✓ Branch 0 taken 18 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 18 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 18 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 18 times.
✓ Branch 8 taken 18 times.
✗ Branch 9 not taken.
162 switch (frame.stage) {
182 18 case 0: {
183 18 Split(frame.a, frame.a11, frame.a12, frame.a21, frame.a22, frame.size);
184 18 Split(frame.b, frame.b11, frame.b12, frame.b21, frame.b22, frame.size);
185
186 18 frame.temp_a.assign(block_size, 0.0);
187 18 frame.temp_b.assign(block_size, 0.0);
188 18 frame.m1.assign(block_size, 0.0);
189 18 frame.m2.assign(block_size, 0.0);
190 18 frame.m3.assign(block_size, 0.0);
191 18 frame.m4.assign(block_size, 0.0);
192 18 frame.m5.assign(block_size, 0.0);
193 18 frame.m6.assign(block_size, 0.0);
194 18 frame.m7.assign(block_size, 0.0);
195
196 18 frame.stage = 1;
197 18 return;
198 }
199
200 18 case 1: {
201 18 AddInto(frame.a11, frame.a22, frame.temp_a);
202 18 AddInto(frame.b11, frame.b22, frame.temp_b);
203
204 18 stack.emplace_back(frame.temp_a, frame.temp_b, half);
205 18 stack.at(current_index).stage = 2;
206 18 return;
207 }
208
209 18 case 2: {
210 18 frame.m1 = frame.temp_a;
211 18 AddInto(frame.a21, frame.a22, frame.temp_a);
212
213 18 stack.emplace_back(frame.temp_a, frame.b11, half);
214 18 stack.at(current_index).stage = 3;
215 18 return;
216 }
217
218 18 case 3: {
219 18 frame.m2 = frame.temp_a;
220 18 SubInto(frame.b12, frame.b22, frame.temp_b);
221
222 18 stack.emplace_back(frame.a11, frame.temp_b, half);
223 18 stack.at(current_index).stage = 4;
224 18 return;
225 }
226
227 18 case 4: {
228 18 frame.m3 = frame.temp_a;
229 18 SubInto(frame.b21, frame.b11, frame.temp_b);
230
231 18 stack.emplace_back(frame.a22, frame.temp_b, half);
232 18 stack.at(current_index).stage = 5;
233 18 return;
234 }
235
236 18 case 5: {
237 18 frame.m4 = frame.temp_a;
238 18 AddInto(frame.a11, frame.a12, frame.temp_a);
239
240 18 stack.emplace_back(frame.temp_a, frame.b22, half);
241 18 stack.at(current_index).stage = 6;
242 18 return;
243 }
244
245 18 case 6: {
246 18 frame.m5 = frame.temp_a;
247 18 SubInto(frame.a21, frame.a11, frame.temp_a);
248 18 AddInto(frame.b11, frame.b12, frame.temp_b);
249
250 18 stack.emplace_back(frame.temp_a, frame.temp_b, half);
251 18 stack.at(current_index).stage = 7;
252 18 return;
253 }
254
255 18 case 7: {
256 18 frame.m6 = frame.temp_a;
257 18 SubInto(frame.a12, frame.a22, frame.temp_a);
258 18 AddInto(frame.b21, frame.b22, frame.temp_b);
259
260 18 stack.emplace_back(frame.temp_a, frame.temp_b, half);
261 18 stack.at(current_index).stage = 8;
262 18 return;
263 }
264
265 18 case 8: {
266 18 frame.m7 = frame.temp_a;
267
268 18 Matrix c11(block_size);
269
1/4
✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
18 Matrix c12(block_size);
270
1/4
✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
18 Matrix c21(block_size);
271
1/4
✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
18 Matrix c22(block_size);
272
273 18 #pragma omp parallel for default(none) shared(c11, c12, c21, c22, frame, block_size) \
274 schedule(static) if (block_size >= (kParallelThreshold * kParallelThreshold))
275 for (size_t i = 0; i < block_size; ++i) {
276 c11.at(i) = frame.m1.at(i) + frame.m4.at(i) - frame.m5.at(i) + frame.m7.at(i);
277 c12.at(i) = frame.m3.at(i) + frame.m5.at(i);
278 c21.at(i) = frame.m2.at(i) + frame.m4.at(i);
279 c22.at(i) = frame.m1.at(i) - frame.m2.at(i) + frame.m3.at(i) + frame.m6.at(i);
280 }
281
282
1/4
✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
18 Matrix merged(frame.size * frame.size, 0.0);
283 18 Merge(merged, c11, c12, c21, c22, frame.size);
284
285 stack.pop_back();
286
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 14 times.
18 if (stack.empty()) {
287 final_result = std::move(merged);
288 } else {
289 14 stack.back().temp_a = std::move(merged);
290 }
291 return;
292 }
293
294 default:
295 return;
296 }
297 }
298
299 4 Matrix StrassenMultiply(const Matrix &a_init, const Matrix &b_init, size_t size_init) {
300 4 std::vector<Frame> stack;
301
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 stack.emplace_back(a_init, b_init, size_init);
302
303 4 Matrix result;
304
2/2
✓ Branch 0 taken 274 times.
✓ Branch 1 taken 4 times.
278 while (!stack.empty()) {
305
1/2
✓ Branch 1 taken 274 times.
✗ Branch 2 not taken.
274 ProcessTopFrame(stack, result);
306 }
307 4 return result;
308 4 }
309
310 8 Matrix PadTo(const Matrix &src, size_t n, size_t new_n) {
311
1/2
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
8 if (new_n == n) {
312 8 return src;
313 }
314 Matrix padded(new_n * new_n, 0.0);
315 #pragma omp parallel for default(none) shared(padded, src, n, new_n) schedule(static) if (new_n >= kParallelThreshold)
316 for (size_t i = 0; i < n; ++i) {
317 const size_t is = i * n;
318 const size_t id = i * new_n;
319 for (size_t j = 0; j < n; ++j) {
320 padded.at(id + j) = src.at(is + j);
321 }
322 }
323 return padded;
324 }
325
326 } // namespace
327
328 6 bool AkhmetovDStrassenDenseDoubleALL::RunImpl() {
329 const auto &input = GetInput();
330 auto &output = GetOutput();
331
332 6 const size_t n = format::GetN(input);
333 6 const Matrix a = format::GetA(input);
334
1/2
✓ Branch 1 taken 6 times.
✗ Branch 2 not taken.
6 const Matrix b = format::GetB(input);
335
336
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 4 times.
6 if (n <= kThreshold) {
337
1/4
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
2 output = StandardMultiply(a, b, n);
338 2 return true;
339 }
340
341 const size_t new_n = NextPow2(n);
342
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 const Matrix a_padded = PadTo(a, n, new_n);
343
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 const Matrix b_padded = PadTo(b, n, new_n);
344
345
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 const Matrix result_padded = StrassenMultiply(a_padded, b_padded, new_n);
346
347
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 output.assign(n * n, 0.0);
348 4 #pragma omp parallel for default(none) shared(output, result_padded, n, new_n) \
349 schedule(static) if (n >= kParallelThreshold)
350 for (size_t i = 0; i < n; ++i) {
351 const size_t is = i * new_n;
352 const size_t id = i * n;
353 for (size_t j = 0; j < n; ++j) {
354 output.at(id + j) = result_padded.at(is + j);
355 }
356 }
357
358
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 const int num_threads = ppc::util::GetNumThreads();
359 4 int rank = 0;
360
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
361
362
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.
4 if (rank == 0) {
363 2 std::atomic<int> counter(0);
364
1/2
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
2 #pragma omp parallel default(none) shared(counter) num_threads(ppc::util::GetNumThreads())
365 counter++;
366 }
367
368 {
369
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 std::vector<std::thread> threads(num_threads);
370 4 std::atomic<int> counter(0);
371
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 4 times.
12 for (int i = 0; i < num_threads; i++) {
372
2/4
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 8 times.
8 threads.at(i) = std::thread([&]() { counter++; });
373
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 threads.at(i).join();
374 }
375 4 }
376
377 {
378 4 std::atomic<int> counter(0);
379
1/4
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
12 tbb::parallel_for(0, num_threads, [&](int /*i*/) { counter++; });
380 }
381
382
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 MPI_Barrier(MPI_COMM_WORLD);
383 return true;
384 }
385
386 6 bool AkhmetovDStrassenDenseDoubleALL::PostProcessingImpl() {
387 const auto &input = GetInput();
388 6 const size_t n = format::GetN(input);
389 6 return GetOutput().size() == n * n;
390 }
391
392 } // namespace akhmetov_daniil_strassen_dense_double
393