GCC Code Coverage Report


Directory: ./
File: tasks/akhmetov_daniil_strassen_dense_double/tbb/src/ops_tbb.cpp
Date: 2026-06-04 20:25:32
Exec Total Coverage
Lines: 46 164 28.0%
Functions: 7 18 38.9%
Branches: 22 154 14.3%

Line Branch Exec Source
1 #include "akhmetov_daniil_strassen_dense_double/tbb/include/ops_tbb.hpp"
2
3 #include <oneapi/tbb/global_control.h>
4 #include <oneapi/tbb/parallel_invoke.h>
5
6 #include <algorithm>
7 #include <cstddef>
8 #include <vector>
9
10 #include "akhmetov_daniil_strassen_dense_double/common/include/common.hpp"
11 #include "util/include/util.hpp"
12
13 namespace akhmetov_daniil_strassen_dense_double {
14
15 namespace {
16
17 constexpr std::size_t kCutoff = 256;
18 constexpr std::size_t kBlockSize = 64;
19
20 std::size_t NextPow2(std::size_t x) {
21 if (x <= 1) {
22 return 1;
23 }
24 std::size_t p = 1;
25 while (p < x) {
26 p <<= 1;
27 }
28 return p;
29 }
30
31 void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
32
2/2
✓ Branch 0 taken 1792 times.
✓ Branch 1 taken 12 times.
1804 for (std::size_t i = 0; i < n; ++i) {
33 1792 std::fill_n(dst + (i * stride), n, 0.0);
34 }
35 }
36
37 void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
38 std::size_t n, double b_coeff) {
39 for (std::size_t i = 0; i < n; ++i) {
40 const double *a_row = a + (i * a_stride);
41 const double *b_row = b + (i * b_stride);
42 double *dst_row = dst + (i * n);
43 for (std::size_t j = 0; j < n; ++j) {
44 dst_row[j] = a_row[j] + (b_coeff * b_row[j]);
45 }
46 }
47 }
48
49 void MulMicroBlock(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
50 std::size_t c_stride, std::size_t i_begin, std::size_t i_end, std::size_t k_begin, std::size_t k_end,
51 std::size_t j_begin, std::size_t j_end) {
52
2/2
✓ Branch 0 taken 18688 times.
✓ Branch 1 taken 292 times.
18980 for (std::size_t i = i_begin; i < i_end; ++i) {
53 18688 double *c_row = c + (i * c_stride);
54 18688 const double *a_row = a + (i * a_stride);
55
2/2
✓ Branch 0 taken 1196032 times.
✓ Branch 1 taken 18688 times.
1214720 for (std::size_t k = k_begin; k < k_end; ++k) {
56 1196032 const double aik = a_row[k];
57 1196032 const double *b_row = b + (k * b_stride);
58
2/2
✓ Branch 0 taken 76546048 times.
✓ Branch 1 taken 1196032 times.
77742080 for (std::size_t j = j_begin; j < j_end; ++j) {
59 76546048 c_row[j] += aik * b_row[j];
60 }
61 }
62 }
63 }
64
65 84 void MulJBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
66 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end, std::size_t kk,
67 std::size_t k_end) {
68
2/2
✓ Branch 0 taken 292 times.
✓ Branch 1 taken 84 times.
376 for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
69 292 const std::size_t j_end = std::min(jj + kBlockSize, n);
70 MulMicroBlock(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
71 }
72 84 }
73
74 void MulKBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
75 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end) {
76
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 28 times.
112 for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
77 84 const std::size_t k_end = std::min(kk + kBlockSize, n);
78 84 MulJBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end, kk, k_end);
79 }
80 }
81
82 12 void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
83 std::size_t c_stride, std::size_t n) {
84 12 ZeroMatrix(c, c_stride, n);
85
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 12 times.
40 for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
86 28 const std::size_t i_end = std::min(ii + kBlockSize, n);
87 MulKBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end);
88 }
89 12 }
90
91 void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
92 const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
93 const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
94 for (std::size_t i = 0; i < half; ++i) {
95 double *c11 = c + (i * c_stride);
96 double *c12 = c11 + half;
97 double *c21 = c + ((i + half) * c_stride);
98 double *c22 = c21 + half;
99
100 const double *m1_row = m1.data() + (i * half);
101 const double *m2_row = m2.data() + (i * half);
102 const double *m3_row = m3.data() + (i * half);
103 const double *m4_row = m4.data() + (i * half);
104 const double *m5_row = m5.data() + (i * half);
105 const double *m6_row = m6.data() + (i * half);
106 const double *m7_row = m7.data() + (i * half);
107
108 for (std::size_t j = 0; j < half; ++j) {
109 c11[j] = m1_row[j] + m4_row[j] - m5_row[j] + m7_row[j];
110 c12[j] = m3_row[j] + m5_row[j];
111 c21[j] = m2_row[j] + m4_row[j];
112 c22[j] = m1_row[j] - m2_row[j] + m3_row[j] + m6_row[j];
113 }
114 }
115 }
116
117 using StrassenFn = void (*)(const double *, std::size_t, const double *, std::size_t, double *, std::size_t,
118 std::size_t);
119 void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
120 std::size_t c_stride, std::size_t n);
121 constexpr StrassenFn kStrassenSeqFn = &StrassenSeqImpl;
122
123 void ComputeProduct(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride, double a2_coeff,
124 const double *b1, std::size_t b1_stride, const double *b2, std::size_t b2_stride, double b2_coeff,
125 std::vector<double> &out, std::size_t n) {
126 std::vector<double> lhs(n * n);
127 std::vector<double> rhs(n * n);
128 AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
129 AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
130 out.assign(n * n, 0.0);
131 kStrassenSeqFn(lhs.data(), n, rhs.data(), n, out.data(), n, n);
132 }
133
134 void ComputeProductSingle(const double *a, std::size_t a_stride, const double *b1, std::size_t b1_stride,
135 const double *b2, std::size_t b2_stride, double b2_coeff, std::vector<double> &out,
136 std::size_t n) {
137 std::vector<double> rhs(n * n);
138 AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
139 out.assign(n * n, 0.0);
140 kStrassenSeqFn(a, a_stride, rhs.data(), n, out.data(), n, n);
141 }
142
143 void ComputeProductSingleLeft(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride,
144 double a2_coeff, const double *b, std::size_t b_stride, std::vector<double> &out,
145 std::size_t n) {
146 std::vector<double> lhs(n * n);
147 AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
148 out.assign(n * n, 0.0);
149 kStrassenSeqFn(lhs.data(), n, b, b_stride, out.data(), n, n);
150 }
151
152 void StrassenTopTbb(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
153 std::size_t c_stride, std::size_t n) {
154 if (n <= kCutoff || ppc::util::GetNumThreads() <= 1) {
155 kStrassenSeqFn(a, a_stride, b, b_stride, c, c_stride, n);
156 return;
157 }
158
159 const std::size_t half = n / 2;
160
161 const double *a11 = a;
162 const double *a12 = a + half;
163 const double *a21 = a + (half * a_stride);
164 const double *a22 = a21 + half;
165
166 const double *b11 = b;
167 const double *b12 = b + half;
168 const double *b21 = b + (half * b_stride);
169 const double *b22 = b21 + half;
170
171 std::vector<double> m1;
172 std::vector<double> m2;
173 std::vector<double> m3;
174 std::vector<double> m4;
175 std::vector<double> m5;
176 std::vector<double> m6;
177 std::vector<double> m7;
178
179 oneapi::tbb::parallel_invoke(
180 [&] { ComputeProduct(a11, a_stride, a22, a_stride, 1.0, b11, b_stride, b22, b_stride, 1.0, m1, half); }, [&] {
181 ComputeProductSingleLeft(a21, a_stride, a22, a_stride, 1.0, b11, b_stride, m2, half);
182 }, [&] { ComputeProductSingle(a11, a_stride, b12, b_stride, b22, b_stride, -1.0, m3, half); }, [&] {
183 ComputeProductSingle(a22, a_stride, b21, b_stride, b11, b_stride, -1.0, m4, half);
184 }, [&] { ComputeProductSingleLeft(a11, a_stride, a12, a_stride, 1.0, b22, b_stride, m5, half); }, [&] {
185 ComputeProduct(a21, a_stride, a11, a_stride, -1.0, b11, b_stride, b12, b_stride, 1.0, m6, half);
186 }, [&] { ComputeProduct(a12, a_stride, a22, a_stride, -1.0, b21, b_stride, b22, b_stride, 1.0, m7, half); });
187
188 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
189 }
190
191 void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
192 std::size_t c_stride, std::size_t n) {
193 if (n <= kCutoff) {
194 NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
195 return;
196 }
197
198 const std::size_t half = n / 2;
199
200 const double *a11 = a;
201 const double *a12 = a + half;
202 const double *a21 = a + (half * a_stride);
203 const double *a22 = a21 + half;
204
205 const double *b11 = b;
206 const double *b12 = b + half;
207 const double *b21 = b + (half * b_stride);
208 const double *b22 = b21 + half;
209
210 std::vector<double> m1(half * half);
211 std::vector<double> m2(half * half);
212 std::vector<double> m3(half * half);
213 std::vector<double> m4(half * half);
214 std::vector<double> m5(half * half);
215 std::vector<double> m6(half * half);
216 std::vector<double> m7(half * half);
217 std::vector<double> lhs(half * half);
218 std::vector<double> rhs(half * half);
219
220 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
221 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
222 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
223
224 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
225 kStrassenSeqFn(lhs.data(), half, b11, b_stride, m2.data(), half, half);
226
227 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
228 kStrassenSeqFn(a11, a_stride, rhs.data(), half, m3.data(), half, half);
229
230 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
231 kStrassenSeqFn(a22, a_stride, rhs.data(), half, m4.data(), half, half);
232
233 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
234 kStrassenSeqFn(lhs.data(), half, b22, b_stride, m5.data(), half, half);
235
236 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
237 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
238 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
239
240 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
241 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
242 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
243
244 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
245 }
246
247 } // namespace
248
249
1/2
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
12 AkhmetovDStrassenDenseDoubleTBB::AkhmetovDStrassenDenseDoubleTBB(const InType &in) {
250 SetTypeOfTask(GetStaticTypeOfTask());
251
1/2
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
12 GetInput() = in;
252 12 }
253
254
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
12 bool AkhmetovDStrassenDenseDoubleTBB::ValidationImpl() {
255 const auto &input = GetInput();
256
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
12 if (input.empty()) {
257 return false;
258 }
259 12 const size_t n = format::GetN(input);
260
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
12 if (n == 0) {
261 return false;
262 }
263 12 const size_t expected_size = 1 + (2 * n * n);
264 12 return input.size() == expected_size;
265 }
266
267 12 bool AkhmetovDStrassenDenseDoubleTBB::PreProcessingImpl() {
268 const auto &input = GetInput();
269 12 const size_t n = format::GetN(input);
270 12 GetOutput().assign(n * n, 0.0);
271 12 return true;
272 }
273
274 12 bool AkhmetovDStrassenDenseDoubleTBB::RunImpl() {
275 const auto &input = GetInput();
276 auto &output = GetOutput();
277
278 12 const std::size_t n = format::GetN(input);
279 12 const Matrix a = format::GetA(input);
280
1/2
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
12 const Matrix b = format::GetB(input);
281
282
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
12 if (n <= kCutoff || (n % 2U) != 0U) {
283
1/2
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
12 output.assign(n * n, 0.0);
284 12 NaiveMulBlocked(a.data(), n, b.data(), n, output.data(), n, n);
285 12 return true;
286 }
287
288 const std::size_t padded = NextPow2(n);
289 Matrix a_pad(padded * padded, 0.0);
290 Matrix b_pad(padded * padded, 0.0);
291 Matrix c_pad(padded * padded, 0.0);
292
293 for (std::size_t i = 0; i < n; ++i) {
294 std::copy_n(a.data() + (i * n), n, a_pad.data() + (i * padded));
295 std::copy_n(b.data() + (i * n), n, b_pad.data() + (i * padded));
296 }
297
298 oneapi::tbb::global_control control(oneapi::tbb::global_control::max_allowed_parallelism,
299 static_cast<std::size_t>(std::max(1, ppc::util::GetNumThreads())));
300 (void)control;
301 StrassenTopTbb(a_pad.data(), padded, b_pad.data(), padded, c_pad.data(), padded, padded);
302
303 output.assign(n * n, 0.0);
304 for (std::size_t i = 0; i < n; ++i) {
305 std::copy_n(c_pad.data() + (i * padded), n, output.data() + (i * n));
306 }
307
308 return true;
309 }
310
311 12 bool AkhmetovDStrassenDenseDoubleTBB::PostProcessingImpl() {
312 const auto &input = GetInput();
313 12 const size_t n = format::GetN(input);
314 12 return GetOutput().size() == n * n;
315 }
316
317 } // namespace akhmetov_daniil_strassen_dense_double
318