GCC Code Coverage Report


Directory: ./
File: tasks/zorin_d_strassen_alg_matrix_seq/tbb/src/ops_tbb.cpp
Date: 2026-04-02 17:12:27
Exec Total Coverage
Lines: 59 157 37.6%
Functions: 9 18 50.0%
Branches: 35 142 24.6%

Line Branch Exec Source
1 #include "zorin_d_strassen_alg_matrix_seq/tbb/include/ops_tbb.hpp"
2
3 #include <oneapi/tbb/global_control.h>
4 #include <oneapi/tbb/parallel_invoke.h>
5
6 #include <algorithm>
7 #include <cstddef>
8 #include <vector>
9
10 #include "util/include/util.hpp"
11 #include "zorin_d_strassen_alg_matrix_seq/common/include/common.hpp"
12
13 namespace zorin_d_strassen_alg_matrix_seq {
14
15 namespace {
16
17 constexpr std::size_t kCutoff = 256;
18 constexpr std::size_t kBlockSize = 64;
19
20 std::size_t NextPow2(std::size_t x) {
21 84 if (x <= 1) {
22 return 1;
23 }
24 std::size_t p = 1;
25
2/2
✓ Branch 0 taken 204 times.
✓ Branch 1 taken 72 times.
276 while (p < x) {
26 204 p <<= 1;
27 }
28 return p;
29 }
30
31 void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
32
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 84 times.
744 for (std::size_t i = 0; i < n; ++i) {
33 660 std::fill_n(dst + (i * stride), n, 0.0);
34 }
35 }
36
37 void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
38 std::size_t n, double b_coeff) {
39 for (std::size_t i = 0; i < n; ++i) {
40 const double *a_row = a + (i * a_stride);
41 const double *b_row = b + (i * b_stride);
42 double *dst_row = dst + (i * n);
43 for (std::size_t j = 0; j < n; ++j) {
44 dst_row[j] = a_row[j] + (b_coeff * b_row[j]);
45 }
46 }
47 }
48
49 void MulMicroBlock(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
50 std::size_t c_stride, std::size_t i_begin, std::size_t i_end, std::size_t k_begin, std::size_t k_end,
51 std::size_t j_begin, std::size_t j_end) {
52
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 84 times.
744 for (std::size_t i = i_begin; i < i_end; ++i) {
53 660 double *c_row = c + (i * c_stride);
54 660 const double *a_row = a + (i * a_stride);
55
2/2
✓ Branch 0 taken 7932 times.
✓ Branch 1 taken 660 times.
8592 for (std::size_t k = k_begin; k < k_end; ++k) {
56 7932 const double aik = a_row[k];
57 7932 const double *b_row = b + (k * b_stride);
58
2/2
✓ Branch 0 taken 111468 times.
✓ Branch 1 taken 7932 times.
119400 for (std::size_t j = j_begin; j < j_end; ++j) {
59 111468 c_row[j] += aik * b_row[j];
60 }
61 }
62 }
63 }
64
65 84 void MulJBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
66 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end, std::size_t kk,
67 std::size_t k_end) {
68
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 84 times.
168 for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
69 84 const std::size_t j_end = std::min(jj + kBlockSize, n);
70 MulMicroBlock(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
71 }
72 84 }
73
74 void MulKBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
75 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end) {
76
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 84 times.
168 for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
77 84 const std::size_t k_end = std::min(kk + kBlockSize, n);
78 84 MulJBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end, kk, k_end);
79 }
80 }
81
82 84 void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
83 std::size_t c_stride, std::size_t n) {
84 84 ZeroMatrix(c, c_stride, n);
85
86
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 84 times.
168 for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
87 84 const std::size_t i_end = std::min(ii + kBlockSize, n);
88 MulKBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end);
89 }
90 84 }
91
92 void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
93 const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
94 const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
95 for (std::size_t i = 0; i < half; ++i) {
96 double *c11 = c + (i * c_stride);
97 double *c12 = c11 + half;
98 double *c21 = c + ((i + half) * c_stride);
99 double *c22 = c21 + half;
100
101 const double *m1_row = m1.data() + (i * half);
102 const double *m2_row = m2.data() + (i * half);
103 const double *m3_row = m3.data() + (i * half);
104 const double *m4_row = m4.data() + (i * half);
105 const double *m5_row = m5.data() + (i * half);
106 const double *m6_row = m6.data() + (i * half);
107 const double *m7_row = m7.data() + (i * half);
108
109 for (std::size_t j = 0; j < half; ++j) {
110 c11[j] = m1_row[j] + m4_row[j] - m5_row[j] + m7_row[j];
111 c12[j] = m3_row[j] + m5_row[j];
112 c21[j] = m2_row[j] + m4_row[j];
113 c22[j] = m1_row[j] - m2_row[j] + m3_row[j] + m6_row[j];
114 }
115 }
116 }
117
118 using StrassenSeqFn = void (*)(const double *, std::size_t, const double *, std::size_t, double *, std::size_t,
119 std::size_t);
120
121 void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
122 std::size_t c_stride, std::size_t n);
123
124 constexpr StrassenSeqFn kStrassenSeqFn = &StrassenSeqImpl;
125
126 void ComputeProduct(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride, double a2_coeff,
127 const double *b1, std::size_t b1_stride, const double *b2, std::size_t b2_stride, double b2_coeff,
128 std::vector<double> &out, std::size_t n) {
129 std::vector<double> lhs(n * n);
130 std::vector<double> rhs(n * n);
131 AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
132 AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
133 out.assign(n * n, 0.0);
134 kStrassenSeqFn(lhs.data(), n, rhs.data(), n, out.data(), n, n);
135 }
136
137 void ComputeProductSingle(const double *a, std::size_t a_stride, const double *b1, std::size_t b1_stride,
138 const double *b2, std::size_t b2_stride, double b2_coeff, std::vector<double> &out,
139 std::size_t n) {
140 std::vector<double> rhs(n * n);
141 AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
142 out.assign(n * n, 0.0);
143 kStrassenSeqFn(a, a_stride, rhs.data(), n, out.data(), n, n);
144 }
145
146 void ComputeProductSingleLeft(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride,
147 double a2_coeff, const double *b, std::size_t b_stride, std::vector<double> &out,
148 std::size_t n) {
149 std::vector<double> lhs(n * n);
150 AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
151 out.assign(n * n, 0.0);
152 kStrassenSeqFn(lhs.data(), n, b, b_stride, out.data(), n, n);
153 }
154
155 84 void StrassenTopTbb(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
156 std::size_t c_stride, std::size_t n) {
157
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 84 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
84 if (n <= kCutoff || ppc::util::GetNumThreads() <= 1) {
158 84 kStrassenSeqFn(a, a_stride, b, b_stride, c, c_stride, n);
159 84 return;
160 }
161
162 const std::size_t half = n / 2;
163
164 const double *a11 = a;
165 const double *a12 = a + half;
166 const double *a21 = a + (half * a_stride);
167 const double *a22 = a21 + half;
168
169 const double *b11 = b;
170 const double *b12 = b + half;
171 const double *b21 = b + (half * b_stride);
172 const double *b22 = b21 + half;
173
174 std::vector<double> m1;
175 std::vector<double> m2;
176 std::vector<double> m3;
177 std::vector<double> m4;
178 std::vector<double> m5;
179 std::vector<double> m6;
180 std::vector<double> m7;
181
182 oneapi::tbb::parallel_invoke(
183 [&] { ComputeProduct(a11, a_stride, a22, a_stride, 1.0, b11, b_stride, b22, b_stride, 1.0, m1, half); }, [&] {
184 ComputeProductSingleLeft(a21, a_stride, a22, a_stride, 1.0, b11, b_stride, m2, half);
185 }, [&] { ComputeProductSingle(a11, a_stride, b12, b_stride, b22, b_stride, -1.0, m3, half); }, [&] {
186 ComputeProductSingle(a22, a_stride, b21, b_stride, b11, b_stride, -1.0, m4, half);
187 }, [&] { ComputeProductSingleLeft(a11, a_stride, a12, a_stride, 1.0, b22, b_stride, m5, half); }, [&] {
188 ComputeProduct(a21, a_stride, a11, a_stride, -1.0, b11, b_stride, b12, b_stride, 1.0, m6, half);
189 }, [&] { ComputeProduct(a12, a_stride, a22, a_stride, -1.0, b21, b_stride, b22, b_stride, 1.0, m7, half); });
190
191 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
192 }
193
194 84 void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
195 std::size_t c_stride, std::size_t n) {
196
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (n <= kCutoff) {
197 84 NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
198 84 return;
199 }
200
201 const std::size_t half = n / 2;
202
203 const double *a11 = a;
204 const double *a12 = a + half;
205 const double *a21 = a + (half * a_stride);
206 const double *a22 = a21 + half;
207
208 const double *b11 = b;
209 const double *b12 = b + half;
210 const double *b21 = b + (half * b_stride);
211 const double *b22 = b21 + half;
212
213 std::vector<double> m1(half * half);
214 std::vector<double> m2(half * half);
215 std::vector<double> m3(half * half);
216 std::vector<double> m4(half * half);
217 std::vector<double> m5(half * half);
218 std::vector<double> m6(half * half);
219 std::vector<double> m7(half * half);
220 std::vector<double> lhs(half * half);
221 std::vector<double> rhs(half * half);
222
223 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
224 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
225 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
226
227 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
228 kStrassenSeqFn(lhs.data(), half, b11, b_stride, m2.data(), half, half);
229
230 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
231 kStrassenSeqFn(a11, a_stride, rhs.data(), half, m3.data(), half, half);
232
233 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
234 kStrassenSeqFn(a22, a_stride, rhs.data(), half, m4.data(), half, half);
235
236 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
237 kStrassenSeqFn(lhs.data(), half, b22, b_stride, m5.data(), half, half);
238
239 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
240 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
241 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
242
243 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
244 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
245 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
246
247 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
248 }
249
250 } // namespace
251
252
1/2
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
84 ZorinDStrassenAlgMatrixTBB::ZorinDStrassenAlgMatrixTBB(const InType &in) {
253 SetTypeOfTask(GetStaticTypeOfTask());
254 GetInput() = in;
255 GetOutput().clear();
256 84 }
257
258 84 bool ZorinDStrassenAlgMatrixTBB::ValidationImpl() {
259 const auto &in = GetInput();
260
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (in.n == 0) {
261 return false;
262 }
263
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (in.a.size() != in.n * in.n) {
264 return false;
265 }
266
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (in.b.size() != in.n * in.n) {
267 return false;
268 }
269
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 84 times.
84 if (!GetOutput().empty()) {
270 return false;
271 }
272 return true;
273 }
274
275 84 bool ZorinDStrassenAlgMatrixTBB::PreProcessingImpl() {
276 84 const auto n = GetInput().n;
277 84 GetOutput().assign(n * n, 0.0);
278 84 return true;
279 }
280
281 84 bool ZorinDStrassenAlgMatrixTBB::RunImpl() {
282 const auto &in = GetInput();
283
2/2
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 12 times.
84 const std::size_t n = in.n;
284 const std::size_t padded = NextPow2(n);
285
286 84 std::vector<double> a_pad(padded * padded, 0.0);
287
1/4
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
84 std::vector<double> b_pad(padded * padded, 0.0);
288
1/4
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
84 std::vector<double> c_pad(padded * padded, 0.0);
289
290
2/2
✓ Branch 0 taken 528 times.
✓ Branch 1 taken 84 times.
612 for (std::size_t i = 0; i < n; ++i) {
291 528 std::copy_n(in.a.data() + (i * n), n, a_pad.data() + (i * padded));
292 528 std::copy_n(in.b.data() + (i * n), n, b_pad.data() + (i * padded));
293 }
294
295 oneapi::tbb::global_control control(oneapi::tbb::global_control::max_allowed_parallelism,
296
3/8
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 84 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 84 times.
✗ Branch 8 not taken.
✗ Branch 9 not taken.
✗ Branch 10 not taken.
84 static_cast<std::size_t>(std::max(1, ppc::util::GetNumThreads())));
297 (void)control;
298
1/2
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
84 StrassenTopTbb(a_pad.data(), padded, b_pad.data(), padded, c_pad.data(), padded, padded);
299
300 auto &out = GetOutput();
301
2/2
✓ Branch 0 taken 528 times.
✓ Branch 1 taken 84 times.
612 for (std::size_t i = 0; i < n; ++i) {
302 528 std::copy_n(c_pad.data() + (i * padded), n, out.data() + (i * n));
303 }
304
305 84 return true;
306 }
307
308 84 bool ZorinDStrassenAlgMatrixTBB::PostProcessingImpl() {
309 84 return true;
310 }
311
312 } // namespace zorin_d_strassen_alg_matrix_seq
313