GCC Code Coverage Report


Directory: ./
File: tasks/muhammadkhon_i_stressen_alg/tbb/src/ops_tbb.cpp
Date: 2026-06-04 20:25:32
Exec Total Coverage
Lines: 69 178 38.8%
Functions: 10 19 52.6%
Branches: 42 170 24.7%

Line Branch Exec Source
1 #include "muhammadkhon_i_stressen_alg/tbb/include/ops_tbb.hpp"
2
3 #include <oneapi/tbb/global_control.h>
4 #include <oneapi/tbb/parallel_invoke.h>
5
6 #include <algorithm>
7 #include <cstddef>
8 #include <functional>
9 #include <vector>
10
11 #include "muhammadkhon_i_stressen_alg/common/include/common.hpp"
12 #include "util/include/util.hpp"
13
14 namespace muhammadkhon_i_stressen_alg {
15
16 namespace {
17
18 constexpr std::size_t kCutoff = 64;
19 constexpr std::size_t kBlockSize = 64;
20
21 std::size_t NextPow2(std::size_t x) {
22 24 if (x <= 1) {
23 return 1;
24 }
25 std::size_t p = 1;
26
2/2
✓ Branch 0 taken 96 times.
✓ Branch 1 taken 24 times.
120 while (p < x) {
27 96 p <<= 1;
28 }
29 return p;
30 }
31
32 void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
33
2/2
✓ Branch 0 taken 672 times.
✓ Branch 1 taken 24 times.
696 for (std::size_t i = 0; i < n; ++i) {
34 672 std::fill_n(dst + (i * stride), n, 0.0);
35 }
36 }
37
38 void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
39 std::size_t n, double b_coeff) {
40 for (std::size_t i = 0; i < n; ++i) {
41 for (std::size_t j = 0; j < n; ++j) {
42 dst[(i * n) + j] = a[(i * a_stride) + j] + (b_coeff * b[(i * b_stride) + j]);
43 }
44 }
45 }
46
47 void MulMicroBlock(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
48 std::size_t c_stride, std::size_t i_begin, std::size_t i_end, std::size_t k_begin, std::size_t k_end,
49 std::size_t j_begin, std::size_t j_end) {
50
2/2
✓ Branch 0 taken 672 times.
✓ Branch 1 taken 24 times.
696 for (std::size_t i = i_begin; i < i_end; ++i) {
51 672 double *c_row = c + (i * c_stride);
52 672 const double *a_row = a + (i * a_stride);
53
2/2
✓ Branch 0 taken 34944 times.
✓ Branch 1 taken 672 times.
35616 for (std::size_t k = k_begin; k < k_end; ++k) {
54 34944 const double aik = a_row[k];
55 34944 const double *b_row = b + (k * b_stride);
56
2/2
✓ Branch 0 taken 2130432 times.
✓ Branch 1 taken 34944 times.
2165376 for (std::size_t j = j_begin; j < j_end; ++j) {
57 2130432 c_row[j] += aik * b_row[j];
58 }
59 }
60 }
61 }
62
63 24 void MulKBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
64 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end) {
65
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.
48 for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
66 24 const std::size_t k_end = std::min(kk + kBlockSize, n);
67
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.
48 for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
68 24 const std::size_t j_end = std::min(jj + kBlockSize, n);
69 MulMicroBlock(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
70 }
71 }
72 24 }
73
74 24 void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
75 std::size_t c_stride, std::size_t n) {
76 24 ZeroMatrix(c, c_stride, n);
77
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.
48 for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
78 24 const std::size_t i_end = std::min(ii + kBlockSize, n);
79 24 MulKBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end);
80 }
81 24 }
82
83 void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
84 const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
85 const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
86 for (std::size_t i = 0; i < half; ++i) {
87 double *c11 = c + (i * c_stride);
88 double *c12 = c11 + half;
89 double *c21 = c + ((i + half) * c_stride);
90 double *c22 = c21 + half;
91 for (std::size_t j = 0; j < half; ++j) {
92 const std::size_t idx = (i * half) + j;
93 c11[j] = m1[idx] + m4[idx] - m5[idx] + m7[idx];
94 c12[j] = m3[idx] + m5[idx];
95 c21[j] = m2[idx] + m4[idx];
96 c22[j] = m1[idx] - m2[idx] + m3[idx] + m6[idx];
97 }
98 }
99 }
100
101
1/2
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
24 void StrassenSeq(const double *a_in, std::size_t a_stride_in, const double *b_in, std::size_t b_stride_in, double *c_in,
102 std::size_t c_stride_in, std::size_t n_in) {
103 std::function<void(const double *, std::size_t, const double *, std::size_t, double *, std::size_t, std::size_t)>
104 24 impl = [&](const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
105 std::size_t c_stride, std::size_t n) {
106
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
24 if (n <= kCutoff) {
107 24 NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
108 24 return;
109 }
110 const std::size_t half = n / 2;
111
112 const double *a11 = a;
113 const double *a12 = a + half;
114 const double *a21 = a + (half * a_stride);
115 const double *a22 = a21 + half;
116 const double *b11 = b;
117 const double *b12 = b + half;
118 const double *b21 = b + (half * b_stride);
119 const double *b22 = b21 + half;
120
121 std::vector<double> lhs(half * half);
122 std::vector<double> rhs(half * half);
123 std::vector<double> m1(half * half);
124 std::vector<double> m2(half * half);
125 std::vector<double> m3(half * half);
126 std::vector<double> m4(half * half);
127 std::vector<double> m5(half * half);
128 std::vector<double> m6(half * half);
129 std::vector<double> m7(half * half);
130
131 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
132 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
133 impl(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
134
135 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
136 impl(lhs.data(), half, b11, b_stride, m2.data(), half, half);
137
138 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
139 impl(a11, a_stride, rhs.data(), half, m3.data(), half, half);
140
141 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
142 impl(a22, a_stride, rhs.data(), half, m4.data(), half, half);
143
144 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
145 impl(lhs.data(), half, b22, b_stride, m5.data(), half, half);
146
147 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
148 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
149 impl(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
150
151 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
152 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
153 impl(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
154
155 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
156 };
157
158
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
24 impl(a_in, a_stride_in, b_in, b_stride_in, c_in, c_stride_in, n_in);
159 24 }
160
161 24 void StrassenTopTbb(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
162 std::size_t c_stride, std::size_t n) {
163
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
24 if (n <= kCutoff || ppc::util::GetNumThreads() <= 1) {
164 24 StrassenSeq(a, a_stride, b, b_stride, c, c_stride, n);
165 24 return;
166 }
167
168 const std::size_t half = n / 2;
169
170 const double *a11 = a;
171 const double *a12 = a + half;
172 const double *a21 = a + (half * a_stride);
173 const double *a22 = a21 + half;
174 const double *b11 = b;
175 const double *b12 = b + half;
176 const double *b21 = b + (half * b_stride);
177 const double *b22 = b21 + half;
178
179 std::vector<double> m1;
180 std::vector<double> m2;
181 std::vector<double> m3;
182 std::vector<double> m4;
183 std::vector<double> m5;
184 std::vector<double> m6;
185 std::vector<double> m7;
186
187 oneapi::tbb::parallel_invoke([&] {
188 std::vector<double> lhs(half * half);
189 std::vector<double> rhs(half * half);
190 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
191 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
192 m1.assign(half * half, 0.0);
193 StrassenSeq(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
194 }, [&] {
195 std::vector<double> lhs(half * half);
196 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
197 m2.assign(half * half, 0.0);
198 StrassenSeq(lhs.data(), half, b11, b_stride, m2.data(), half, half);
199 }, [&] {
200 std::vector<double> rhs(half * half);
201 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
202 m3.assign(half * half, 0.0);
203 StrassenSeq(a11, a_stride, rhs.data(), half, m3.data(), half, half);
204 }, [&] {
205 std::vector<double> rhs(half * half);
206 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
207 m4.assign(half * half, 0.0);
208 StrassenSeq(a22, a_stride, rhs.data(), half, m4.data(), half, half);
209 }, [&] {
210 std::vector<double> lhs(half * half);
211 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
212 m5.assign(half * half, 0.0);
213 StrassenSeq(lhs.data(), half, b22, b_stride, m5.data(), half, half);
214 }, [&] {
215 std::vector<double> lhs(half * half);
216 std::vector<double> rhs(half * half);
217 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
218 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
219 m6.assign(half * half, 0.0);
220 StrassenSeq(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
221 }, [&] {
222 std::vector<double> lhs(half * half);
223 std::vector<double> rhs(half * half);
224 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
225 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
226 m7.assign(half * half, 0.0);
227 StrassenSeq(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
228 });
229
230 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
231 }
232
233 } // namespace
234
235
1/2
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
24 MuhammadkhonIStressenAlgTBB::MuhammadkhonIStressenAlgTBB(const InType &in) {
236 SetTypeOfTask(GetStaticTypeOfTask());
237
1/2
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
24 GetInput() = in;
238 GetOutput() = {};
239 24 }
240
241 24 bool MuhammadkhonIStressenAlgTBB::ValidationImpl() {
242 const auto &in = GetInput();
243
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 24 times.
24 return in.a_rows > 0 && in.a_cols_b_rows > 0 && in.b_cols > 0 &&
244
2/4
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 24 times.
48 in.a.size() == static_cast<size_t>(in.a_rows * in.a_cols_b_rows) &&
245
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
24 in.b.size() == static_cast<size_t>(in.a_cols_b_rows * in.b_cols);
246 }
247
248 24 bool MuhammadkhonIStressenAlgTBB::PreProcessingImpl() {
249 GetOutput() = {};
250 const auto &in = GetInput();
251 24 a_rows_ = in.a_rows;
252 24 a_cols_b_rows_ = in.a_cols_b_rows;
253 24 b_cols_ = in.b_cols;
254
255
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
48 size_t max_dim = std::max({a_rows_, a_cols_b_rows_, b_cols_});
256 24 padded_n_ = NextPow2(max_dim);
257
258 24 padded_a_.assign(padded_n_ * padded_n_, 0.0);
259 24 padded_b_.assign(padded_n_ * padded_n_, 0.0);
260
261
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 24 times.
684 for (size_t i = 0; i < a_rows_; ++i) {
262
2/2
✓ Branch 0 taken 33684 times.
✓ Branch 1 taken 660 times.
34344 for (size_t j = 0; j < a_cols_b_rows_; ++j) {
263 33684 padded_a_[(i * padded_n_) + j] = in.a[(i * a_cols_b_rows_) + j];
264 }
265 }
266
267
2/2
✓ Branch 0 taken 620 times.
✓ Branch 1 taken 24 times.
644 for (size_t i = 0; i < a_cols_b_rows_; ++i) {
268
2/2
✓ Branch 0 taken 33684 times.
✓ Branch 1 taken 620 times.
34304 for (size_t j = 0; j < b_cols_; ++j) {
269 33684 padded_b_[(i * padded_n_) + j] = in.b[(i * b_cols_) + j];
270 }
271 }
272
273 24 return true;
274 }
275
276 24 bool MuhammadkhonIStressenAlgTBB::RunImpl() {
277 24 result_c_.assign(padded_n_ * padded_n_, 0.0);
278
279 oneapi::tbb::global_control control(oneapi::tbb::global_control::max_allowed_parallelism,
280 24 static_cast<std::size_t>(std::max(1, ppc::util::GetNumThreads())));
281 (void)control;
282
283
1/2
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
24 StrassenTopTbb(padded_a_.data(), padded_n_, padded_b_.data(), padded_n_, result_c_.data(), padded_n_, padded_n_);
284
285 auto &out = GetOutput();
286
1/2
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
24 out.assign(a_rows_ * b_cols_, 0.0);
287
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 24 times.
684 for (size_t i = 0; i < a_rows_; ++i) {
288
2/2
✓ Branch 0 taken 34284 times.
✓ Branch 1 taken 660 times.
34944 for (size_t j = 0; j < b_cols_; ++j) {
289 34284 out[(i * b_cols_) + j] = result_c_[(i * padded_n_) + j];
290 }
291 }
292
293 24 return true;
294 }
295
296 24 bool MuhammadkhonIStressenAlgTBB::PostProcessingImpl() {
297 24 return true;
298 }
299
300 } // namespace muhammadkhon_i_stressen_alg
301