GCC Code Coverage Report


Directory: ./
File: tasks/muhammadkhon_i_stressen_alg/stl/src/ops_stl.cpp
Date: 2026-06-04 20:25:32
Exec Total Coverage
Lines: 68 193 35.2%
Functions: 10 19 52.6%
Branches: 40 204 19.6%

Line Branch Exec Source
1 #include "muhammadkhon_i_stressen_alg/stl/include/ops_stl.hpp"
2
3 #include <algorithm>
4 #include <cstddef>
5 #include <functional>
6 #include <future>
7 #include <vector>
8
9 #include "muhammadkhon_i_stressen_alg/common/include/common.hpp"
10 #include "util/include/util.hpp"
11
12 namespace muhammadkhon_i_stressen_alg {
13
14 namespace {
15
16 constexpr std::size_t kCutoff = 64;
17 constexpr std::size_t kBlockSize = 64;
18
19 std::size_t NextPow2(std::size_t x) {
20 48 if (x <= 1) {
21 return 1;
22 }
23 std::size_t p = 1;
24
2/2
✓ Branch 0 taken 192 times.
✓ Branch 1 taken 48 times.
240 while (p < x) {
25 192 p <<= 1;
26 }
27 return p;
28 }
29
30 void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
31
2/2
✓ Branch 0 taken 1344 times.
✓ Branch 1 taken 48 times.
1392 for (std::size_t i = 0; i < n; ++i) {
32 1344 std::fill_n(dst + (i * stride), n, 0.0);
33 }
34 }
35
36 void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
37 std::size_t n, double b_coeff) {
38 for (std::size_t i = 0; i < n; ++i) {
39 const double *a_row = a + (i * a_stride);
40 const double *b_row = b + (i * b_stride);
41 double *dst_row = dst + (i * n);
42 for (std::size_t j = 0; j < n; ++j) {
43 dst_row[j] = a_row[j] + (b_coeff * b_row[j]);
44 }
45 }
46 }
47
48 void MulMicroBlock(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
49 std::size_t c_stride, std::size_t i_begin, std::size_t i_end, std::size_t k_begin, std::size_t k_end,
50 std::size_t j_begin, std::size_t j_end) {
51
2/2
✓ Branch 0 taken 1344 times.
✓ Branch 1 taken 48 times.
1392 for (std::size_t i = i_begin; i < i_end; ++i) {
52 1344 double *c_row = c + (i * c_stride);
53 1344 const double *a_row = a + (i * a_stride);
54
2/2
✓ Branch 0 taken 69888 times.
✓ Branch 1 taken 1344 times.
71232 for (std::size_t k = k_begin; k < k_end; ++k) {
55 69888 const double aik = a_row[k];
56 69888 const double *b_row = b + (k * b_stride);
57
2/2
✓ Branch 0 taken 4260864 times.
✓ Branch 1 taken 69888 times.
4330752 for (std::size_t j = j_begin; j < j_end; ++j) {
58 4260864 c_row[j] += aik * b_row[j];
59 }
60 }
61 }
62 }
63
64 48 void MulKBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
65 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end) {
66
2/2
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 48 times.
96 for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
67 48 const std::size_t k_end = std::min(kk + kBlockSize, n);
68
2/2
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 48 times.
96 for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
69 48 const std::size_t j_end = std::min(jj + kBlockSize, n);
70 MulMicroBlock(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
71 }
72 }
73 48 }
74
75 48 void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
76 std::size_t c_stride, std::size_t n) {
77 48 ZeroMatrix(c, c_stride, n);
78
2/2
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 48 times.
96 for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
79 48 const std::size_t i_end = std::min(ii + kBlockSize, n);
80 48 MulKBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end);
81 }
82 48 }
83
84 void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
85 const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
86 const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
87 for (std::size_t i = 0; i < half; ++i) {
88 double *c11 = c + (i * c_stride);
89 double *c12 = c11 + half;
90 double *c21 = c + ((i + half) * c_stride);
91 double *c22 = c21 + half;
92 const double *m1r = m1.data() + (i * half);
93 const double *m2r = m2.data() + (i * half);
94 const double *m3r = m3.data() + (i * half);
95 const double *m4r = m4.data() + (i * half);
96 const double *m5r = m5.data() + (i * half);
97 const double *m6r = m6.data() + (i * half);
98 const double *m7r = m7.data() + (i * half);
99 for (std::size_t j = 0; j < half; ++j) {
100 c11[j] = m1r[j] + m4r[j] - m5r[j] + m7r[j];
101 c12[j] = m3r[j] + m5r[j];
102 c21[j] = m2r[j] + m4r[j];
103 c22[j] = m1r[j] - m2r[j] + m3r[j] + m6r[j];
104 }
105 }
106 }
107
108
1/2
✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
48 void StrassenSeq(const double *a_in, std::size_t a_stride_in, const double *b_in, std::size_t b_stride_in, double *c_in,
109 std::size_t c_stride_in, std::size_t n_in) {
110 std::function<void(const double *, std::size_t, const double *, std::size_t, double *, std::size_t, std::size_t)>
111 48 impl = [&](const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
112 std::size_t c_stride, std::size_t n) {
113
1/2
✓ Branch 0 taken 48 times.
✗ Branch 1 not taken.
48 if (n <= kCutoff) {
114 48 NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
115 48 return;
116 }
117 const std::size_t half = n / 2;
118
119 const double *a11 = a;
120 const double *a12 = a + half;
121 const double *a21 = a + (half * a_stride);
122 const double *a22 = a21 + half;
123 const double *b11 = b;
124 const double *b12 = b + half;
125 const double *b21 = b + (half * b_stride);
126 const double *b22 = b21 + half;
127
128 std::vector<double> lhs(half * half);
129 std::vector<double> rhs(half * half);
130 std::vector<double> m1(half * half);
131 std::vector<double> m2(half * half);
132 std::vector<double> m3(half * half);
133 std::vector<double> m4(half * half);
134 std::vector<double> m5(half * half);
135 std::vector<double> m6(half * half);
136 std::vector<double> m7(half * half);
137
138 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
139 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
140 impl(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
141
142 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
143 impl(lhs.data(), half, b11, b_stride, m2.data(), half, half);
144
145 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
146 impl(a11, a_stride, rhs.data(), half, m3.data(), half, half);
147
148 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
149 impl(a22, a_stride, rhs.data(), half, m4.data(), half, half);
150
151 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
152 impl(lhs.data(), half, b22, b_stride, m5.data(), half, half);
153
154 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
155 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
156 impl(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
157
158 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
159 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
160 impl(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
161
162 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
163 };
164
165
1/2
✓ Branch 0 taken 48 times.
✗ Branch 1 not taken.
48 impl(a_in, a_stride_in, b_in, b_stride_in, c_in, c_stride_in, n_in);
166 48 }
167
168 48 void StrassenTopStl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
169 std::size_t c_stride, std::size_t n) {
170
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 48 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
48 if (n <= kCutoff || ppc::util::GetNumThreads() <= 1) {
171 48 StrassenSeq(a, a_stride, b, b_stride, c, c_stride, n);
172 48 return;
173 }
174
175 const std::size_t half = n / 2;
176
177 const double *a11 = a;
178 const double *a12 = a + half;
179 const double *a21 = a + (half * a_stride);
180 const double *a22 = a21 + half;
181 const double *b11 = b;
182 const double *b12 = b + half;
183 const double *b21 = b + (half * b_stride);
184 const double *b22 = b21 + half;
185
186 std::vector<double> m1;
187 std::vector<double> m2;
188 std::vector<double> m3;
189 std::vector<double> m4;
190 std::vector<double> m5;
191 std::vector<double> m6;
192 std::vector<double> m7;
193
194 auto f1 = std::async(std::launch::async, [&] {
195 std::vector<double> lhs(half * half);
196 std::vector<double> rhs(half * half);
197 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
198 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
199 m1.assign(half * half, 0.0);
200 StrassenSeq(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
201 });
202 auto f2 = std::async(std::launch::async, [&] {
203 std::vector<double> lhs(half * half);
204 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
205 m2.assign(half * half, 0.0);
206 StrassenSeq(lhs.data(), half, b11, b_stride, m2.data(), half, half);
207 });
208 auto f3 = std::async(std::launch::async, [&] {
209 std::vector<double> rhs(half * half);
210 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
211 m3.assign(half * half, 0.0);
212 StrassenSeq(a11, a_stride, rhs.data(), half, m3.data(), half, half);
213 });
214 auto f4 = std::async(std::launch::async, [&] {
215 std::vector<double> rhs(half * half);
216 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
217 m4.assign(half * half, 0.0);
218 StrassenSeq(a22, a_stride, rhs.data(), half, m4.data(), half, half);
219 });
220 auto f5 = std::async(std::launch::async, [&] {
221 std::vector<double> lhs(half * half);
222 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
223 m5.assign(half * half, 0.0);
224 StrassenSeq(lhs.data(), half, b22, b_stride, m5.data(), half, half);
225 });
226 auto f6 = std::async(std::launch::async, [&] {
227 std::vector<double> lhs(half * half);
228 std::vector<double> rhs(half * half);
229 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
230 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
231 m6.assign(half * half, 0.0);
232 StrassenSeq(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
233 });
234 auto f7 = std::async(std::launch::async, [&] {
235 std::vector<double> lhs(half * half);
236 std::vector<double> rhs(half * half);
237 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
238 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
239 m7.assign(half * half, 0.0);
240 StrassenSeq(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
241 });
242
243 f1.get();
244 f2.get();
245 f3.get();
246 f4.get();
247 f5.get();
248 f6.get();
249 f7.get();
250
251 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
252 }
253
254 } // namespace
255
256
1/2
✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
48 MuhammadkhonIStressenAlgSTL::MuhammadkhonIStressenAlgSTL(const InType &in) {
257 SetTypeOfTask(GetStaticTypeOfTask());
258
1/2
✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
48 GetInput() = in;
259 GetOutput() = {};
260 48 }
261
262 48 bool MuhammadkhonIStressenAlgSTL::ValidationImpl() {
263 const auto &in = GetInput();
264
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 48 times.
48 return in.a_rows > 0 && in.a_cols_b_rows > 0 && in.b_cols > 0 &&
265
2/4
✓ Branch 0 taken 48 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 48 times.
96 in.a.size() == static_cast<size_t>(in.a_rows * in.a_cols_b_rows) &&
266
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 48 times.
48 in.b.size() == static_cast<size_t>(in.a_cols_b_rows * in.b_cols);
267 }
268
269 48 bool MuhammadkhonIStressenAlgSTL::PreProcessingImpl() {
270 GetOutput() = {};
271 const auto &in = GetInput();
272 48 a_rows_ = in.a_rows;
273 48 a_cols_b_rows_ = in.a_cols_b_rows;
274 48 b_cols_ = in.b_cols;
275
276
1/2
✓ Branch 0 taken 48 times.
✗ Branch 1 not taken.
96 size_t max_dim = std::max({a_rows_, a_cols_b_rows_, b_cols_});
277 48 padded_n_ = NextPow2(max_dim);
278
279 48 padded_a_.assign(padded_n_ * padded_n_, 0.0);
280 48 padded_b_.assign(padded_n_ * padded_n_, 0.0);
281
282
2/2
✓ Branch 0 taken 1320 times.
✓ Branch 1 taken 48 times.
1368 for (size_t i = 0; i < a_rows_; ++i) {
283
2/2
✓ Branch 0 taken 67368 times.
✓ Branch 1 taken 1320 times.
68688 for (size_t j = 0; j < a_cols_b_rows_; ++j) {
284 67368 padded_a_[(i * padded_n_) + j] = in.a[(i * a_cols_b_rows_) + j];
285 }
286 }
287
288
2/2
✓ Branch 0 taken 1240 times.
✓ Branch 1 taken 48 times.
1288 for (size_t i = 0; i < a_cols_b_rows_; ++i) {
289
2/2
✓ Branch 0 taken 67368 times.
✓ Branch 1 taken 1240 times.
68608 for (size_t j = 0; j < b_cols_; ++j) {
290 67368 padded_b_[(i * padded_n_) + j] = in.b[(i * b_cols_) + j];
291 }
292 }
293
294 48 return true;
295 }
296
297 48 bool MuhammadkhonIStressenAlgSTL::RunImpl() {
298 48 result_c_.assign(padded_n_ * padded_n_, 0.0);
299
300 48 StrassenTopStl(padded_a_.data(), padded_n_, padded_b_.data(), padded_n_, result_c_.data(), padded_n_, padded_n_);
301
302 auto &out = GetOutput();
303 48 out.assign(a_rows_ * b_cols_, 0.0);
304
2/2
✓ Branch 0 taken 1320 times.
✓ Branch 1 taken 48 times.
1368 for (size_t i = 0; i < a_rows_; ++i) {
305
2/2
✓ Branch 0 taken 68568 times.
✓ Branch 1 taken 1320 times.
69888 for (size_t j = 0; j < b_cols_; ++j) {
306 68568 out[(i * b_cols_) + j] = result_c_[(i * padded_n_) + j];
307 }
308 }
309
310 48 return true;
311 }
312
313 48 bool MuhammadkhonIStressenAlgSTL::PostProcessingImpl() {
314 48 return true;
315 }
316
317 } // namespace muhammadkhon_i_stressen_alg
318