GCC Code Coverage Report


Directory: ./
File: tasks/muhammadkhon_i_stressen_alg/omp/src/ops_omp.cpp
Date: 2026-06-04 20:25:32
Exec Total Coverage
Lines: 58 131 44.3%
Functions: 7 11 63.6%
Branches: 37 130 28.5%

Line Branch Exec Source
1 #include "muhammadkhon_i_stressen_alg/omp/include/ops_omp.hpp"
2
3 #include <algorithm>
4 #include <cstddef>
5 #include <functional>
6 #include <vector>
7
8 #include "muhammadkhon_i_stressen_alg/common/include/common.hpp"
9
10 namespace muhammadkhon_i_stressen_alg {
11
12 namespace {
13
14 constexpr std::size_t kCutoff = 64;
15 constexpr std::size_t kBlockSize = 64;
16
17 std::size_t NextPow2(std::size_t x) {
18 24 if (x <= 1) {
19 return 1;
20 }
21 std::size_t p = 1;
22
2/2
✓ Branch 0 taken 96 times.
✓ Branch 1 taken 24 times.
120 while (p < x) {
23 96 p <<= 1;
24 }
25 return p;
26 }
27
28 void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
29
2/2
✓ Branch 0 taken 672 times.
✓ Branch 1 taken 24 times.
696 for (std::size_t i = 0; i < n; ++i) {
30 672 std::fill_n(dst + (i * stride), n, 0.0);
31 }
32 }
33
34 void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
35 std::size_t n, double b_coeff) {
36 for (std::size_t i = 0; i < n; ++i) {
37 for (std::size_t j = 0; j < n; ++j) {
38 dst[(i * n) + j] = a[(i * a_stride) + j] + (b_coeff * b[(i * b_stride) + j]);
39 }
40 }
41 }
42
43 void MulIKJ(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
44 std::size_t c_stride, std::size_t ii, std::size_t i_end, std::size_t kk, std::size_t k_end, std::size_t jj,
45 std::size_t j_end) {
46
2/2
✓ Branch 0 taken 672 times.
✓ Branch 1 taken 24 times.
696 for (std::size_t i = ii; i < i_end; ++i) {
47 672 double *c_row = c + (i * c_stride);
48 672 const double *a_row = a + (i * a_stride);
49
2/2
✓ Branch 0 taken 34944 times.
✓ Branch 1 taken 672 times.
35616 for (std::size_t k = kk; k < k_end; ++k) {
50 34944 const double aik = a_row[k];
51 34944 const double *b_row = b + (k * b_stride);
52
2/2
✓ Branch 0 taken 2130432 times.
✓ Branch 1 taken 34944 times.
2165376 for (std::size_t j = jj; j < j_end; ++j) {
53 2130432 c_row[j] += aik * b_row[j];
54 }
55 }
56 }
57 }
58
59 24 void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
60 std::size_t c_stride, std::size_t n) {
61 24 ZeroMatrix(c, c_stride, n);
62
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.
48 for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
63 24 const std::size_t i_end = std::min(ii + kBlockSize, n);
64
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.
48 for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
65 24 const std::size_t k_end = std::min(kk + kBlockSize, n);
66
2/2
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.
48 for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
67 24 const std::size_t j_end = std::min(jj + kBlockSize, n);
68 MulIKJ(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
69 }
70 }
71 }
72 24 }
73
74 void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
75 const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
76 const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
77 for (std::size_t i = 0; i < half; ++i) {
78 double *c11 = c + (i * c_stride);
79 double *c12 = c11 + half;
80 double *c21 = c + ((i + half) * c_stride);
81 double *c22 = c21 + half;
82 for (std::size_t j = 0; j < half; ++j) {
83 const std::size_t idx = (i * half) + j;
84 c11[j] = m1[idx] + m4[idx] - m5[idx] + m7[idx];
85 c12[j] = m3[idx] + m5[idx];
86 c21[j] = m2[idx] + m4[idx];
87 c22[j] = m1[idx] - m2[idx] + m3[idx] + m6[idx];
88 }
89 }
90 }
91
92 void StrassenSeq(const double *a_in, std::size_t a_stride_in, const double *b_in, std::size_t b_stride_in, double *c_in,
93 std::size_t c_stride_in, std::size_t n_in) {
94 std::function<void(const double *, std::size_t, const double *, std::size_t, double *, std::size_t, std::size_t)>
95 impl = [&](const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
96 std::size_t c_stride, std::size_t n) {
97 if (n <= kCutoff) {
98 NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
99 return;
100 }
101 const std::size_t half = n / 2;
102
103 const double *a11 = a;
104 const double *a12 = a + half;
105 const double *a21 = a + (half * a_stride);
106 const double *a22 = a21 + half;
107 const double *b11 = b;
108 const double *b12 = b + half;
109 const double *b21 = b + (half * b_stride);
110 const double *b22 = b21 + half;
111
112 std::vector<double> lhs(half * half);
113 std::vector<double> rhs(half * half);
114 std::vector<double> m1(half * half);
115 std::vector<double> m2(half * half);
116 std::vector<double> m3(half * half);
117 std::vector<double> m4(half * half);
118 std::vector<double> m5(half * half);
119 std::vector<double> m6(half * half);
120 std::vector<double> m7(half * half);
121
122 // M1 = (A11+A22)(B11+B22)
123 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
124 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
125 impl(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
126
127 // M2 = (A21+A22)B11
128 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
129 impl(lhs.data(), half, b11, b_stride, m2.data(), half, half);
130
131 // M3 = A11(B12-B22)
132 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
133 impl(a11, a_stride, rhs.data(), half, m3.data(), half, half);
134
135 // M4 = A22(B21-B11)
136 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
137 impl(a22, a_stride, rhs.data(), half, m4.data(), half, half);
138
139 // M5 = (A11+A12)B22
140 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
141 impl(lhs.data(), half, b22, b_stride, m5.data(), half, half);
142
143 // M6 = (A21-A11)(B11+B12)
144 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
145 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
146 impl(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
147
148 // M7 = (A12-A22)(B21+B22)
149 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
150 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
151 impl(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
152
153 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
154 };
155
156 impl(a_in, a_stride_in, b_in, b_stride_in, c_in, c_stride_in, n_in);
157 }
158
159 24 void StrassenTopOmp(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
160 std::size_t c_stride, std::size_t n) {
161
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
24 if (n <= kCutoff) {
162 24 NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
163 24 return;
164 }
165
166 const std::size_t half = n / 2;
167
168 const double *a11 = a;
169 const double *a12 = a + half;
170 const double *a21 = a + (half * a_stride);
171 const double *a22 = a21 + half;
172 const double *b11 = b;
173 const double *b12 = b + half;
174 const double *b21 = b + (half * b_stride);
175 const double *b22 = b21 + half;
176
177 std::vector<double> m1(half * half);
178 std::vector<double> m2(half * half);
179 std::vector<double> m3(half * half);
180 std::vector<double> m4(half * half);
181 std::vector<double> m5(half * half);
182 std::vector<double> m6(half * half);
183 std::vector<double> m7(half * half);
184
185 #pragma omp parallel default(none) \
186 shared(m1, m2, m3, m4, m5, m6, m7, a11, a12, a21, a22, b11, b12, b21, b22, a_stride, b_stride, half)
187 {
188 #pragma omp single nowait
189 {
190 // M1 = (A11+A22)(B11+B22)
191 #pragma omp task default(none) shared(m1, a11, a22, b11, b22, a_stride, b_stride, half)
192 {
193 std::vector<double> lhs(half * half);
194 std::vector<double> rhs(half * half);
195 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
196 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
197 StrassenSeq(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
198 }
199 // M2 = (A21+A22)B11
200 #pragma omp task default(none) shared(m2, a21, a22, b11, a_stride, b_stride, half)
201 {
202 std::vector<double> lhs(half * half);
203 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
204 StrassenSeq(lhs.data(), half, b11, b_stride, m2.data(), half, half);
205 }
206 // M3 = A11(B12-B22)
207 #pragma omp task default(none) shared(m3, a11, b12, b22, a_stride, b_stride, half)
208 {
209 std::vector<double> rhs(half * half);
210 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
211 StrassenSeq(a11, a_stride, rhs.data(), half, m3.data(), half, half);
212 }
213 // M4 = A22(B21-B11)
214 #pragma omp task default(none) shared(m4, a22, b21, b11, a_stride, b_stride, half)
215 {
216 std::vector<double> rhs(half * half);
217 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
218 StrassenSeq(a22, a_stride, rhs.data(), half, m4.data(), half, half);
219 }
220 // M5 = (A11+A12)B22
221 #pragma omp task default(none) shared(m5, a11, a12, b22, a_stride, b_stride, half)
222 {
223 std::vector<double> lhs(half * half);
224 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
225 StrassenSeq(lhs.data(), half, b22, b_stride, m5.data(), half, half);
226 }
227 // M6 = (A21-A11)(B11+B12)
228 #pragma omp task default(none) shared(m6, a21, a11, b11, b12, a_stride, b_stride, half)
229 {
230 std::vector<double> lhs(half * half);
231 std::vector<double> rhs(half * half);
232 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
233 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
234 StrassenSeq(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
235 }
236 // M7 = (A12-A22)(B21+B22)
237 #pragma omp task default(none) shared(m7, a12, a22, b21, b22, a_stride, b_stride, half)
238 {
239 std::vector<double> lhs(half * half);
240 std::vector<double> rhs(half * half);
241 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
242 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
243 StrassenSeq(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
244 }
245 #pragma omp taskwait
246 }
247 }
248
249 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
250 }
251
252 } // namespace
253
254
1/2
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
24 MuhammadkhonIStressenAlgOMP::MuhammadkhonIStressenAlgOMP(const InType &in) {
255 SetTypeOfTask(GetStaticTypeOfTask());
256
1/2
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
24 GetInput() = in;
257 GetOutput() = {};
258 24 }
259
260 24 bool MuhammadkhonIStressenAlgOMP::ValidationImpl() {
261 const auto &in = GetInput();
262
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 24 times.
24 return in.a_rows > 0 && in.a_cols_b_rows > 0 && in.b_cols > 0 &&
263
2/4
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 24 times.
48 in.a.size() == static_cast<size_t>(in.a_rows * in.a_cols_b_rows) &&
264
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
24 in.b.size() == static_cast<size_t>(in.a_cols_b_rows * in.b_cols);
265 }
266
267 24 bool MuhammadkhonIStressenAlgOMP::PreProcessingImpl() {
268 GetOutput() = {};
269 const auto &in = GetInput();
270 24 a_rows_ = in.a_rows;
271 24 a_cols_b_rows_ = in.a_cols_b_rows;
272 24 b_cols_ = in.b_cols;
273
274
1/2
✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
48 size_t max_dim = std::max({a_rows_, a_cols_b_rows_, b_cols_});
275 24 padded_n_ = NextPow2(max_dim);
276
277 24 padded_a_.assign(padded_n_ * padded_n_, 0.0);
278 24 padded_b_.assign(padded_n_ * padded_n_, 0.0);
279
280
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 24 times.
684 for (size_t i = 0; i < a_rows_; ++i) {
281
2/2
✓ Branch 0 taken 33684 times.
✓ Branch 1 taken 660 times.
34344 for (size_t j = 0; j < a_cols_b_rows_; ++j) {
282 33684 padded_a_[(i * padded_n_) + j] = in.a[(i * a_cols_b_rows_) + j];
283 }
284 }
285
286
2/2
✓ Branch 0 taken 620 times.
✓ Branch 1 taken 24 times.
644 for (size_t i = 0; i < a_cols_b_rows_; ++i) {
287
2/2
✓ Branch 0 taken 33684 times.
✓ Branch 1 taken 620 times.
34304 for (size_t j = 0; j < b_cols_; ++j) {
288 33684 padded_b_[(i * padded_n_) + j] = in.b[(i * b_cols_) + j];
289 }
290 }
291
292 24 return true;
293 }
294
295 24 bool MuhammadkhonIStressenAlgOMP::RunImpl() {
296 24 result_c_.assign(padded_n_ * padded_n_, 0.0);
297
298 24 StrassenTopOmp(padded_a_.data(), padded_n_, padded_b_.data(), padded_n_, result_c_.data(), padded_n_, padded_n_);
299
300 auto &out = GetOutput();
301 24 out.assign(a_rows_ * b_cols_, 0.0);
302
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 24 times.
684 for (size_t i = 0; i < a_rows_; ++i) {
303
2/2
✓ Branch 0 taken 34284 times.
✓ Branch 1 taken 660 times.
34944 for (size_t j = 0; j < b_cols_; ++j) {
304 34284 out[(i * b_cols_) + j] = result_c_[(i * padded_n_) + j];
305 }
306 }
307
308 24 return true;
309 }
310
311 24 bool MuhammadkhonIStressenAlgOMP::PostProcessingImpl() {
312 24 return true;
313 }
314
315 } // namespace muhammadkhon_i_stressen_alg
316