GCC Code Coverage Report


Directory: ./
File: tasks/zorin_d_strassen_alg_matrix_seq/omp/src/ops_omp.cpp
Date: 2026-04-02 17:12:27
Exec Total Coverage
Lines: 58 148 39.2%
Functions: 9 14 64.3%
Branches: 32 130 24.6%

Line Branch Exec Source
1 #include "zorin_d_strassen_alg_matrix_seq/omp/include/ops_omp.hpp"
2
3 #include <algorithm>
4 #include <cstddef>
5 #include <vector>
6
7 #include "util/include/util.hpp"
8 #include "zorin_d_strassen_alg_matrix_seq/common/include/common.hpp"
9
10 namespace zorin_d_strassen_alg_matrix_seq {
11
12 namespace {
13
14 constexpr std::size_t kCutoff = 256;
15 constexpr std::size_t kBlockSize = 64;
16
17 std::size_t NextPow2(std::size_t x) {
18 84 if (x <= 1) {
19 return 1;
20 }
21 std::size_t p = 1;
22
2/2
✓ Branch 0 taken 204 times.
✓ Branch 1 taken 72 times.
276 while (p < x) {
23 204 p <<= 1;
24 }
25 return p;
26 }
27
28 void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
29
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 84 times.
744 for (std::size_t i = 0; i < n; ++i) {
30 660 std::fill_n(dst + (i * stride), n, 0.0);
31 }
32 }
33
34 void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
35 std::size_t n, double b_coeff) {
36 for (std::size_t i = 0; i < n; ++i) {
37 const double *a_row = a + (i * a_stride);
38 const double *b_row = b + (i * b_stride);
39 double *dst_row = dst + (i * n);
40 for (std::size_t j = 0; j < n; ++j) {
41 dst_row[j] = a_row[j] + (b_coeff * b_row[j]);
42 }
43 }
44 }
45
46 void MulMicroBlock(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
47 std::size_t c_stride, std::size_t i_begin, std::size_t i_end, std::size_t k_begin, std::size_t k_end,
48 std::size_t j_begin, std::size_t j_end) {
49
2/2
✓ Branch 0 taken 660 times.
✓ Branch 1 taken 84 times.
744 for (std::size_t i = i_begin; i < i_end; ++i) {
50 660 double *c_row = c + (i * c_stride);
51 660 const double *a_row = a + (i * a_stride);
52
2/2
✓ Branch 0 taken 7932 times.
✓ Branch 1 taken 660 times.
8592 for (std::size_t k = k_begin; k < k_end; ++k) {
53 7932 const double aik = a_row[k];
54 7932 const double *b_row = b + (k * b_stride);
55
2/2
✓ Branch 0 taken 111468 times.
✓ Branch 1 taken 7932 times.
119400 for (std::size_t j = j_begin; j < j_end; ++j) {
56 111468 c_row[j] += aik * b_row[j];
57 }
58 }
59 }
60 }
61
62 84 void MulJBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
63 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end, std::size_t kk,
64 std::size_t k_end) {
65
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 84 times.
168 for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
66 84 const std::size_t j_end = std::min(jj + kBlockSize, n);
67 MulMicroBlock(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
68 }
69 84 }
70
71 void MulKBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
72 std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end) {
73
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 84 times.
168 for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
74 84 const std::size_t k_end = std::min(kk + kBlockSize, n);
75 84 MulJBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end, kk, k_end);
76 }
77 }
78
79 84 void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
80 std::size_t c_stride, std::size_t n) {
81 84 ZeroMatrix(c, c_stride, n);
82
83
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 84 times.
168 for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
84 84 const std::size_t i_end = std::min(ii + kBlockSize, n);
85 MulKBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end);
86 }
87 84 }
88
89 void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
90 const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
91 const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
92 for (std::size_t i = 0; i < half; ++i) {
93 double *c11 = c + (i * c_stride);
94 double *c12 = c11 + half;
95 double *c21 = c + ((i + half) * c_stride);
96 double *c22 = c21 + half;
97
98 const double *m1_row = m1.data() + (i * half);
99 const double *m2_row = m2.data() + (i * half);
100 const double *m3_row = m3.data() + (i * half);
101 const double *m4_row = m4.data() + (i * half);
102 const double *m5_row = m5.data() + (i * half);
103 const double *m6_row = m6.data() + (i * half);
104 const double *m7_row = m7.data() + (i * half);
105
106 for (std::size_t j = 0; j < half; ++j) {
107 c11[j] = m1_row[j] + m4_row[j] - m5_row[j] + m7_row[j];
108 c12[j] = m3_row[j] + m5_row[j];
109 c21[j] = m2_row[j] + m4_row[j];
110 c22[j] = m1_row[j] - m2_row[j] + m3_row[j] + m6_row[j];
111 }
112 }
113 }
114
115 using StrassenSeqFn = void (*)(const double *, std::size_t, const double *, std::size_t, double *, std::size_t,
116 std::size_t);
117
118 void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
119 std::size_t c_stride, std::size_t n);
120
121 constexpr StrassenSeqFn kStrassenSeqFn = &StrassenSeqImpl;
122
123 void ComputeProduct(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride, double a2_coeff,
124 const double *b1, std::size_t b1_stride, const double *b2, std::size_t b2_stride, double b2_coeff,
125 std::vector<double> &out, std::size_t n) {
126 std::vector<double> lhs(n * n);
127 std::vector<double> rhs(n * n);
128 AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
129 AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
130 out.assign(n * n, 0.0);
131 kStrassenSeqFn(lhs.data(), n, rhs.data(), n, out.data(), n, n);
132 }
133
134 void ComputeProductSingle(const double *a, std::size_t a_stride, const double *b1, std::size_t b1_stride,
135 const double *b2, std::size_t b2_stride, double b2_coeff, std::vector<double> &out,
136 std::size_t n) {
137 std::vector<double> rhs(n * n);
138 AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
139 out.assign(n * n, 0.0);
140 kStrassenSeqFn(a, a_stride, rhs.data(), n, out.data(), n, n);
141 }
142
143 void ComputeProductSingleLeft(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride,
144 double a2_coeff, const double *b, std::size_t b_stride, std::vector<double> &out,
145 std::size_t n) {
146 std::vector<double> lhs(n * n);
147 AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
148 out.assign(n * n, 0.0);
149 kStrassenSeqFn(lhs.data(), n, b, b_stride, out.data(), n, n);
150 }
151
152 84 void StrassenTopOmp(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
153 std::size_t c_stride, std::size_t n) {
154
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 84 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
84 if (n <= kCutoff || ppc::util::GetNumThreads() <= 1) {
155 84 kStrassenSeqFn(a, a_stride, b, b_stride, c, c_stride, n);
156 84 return;
157 }
158
159 const std::size_t half = n / 2;
160
161 const double *a11 = a;
162 const double *a12 = a + half;
163 const double *a21 = a + (half * a_stride);
164 const double *a22 = a21 + half;
165
166 const double *b11 = b;
167 const double *b12 = b + half;
168 const double *b21 = b + (half * b_stride);
169 const double *b22 = b21 + half;
170
171 std::vector<double> m1;
172 std::vector<double> m2;
173 std::vector<double> m3;
174 std::vector<double> m4;
175 std::vector<double> m5;
176 std::vector<double> m6;
177 std::vector<double> m7;
178
179 #pragma omp parallel default(none) \
180 shared(m1, m2, m3, m4, m5, m6, m7, a11, a12, a21, a22, b11, b12, b21, b22, a_stride, b_stride, half)
181 {
182 #pragma omp single nowait
183 {
184 #pragma omp task default(none) shared(m1, a11, a22, b11, b22, a_stride, b_stride, half)
185 ComputeProduct(a11, a_stride, a22, a_stride, 1.0, b11, b_stride, b22, b_stride, 1.0, m1, half);
186 #pragma omp task default(none) shared(m2, a21, a22, b11, a_stride, b_stride, half)
187 ComputeProductSingleLeft(a21, a_stride, a22, a_stride, 1.0, b11, b_stride, m2, half);
188 #pragma omp task default(none) shared(m3, a11, b12, b22, a_stride, b_stride, half)
189 ComputeProductSingle(a11, a_stride, b12, b_stride, b22, b_stride, -1.0, m3, half);
190 #pragma omp task default(none) shared(m4, a22, b21, b11, a_stride, b_stride, half)
191 ComputeProductSingle(a22, a_stride, b21, b_stride, b11, b_stride, -1.0, m4, half);
192 #pragma omp task default(none) shared(m5, a11, a12, b22, a_stride, b_stride, half)
193 ComputeProductSingleLeft(a11, a_stride, a12, a_stride, 1.0, b22, b_stride, m5, half);
194 #pragma omp task default(none) shared(m6, a21, a11, b11, b12, a_stride, b_stride, half)
195 ComputeProduct(a21, a_stride, a11, a_stride, -1.0, b11, b_stride, b12, b_stride, 1.0, m6, half);
196 #pragma omp task default(none) shared(m7, a12, a22, b21, b22, a_stride, b_stride, half)
197 ComputeProduct(a12, a_stride, a22, a_stride, -1.0, b21, b_stride, b22, b_stride, 1.0, m7, half);
198 #pragma omp taskwait
199 }
200 }
201
202 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
203 }
204
205 84 void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
206 std::size_t c_stride, std::size_t n) {
207
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (n <= kCutoff) {
208 84 NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
209 84 return;
210 }
211
212 const std::size_t half = n / 2;
213
214 const double *a11 = a;
215 const double *a12 = a + half;
216 const double *a21 = a + (half * a_stride);
217 const double *a22 = a21 + half;
218
219 const double *b11 = b;
220 const double *b12 = b + half;
221 const double *b21 = b + (half * b_stride);
222 const double *b22 = b21 + half;
223
224 std::vector<double> m1(half * half);
225 std::vector<double> m2(half * half);
226 std::vector<double> m3(half * half);
227 std::vector<double> m4(half * half);
228 std::vector<double> m5(half * half);
229 std::vector<double> m6(half * half);
230 std::vector<double> m7(half * half);
231 std::vector<double> lhs(half * half);
232 std::vector<double> rhs(half * half);
233
234 AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
235 AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
236 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
237
238 AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
239 kStrassenSeqFn(lhs.data(), half, b11, b_stride, m2.data(), half, half);
240
241 AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
242 kStrassenSeqFn(a11, a_stride, rhs.data(), half, m3.data(), half, half);
243
244 AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
245 kStrassenSeqFn(a22, a_stride, rhs.data(), half, m4.data(), half, half);
246
247 AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
248 kStrassenSeqFn(lhs.data(), half, b22, b_stride, m5.data(), half, half);
249
250 AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
251 AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
252 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
253
254 AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
255 AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
256 kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
257
258 CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
259 }
260
261 } // namespace
262
263
1/2
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
84 ZorinDStrassenAlgMatrixOMP::ZorinDStrassenAlgMatrixOMP(const InType &in) {
264 SetTypeOfTask(GetStaticTypeOfTask());
265 GetInput() = in;
266 GetOutput().clear();
267 84 }
268
269 84 bool ZorinDStrassenAlgMatrixOMP::ValidationImpl() {
270 const auto &in = GetInput();
271
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (in.n == 0) {
272 return false;
273 }
274
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (in.a.size() != in.n * in.n) {
275 return false;
276 }
277
1/2
✓ Branch 0 taken 84 times.
✗ Branch 1 not taken.
84 if (in.b.size() != in.n * in.n) {
278 return false;
279 }
280
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 84 times.
84 if (!GetOutput().empty()) {
281 return false;
282 }
283 return true;
284 }
285
286 84 bool ZorinDStrassenAlgMatrixOMP::PreProcessingImpl() {
287 84 const auto n = GetInput().n;
288 84 GetOutput().assign(n * n, 0.0);
289 84 return true;
290 }
291
292 84 bool ZorinDStrassenAlgMatrixOMP::RunImpl() {
293 const auto &in = GetInput();
294
2/2
✓ Branch 0 taken 72 times.
✓ Branch 1 taken 12 times.
84 const std::size_t n = in.n;
295 const std::size_t padded = NextPow2(n);
296
297 84 std::vector<double> a_pad(padded * padded, 0.0);
298
1/4
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
84 std::vector<double> b_pad(padded * padded, 0.0);
299
1/4
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
84 std::vector<double> c_pad(padded * padded, 0.0);
300
301
2/2
✓ Branch 0 taken 528 times.
✓ Branch 1 taken 84 times.
612 for (std::size_t i = 0; i < n; ++i) {
302 528 std::copy_n(in.a.data() + (i * n), n, a_pad.data() + (i * padded));
303 528 std::copy_n(in.b.data() + (i * n), n, b_pad.data() + (i * padded));
304 }
305
306
1/2
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
84 StrassenTopOmp(a_pad.data(), padded, b_pad.data(), padded, c_pad.data(), padded, padded);
307
308 auto &out = GetOutput();
309
2/2
✓ Branch 0 taken 528 times.
✓ Branch 1 taken 84 times.
612 for (std::size_t i = 0; i < n; ++i) {
310 528 std::copy_n(c_pad.data() + (i * padded), n, out.data() + (i * n));
311 }
312
313 84 return true;
314 }
315
316 84 bool ZorinDStrassenAlgMatrixOMP::PostProcessingImpl() {
317 84 return true;
318 }
319
320 } // namespace zorin_d_strassen_alg_matrix_seq
321