GCC Code Coverage Report

Directory:	./
File:	tasks/muhammadkhon_i_stressen_alg/omp/src/ops_omp.cpp
Date:	2026-06-04 20:25:32

	Exec	Total	Coverage
Lines:	58	131	44.3%
Functions:	7	11	63.6%
Branches:	37	130	28.5%

  
      Line
      Branch
      Exec
      Source
    
      #include "muhammadkhon_i_stressen_alg/omp/include/ops_omp.hpp"
    
      #include <algorithm>
    
      #include <cstddef>
    
      #include <functional>
    
      #include <vector>
    
      #include "muhammadkhon_i_stressen_alg/common/include/common.hpp"
    
      namespace muhammadkhon_i_stressen_alg {
    
      namespace {
    
      constexpr std::size_t kCutoff = 64;
    
      constexpr std::size_t kBlockSize = 64;
    
      std::size_t NextPow2(std::size_t x) {
    
      24
        if (x <= 1) {
    
          return 1;
    
        }
    
        std::size_t p = 1;
    
        2/2✓ Branch 0 taken 96 times.
✓ Branch 1 taken 24 times.

      120
        while (p < x) {
    
      96
          p <<= 1;
    
        }
    
        return p;
    
      }
    
      void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
    
        2/2✓ Branch 0 taken 672 times.
✓ Branch 1 taken 24 times.

      696
        for (std::size_t i = 0; i < n; ++i) {
    
      672
          std::fill_n(dst + (i * stride), n, 0.0);
    
        }
    
      }
    
      ✗
      void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
    
                       std::size_t n, double b_coeff) {
    
      ✗
        for (std::size_t i = 0; i < n; ++i) {
    
      ✗
          for (std::size_t j = 0; j < n; ++j) {
    
      ✗
            dst[(i * n) + j] = a[(i * a_stride) + j] + (b_coeff * b[(i * b_stride) + j]);
    
          }
    
        }
    
      ✗
      }
    
      void MulIKJ(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                  std::size_t c_stride, std::size_t ii, std::size_t i_end, std::size_t kk, std::size_t k_end, std::size_t jj,
    
                  std::size_t j_end) {
    
        2/2✓ Branch 0 taken 672 times.
✓ Branch 1 taken 24 times.

      696
        for (std::size_t i = ii; i < i_end; ++i) {
    
      672
          double *c_row = c + (i * c_stride);
    
      672
          const double *a_row = a + (i * a_stride);
    
        2/2✓ Branch 0 taken 34944 times.
✓ Branch 1 taken 672 times.

      35616
          for (std::size_t k = kk; k < k_end; ++k) {
    
      34944
            const double aik = a_row[k];
    
      34944
            const double *b_row = b + (k * b_stride);
    
        2/2✓ Branch 0 taken 2130432 times.
✓ Branch 1 taken 34944 times.

      2165376
            for (std::size_t j = jj; j < j_end; ++j) {
    
      2130432
              c_row[j] += aik * b_row[j];
    
            }
    
          }
    
        }
    
      }
    
      24
      void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                           std::size_t c_stride, std::size_t n) {
    
      24
        ZeroMatrix(c, c_stride, n);
    
        2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.

      48
        for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
    
      24
          const std::size_t i_end = std::min(ii + kBlockSize, n);
    
        2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.

      48
          for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
    
      24
            const std::size_t k_end = std::min(kk + kBlockSize, n);
    
        2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 24 times.

      48
            for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
    
      24
              const std::size_t j_end = std::min(jj + kBlockSize, n);
    
              MulIKJ(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
    
            }
    
          }
    
        }
    
      24
      }
    
      ✗
      void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
    
                            const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
    
                            const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
    
      ✗
        for (std::size_t i = 0; i < half; ++i) {
    
      ✗
          double *c11 = c + (i * c_stride);
    
          double *c12 = c11 + half;
    
      ✗
          double *c21 = c + ((i + half) * c_stride);
    
          double *c22 = c21 + half;
    
      ✗
          for (std::size_t j = 0; j < half; ++j) {
    
      ✗
            const std::size_t idx = (i * half) + j;
    
      ✗
            c11[j] = m1[idx] + m4[idx] - m5[idx] + m7[idx];
    
      ✗
            c12[j] = m3[idx] + m5[idx];
    
      ✗
            c21[j] = m2[idx] + m4[idx];
    
      ✗
            c22[j] = m1[idx] - m2[idx] + m3[idx] + m6[idx];
    
          }
    
        }
    
      ✗
      }
    
      ✗
      void StrassenSeq(const double *a_in, std::size_t a_stride_in, const double *b_in, std::size_t b_stride_in, double *c_in,
    
                       std::size_t c_stride_in, std::size_t n_in) {
    
        std::function<void(const double *, std::size_t, const double *, std::size_t, double *, std::size_t, std::size_t)>
    
      ✗
            impl = [&](const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                       std::size_t c_stride, std::size_t n) {
    
      ✗
          if (n <= kCutoff) {
    
      ✗
            NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
    
      ✗
            return;
    
          }
    
      ✗
          const std::size_t half = n / 2;
    
          const double *a11 = a;
    
      ✗
          const double *a12 = a + half;
    
      ✗
          const double *a21 = a + (half * a_stride);
    
      ✗
          const double *a22 = a21 + half;
    
          const double *b11 = b;
    
      ✗
          const double *b12 = b + half;
    
      ✗
          const double *b21 = b + (half * b_stride);
    
      ✗
          const double *b22 = b21 + half;
    
      ✗
          std::vector<double> lhs(half * half);
    
      ✗
          std::vector<double> rhs(half * half);
    
      ✗
          std::vector<double> m1(half * half);
    
      ✗
          std::vector<double> m2(half * half);
    
      ✗
          std::vector<double> m3(half * half);
    
      ✗
          std::vector<double> m4(half * half);
    
      ✗
          std::vector<double> m5(half * half);
    
      ✗
          std::vector<double> m6(half * half);
    
      ✗
          std::vector<double> m7(half * half);
    
          // M1 = (A11+A22)(B11+B22)
    
      ✗
          AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
    
      ✗
          AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
    
      ✗
          impl(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
    
          // M2 = (A21+A22)B11
    
      ✗
          AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
    
      ✗
          impl(lhs.data(), half, b11, b_stride, m2.data(), half, half);
    
          // M3 = A11(B12-B22)
    
      ✗
          AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
    
      ✗
          impl(a11, a_stride, rhs.data(), half, m3.data(), half, half);
    
          // M4 = A22(B21-B11)
    
      ✗
          AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
    
      ✗
          impl(a22, a_stride, rhs.data(), half, m4.data(), half, half);
    
          // M5 = (A11+A12)B22
    
      ✗
          AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
    
      ✗
          impl(lhs.data(), half, b22, b_stride, m5.data(), half, half);
    
          // M6 = (A21-A11)(B11+B12)
    
      ✗
          AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
    
      ✗
          AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
    
      ✗
          impl(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
    
          // M7 = (A12-A22)(B21+B22)
    
      ✗
          AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
    
      ✗
          AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
    
      ✗
          impl(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
    
      ✗
          CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
    
        };
    
      ✗
        impl(a_in, a_stride_in, b_in, b_stride_in, c_in, c_stride_in, n_in);
    
      ✗
      }
    
      24
      void StrassenTopOmp(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                          std::size_t c_stride, std::size_t n) {
    
        1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.

      24
        if (n <= kCutoff) {
    
      24
          NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
    
      24
          return;
    
        }
    
      ✗
        const std::size_t half = n / 2;
    
        const double *a11 = a;
    
      ✗
        const double *a12 = a + half;
    
      ✗
        const double *a21 = a + (half * a_stride);
    
      ✗
        const double *a22 = a21 + half;
    
        const double *b11 = b;
    
      ✗
        const double *b12 = b + half;
    
      ✗
        const double *b21 = b + (half * b_stride);
    
      ✗
        const double *b22 = b21 + half;
    
      ✗
        std::vector<double> m1(half * half);
    
      ✗
        std::vector<double> m2(half * half);
    
      ✗
        std::vector<double> m3(half * half);
    
      ✗
        std::vector<double> m4(half * half);
    
      ✗
        std::vector<double> m5(half * half);
    
      ✗
        std::vector<double> m6(half * half);
    
      ✗
        std::vector<double> m7(half * half);
    
      ✗
      #pragma omp parallel default(none) \
    
          shared(m1, m2, m3, m4, m5, m6, m7, a11, a12, a21, a22, b11, b12, b21, b22, a_stride, b_stride, half)
    
        {
    
      #pragma omp single nowait
    
          {
    
      // M1 = (A11+A22)(B11+B22)
    
      #pragma omp task default(none) shared(m1, a11, a22, b11, b22, a_stride, b_stride, half)
    
            {
    
              std::vector<double> lhs(half * half);
    
              std::vector<double> rhs(half * half);
    
              AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
    
              AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
    
              StrassenSeq(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
    
            }
    
      // M2 = (A21+A22)B11
    
      #pragma omp task default(none) shared(m2, a21, a22, b11, a_stride, b_stride, half)
    
            {
    
              std::vector<double> lhs(half * half);
    
              AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
    
              StrassenSeq(lhs.data(), half, b11, b_stride, m2.data(), half, half);
    
            }
    
      // M3 = A11(B12-B22)
    
      #pragma omp task default(none) shared(m3, a11, b12, b22, a_stride, b_stride, half)
    
            {
    
              std::vector<double> rhs(half * half);
    
              AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
    
              StrassenSeq(a11, a_stride, rhs.data(), half, m3.data(), half, half);
    
            }
    
      // M4 = A22(B21-B11)
    
      #pragma omp task default(none) shared(m4, a22, b21, b11, a_stride, b_stride, half)
    
            {
    
              std::vector<double> rhs(half * half);
    
              AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
    
              StrassenSeq(a22, a_stride, rhs.data(), half, m4.data(), half, half);
    
            }
    
      // M5 = (A11+A12)B22
    
      #pragma omp task default(none) shared(m5, a11, a12, b22, a_stride, b_stride, half)
    
            {
    
              std::vector<double> lhs(half * half);
    
              AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
    
              StrassenSeq(lhs.data(), half, b22, b_stride, m5.data(), half, half);
    
            }
    
      // M6 = (A21-A11)(B11+B12)
    
      #pragma omp task default(none) shared(m6, a21, a11, b11, b12, a_stride, b_stride, half)
    
            {
    
              std::vector<double> lhs(half * half);
    
              std::vector<double> rhs(half * half);
    
              AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
    
              AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
    
              StrassenSeq(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
    
            }
    
      // M7 = (A12-A22)(B21+B22)
    
      #pragma omp task default(none) shared(m7, a12, a22, b21, b22, a_stride, b_stride, half)
    
            {
    
              std::vector<double> lhs(half * half);
    
              std::vector<double> rhs(half * half);
    
              AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
    
              AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
    
              StrassenSeq(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
    
            }
    
      #pragma omp taskwait
    
          }
    
        }
    
      ✗
        CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
    
      }
    
      }  // namespace
    
        1/2✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.

      24
      MuhammadkhonIStressenAlgOMP::MuhammadkhonIStressenAlgOMP(const InType &in) {
    
        SetTypeOfTask(GetStaticTypeOfTask());
    
        1/2✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.

      24
        GetInput() = in;
    
        GetOutput() = {};
    
      24
      }
    
      24
      bool MuhammadkhonIStressenAlgOMP::ValidationImpl() {
    
        const auto &in = GetInput();
    
        2/4✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 24 times.

      24
        return in.a_rows > 0 && in.a_cols_b_rows > 0 && in.b_cols > 0 &&
    
        2/4✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 24 times.

      48
               in.a.size() == static_cast<size_t>(in.a_rows * in.a_cols_b_rows) &&
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.

      24
               in.b.size() == static_cast<size_t>(in.a_cols_b_rows * in.b_cols);
    
      }
    
      24
      bool MuhammadkhonIStressenAlgOMP::PreProcessingImpl() {
    
        GetOutput() = {};
    
        const auto &in = GetInput();
    
      24
        a_rows_ = in.a_rows;
    
      24
        a_cols_b_rows_ = in.a_cols_b_rows;
    
      24
        b_cols_ = in.b_cols;
    
        1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.

      48
        size_t max_dim = std::max({a_rows_, a_cols_b_rows_, b_cols_});
    
      24
        padded_n_ = NextPow2(max_dim);
    
      24
        padded_a_.assign(padded_n_ * padded_n_, 0.0);
    
      24
        padded_b_.assign(padded_n_ * padded_n_, 0.0);
    
        2/2✓ Branch 0 taken 660 times.
✓ Branch 1 taken 24 times.

      684
        for (size_t i = 0; i < a_rows_; ++i) {
    
        2/2✓ Branch 0 taken 33684 times.
✓ Branch 1 taken 660 times.

      34344
          for (size_t j = 0; j < a_cols_b_rows_; ++j) {
    
      33684
            padded_a_[(i * padded_n_) + j] = in.a[(i * a_cols_b_rows_) + j];
    
          }
    
        }
    
        2/2✓ Branch 0 taken 620 times.
✓ Branch 1 taken 24 times.

      644
        for (size_t i = 0; i < a_cols_b_rows_; ++i) {
    
        2/2✓ Branch 0 taken 33684 times.
✓ Branch 1 taken 620 times.

      34304
          for (size_t j = 0; j < b_cols_; ++j) {
    
      33684
            padded_b_[(i * padded_n_) + j] = in.b[(i * b_cols_) + j];
    
          }
    
        }
    
      24
        return true;
    
      }
    
      24
      bool MuhammadkhonIStressenAlgOMP::RunImpl() {
    
      24
        result_c_.assign(padded_n_ * padded_n_, 0.0);
    
      24
        StrassenTopOmp(padded_a_.data(), padded_n_, padded_b_.data(), padded_n_, result_c_.data(), padded_n_, padded_n_);
    
        auto &out = GetOutput();
    
      24
        out.assign(a_rows_ * b_cols_, 0.0);
    
        2/2✓ Branch 0 taken 660 times.
✓ Branch 1 taken 24 times.

      684
        for (size_t i = 0; i < a_rows_; ++i) {
    
        2/2✓ Branch 0 taken 34284 times.
✓ Branch 1 taken 660 times.

      34944
          for (size_t j = 0; j < b_cols_; ++j) {
    
      34284
            out[(i * b_cols_) + j] = result_c_[(i * padded_n_) + j];
    
          }
    
        }
    
      24
        return true;
    
      }
    
      24
      bool MuhammadkhonIStressenAlgOMP::PostProcessingImpl() {
    
      24
        return true;
    
      }
    
      }  // namespace muhammadkhon_i_stressen_alg

Line	Branch	Exec	Source
1			#include "muhammadkhon_i_stressen_alg/omp/include/ops_omp.hpp"
2
3			#include <algorithm>
4			#include <cstddef>
5			#include <functional>
6			#include <vector>
7
8			#include "muhammadkhon_i_stressen_alg/common/include/common.hpp"
9
10			namespace muhammadkhon_i_stressen_alg {
11
12			namespace {
13
14			constexpr std::size_t kCutoff = 64;
15			constexpr std::size_t kBlockSize = 64;
16
17			std::size_t NextPow2(std::size_t x) {
18		24	if (x <= 1) {
19			return 1;
20			}
21			std::size_t p = 1;
22	2/2 ✓ Branch 0 taken 96 times. ✓ Branch 1 taken 24 times.	120	while (p < x) {
23		96	p <<= 1;
24			}
25			return p;
26			}
27
28			void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
29	2/2 ✓ Branch 0 taken 672 times. ✓ Branch 1 taken 24 times.	696	for (std::size_t i = 0; i < n; ++i) {
30		672	std::fill_n(dst + (i * stride), n, 0.0);
31			}
32			}
33
34		✗	void AddToBuffer(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *dst,
35			std::size_t n, double b_coeff) {
36		✗	for (std::size_t i = 0; i < n; ++i) {
37		✗	for (std::size_t j = 0; j < n; ++j) {
38		✗	dst[(i * n) + j] = a[(i * a_stride) + j] + (b_coeff * b[(i * b_stride) + j]);
39			}
40			}
41		✗	}
42
43			void MulIKJ(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
44			std::size_t c_stride, std::size_t ii, std::size_t i_end, std::size_t kk, std::size_t k_end, std::size_t jj,
45			std::size_t j_end) {
46	2/2 ✓ Branch 0 taken 672 times. ✓ Branch 1 taken 24 times.	696	for (std::size_t i = ii; i < i_end; ++i) {
47		672	double c_row = c + (i c_stride);
48		672	const double a_row = a + (i a_stride);
49	2/2 ✓ Branch 0 taken 34944 times. ✓ Branch 1 taken 672 times.	35616	for (std::size_t k = kk; k < k_end; ++k) {
50		34944	const double aik = a_row[k];
51		34944	const double b_row = b + (k b_stride);
52	2/2 ✓ Branch 0 taken 2130432 times. ✓ Branch 1 taken 34944 times.	2165376	for (std::size_t j = jj; j < j_end; ++j) {
53		2130432	c_row[j] += aik * b_row[j];
54			}
55			}
56			}
57			}
58
59		24	void NaiveMulBlocked(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
60			std::size_t c_stride, std::size_t n) {
61		24	ZeroMatrix(c, c_stride, n);
62	2/2 ✓ Branch 0 taken 24 times. ✓ Branch 1 taken 24 times.	48	for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
63		24	const std::size_t i_end = std::min(ii + kBlockSize, n);
64	2/2 ✓ Branch 0 taken 24 times. ✓ Branch 1 taken 24 times.	48	for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
65		24	const std::size_t k_end = std::min(kk + kBlockSize, n);
66	2/2 ✓ Branch 0 taken 24 times. ✓ Branch 1 taken 24 times.	48	for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
67		24	const std::size_t j_end = std::min(jj + kBlockSize, n);
68			MulIKJ(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
69			}
70			}
71			}
72		24	}
73
74		✗	void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
75			const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
76			const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
77		✗	for (std::size_t i = 0; i < half; ++i) {
78		✗	double c11 = c + (i c_stride);
79			double *c12 = c11 + half;
80		✗	double c21 = c + ((i + half) c_stride);
81			double *c22 = c21 + half;
82		✗	for (std::size_t j = 0; j < half; ++j) {
83		✗	const std::size_t idx = (i * half) + j;
84		✗	c11[j] = m1[idx] + m4[idx] - m5[idx] + m7[idx];
85		✗	c12[j] = m3[idx] + m5[idx];
86		✗	c21[j] = m2[idx] + m4[idx];
87		✗	c22[j] = m1[idx] - m2[idx] + m3[idx] + m6[idx];
88			}
89			}
90		✗	}
91
92		✗	void StrassenSeq(const double a_in, std::size_t a_stride_in, const double b_in, std::size_t b_stride_in, double *c_in,
93			std::size_t c_stride_in, std::size_t n_in) {
94			std::function<void(const double , std::size_t, const double , std::size_t, double *, std::size_t, std::size_t)>
95		✗	impl = [&](const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
96			std::size_t c_stride, std::size_t n) {
97		✗	if (n <= kCutoff) {
98		✗	NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
99		✗	return;
100			}
101		✗	const std::size_t half = n / 2;
102
103			const double *a11 = a;
104		✗	const double *a12 = a + half;
105		✗	const double a21 = a + (half a_stride);
106		✗	const double *a22 = a21 + half;
107			const double *b11 = b;
108		✗	const double *b12 = b + half;
109		✗	const double b21 = b + (half b_stride);
110		✗	const double *b22 = b21 + half;
111
112		✗	std::vector<double> lhs(half * half);
113		✗	std::vector<double> rhs(half * half);
114		✗	std::vector<double> m1(half * half);
115		✗	std::vector<double> m2(half * half);
116		✗	std::vector<double> m3(half * half);
117		✗	std::vector<double> m4(half * half);
118		✗	std::vector<double> m5(half * half);
119		✗	std::vector<double> m6(half * half);
120		✗	std::vector<double> m7(half * half);
121
122			// M1 = (A11+A22)(B11+B22)
123		✗	AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
124		✗	AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
125		✗	impl(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
126
127			// M2 = (A21+A22)B11
128		✗	AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
129		✗	impl(lhs.data(), half, b11, b_stride, m2.data(), half, half);
130
131			// M3 = A11(B12-B22)
132		✗	AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
133		✗	impl(a11, a_stride, rhs.data(), half, m3.data(), half, half);
134
135			// M4 = A22(B21-B11)
136		✗	AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
137		✗	impl(a22, a_stride, rhs.data(), half, m4.data(), half, half);
138
139			// M5 = (A11+A12)B22
140		✗	AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
141		✗	impl(lhs.data(), half, b22, b_stride, m5.data(), half, half);
142
143			// M6 = (A21-A11)(B11+B12)
144		✗	AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
145		✗	AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
146		✗	impl(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
147
148			// M7 = (A12-A22)(B21+B22)
149		✗	AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
150		✗	AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
151		✗	impl(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
152
153		✗	CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
154			};
155
156		✗	impl(a_in, a_stride_in, b_in, b_stride_in, c_in, c_stride_in, n_in);
157		✗	}
158
159		24	void StrassenTopOmp(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
160			std::size_t c_stride, std::size_t n) {
161	1/2 ✓ Branch 0 taken 24 times. ✗ Branch 1 not taken.	24	if (n <= kCutoff) {
162		24	NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
163		24	return;
164			}
165
166		✗	const std::size_t half = n / 2;
167
168			const double *a11 = a;
169		✗	const double *a12 = a + half;
170		✗	const double a21 = a + (half a_stride);
171		✗	const double *a22 = a21 + half;
172			const double *b11 = b;
173		✗	const double *b12 = b + half;
174		✗	const double b21 = b + (half b_stride);
175		✗	const double *b22 = b21 + half;
176
177		✗	std::vector<double> m1(half * half);
178		✗	std::vector<double> m2(half * half);
179		✗	std::vector<double> m3(half * half);
180		✗	std::vector<double> m4(half * half);
181		✗	std::vector<double> m5(half * half);
182		✗	std::vector<double> m6(half * half);
183		✗	std::vector<double> m7(half * half);
184
185		✗	#pragma omp parallel default(none) \
186			shared(m1, m2, m3, m4, m5, m6, m7, a11, a12, a21, a22, b11, b12, b21, b22, a_stride, b_stride, half)
187			{
188			#pragma omp single nowait
189			{
190			// M1 = (A11+A22)(B11+B22)
191			#pragma omp task default(none) shared(m1, a11, a22, b11, b22, a_stride, b_stride, half)
192			{
193			std::vector<double> lhs(half * half);
194			std::vector<double> rhs(half * half);
195			AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
196			AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
197			StrassenSeq(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
198			}
199			// M2 = (A21+A22)B11
200			#pragma omp task default(none) shared(m2, a21, a22, b11, a_stride, b_stride, half)
201			{
202			std::vector<double> lhs(half * half);
203			AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
204			StrassenSeq(lhs.data(), half, b11, b_stride, m2.data(), half, half);
205			}
206			// M3 = A11(B12-B22)
207			#pragma omp task default(none) shared(m3, a11, b12, b22, a_stride, b_stride, half)
208			{
209			std::vector<double> rhs(half * half);
210			AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
211			StrassenSeq(a11, a_stride, rhs.data(), half, m3.data(), half, half);
212			}
213			// M4 = A22(B21-B11)
214			#pragma omp task default(none) shared(m4, a22, b21, b11, a_stride, b_stride, half)
215			{
216			std::vector<double> rhs(half * half);
217			AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
218			StrassenSeq(a22, a_stride, rhs.data(), half, m4.data(), half, half);
219			}
220			// M5 = (A11+A12)B22
221			#pragma omp task default(none) shared(m5, a11, a12, b22, a_stride, b_stride, half)
222			{
223			std::vector<double> lhs(half * half);
224			AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
225			StrassenSeq(lhs.data(), half, b22, b_stride, m5.data(), half, half);
226			}
227			// M6 = (A21-A11)(B11+B12)
228			#pragma omp task default(none) shared(m6, a21, a11, b11, b12, a_stride, b_stride, half)
229			{
230			std::vector<double> lhs(half * half);
231			std::vector<double> rhs(half * half);
232			AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
233			AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
234			StrassenSeq(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
235			}
236			// M7 = (A12-A22)(B21+B22)
237			#pragma omp task default(none) shared(m7, a12, a22, b21, b22, a_stride, b_stride, half)
238			{
239			std::vector<double> lhs(half * half);
240			std::vector<double> rhs(half * half);
241			AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
242			AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
243			StrassenSeq(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
244			}
245			#pragma omp taskwait
246			}
247			}
248
249		✗	CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
250			}
251
252			} // namespace
253
254	1/2 ✓ Branch 1 taken 24 times. ✗ Branch 2 not taken.	24	MuhammadkhonIStressenAlgOMP::MuhammadkhonIStressenAlgOMP(const InType &in) {
255			SetTypeOfTask(GetStaticTypeOfTask());
256	1/2 ✓ Branch 1 taken 24 times. ✗ Branch 2 not taken.	24	GetInput() = in;
257			GetOutput() = {};
258		24	}
259
260		24	bool MuhammadkhonIStressenAlgOMP::ValidationImpl() {
261			const auto &in = GetInput();
262	2/4 ✗ Branch 0 not taken. ✓ Branch 1 taken 24 times. ✗ Branch 2 not taken. ✓ Branch 3 taken 24 times.	24	return in.a_rows > 0 && in.a_cols_b_rows > 0 && in.b_cols > 0 &&
263	2/4 ✓ Branch 0 taken 24 times. ✗ Branch 1 not taken. ✗ Branch 2 not taken. ✓ Branch 3 taken 24 times.	48	in.a.size() == static_cast<size_t>(in.a_rows * in.a_cols_b_rows) &&
264	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 24 times.	24	in.b.size() == static_cast<size_t>(in.a_cols_b_rows * in.b_cols);
265			}
266
267		24	bool MuhammadkhonIStressenAlgOMP::PreProcessingImpl() {
268			GetOutput() = {};
269			const auto &in = GetInput();
270		24	a_rows_ = in.a_rows;
271		24	a_cols_b_rows_ = in.a_cols_b_rows;
272		24	b_cols_ = in.b_cols;
273
274	1/2 ✓ Branch 0 taken 24 times. ✗ Branch 1 not taken.	48	size_t max_dim = std::max({a_rows_, a_cols_b_rows_, b_cols_});
275		24	padded_n_ = NextPow2(max_dim);
276
277		24	padded_a_.assign(padded_n_ * padded_n_, 0.0);
278		24	padded_b_.assign(padded_n_ * padded_n_, 0.0);
279
280	2/2 ✓ Branch 0 taken 660 times. ✓ Branch 1 taken 24 times.	684	for (size_t i = 0; i < a_rows_; ++i) {
281	2/2 ✓ Branch 0 taken 33684 times. ✓ Branch 1 taken 660 times.	34344	for (size_t j = 0; j < a_cols_b_rows_; ++j) {
282		33684	padded_a_[(i * padded_n_) + j] = in.a[(i * a_cols_b_rows_) + j];
283			}
284			}
285
286	2/2 ✓ Branch 0 taken 620 times. ✓ Branch 1 taken 24 times.	644	for (size_t i = 0; i < a_cols_b_rows_; ++i) {
287	2/2 ✓ Branch 0 taken 33684 times. ✓ Branch 1 taken 620 times.	34304	for (size_t j = 0; j < b_cols_; ++j) {
288		33684	padded_b_[(i * padded_n_) + j] = in.b[(i * b_cols_) + j];
289			}
290			}
291
292		24	return true;
293			}
294
295		24	bool MuhammadkhonIStressenAlgOMP::RunImpl() {
296		24	result_c_.assign(padded_n_ * padded_n_, 0.0);
297
298		24	StrassenTopOmp(padded_a_.data(), padded_n_, padded_b_.data(), padded_n_, result_c_.data(), padded_n_, padded_n_);
299
300			auto &out = GetOutput();
301		24	out.assign(a_rows_ * b_cols_, 0.0);
302	2/2 ✓ Branch 0 taken 660 times. ✓ Branch 1 taken 24 times.	684	for (size_t i = 0; i < a_rows_; ++i) {
303	2/2 ✓ Branch 0 taken 34284 times. ✓ Branch 1 taken 660 times.	34944	for (size_t j = 0; j < b_cols_; ++j) {
304		34284	out[(i * b_cols_) + j] = result_c_[(i * padded_n_) + j];
305			}
306			}
307
308		24	return true;
309			}
310
311		24	bool MuhammadkhonIStressenAlgOMP::PostProcessingImpl() {
312		24	return true;
313			}
314
315			} // namespace muhammadkhon_i_stressen_alg
316