GCC Code Coverage Report

Directory:	./
File:	tasks/akhmetov_daniil_strassen_dense_double/tbb/src/ops_tbb.cpp
Date:	2026-06-04 20:25:32

	Exec	Total	Coverage
Lines:	46	164	28.0%
Functions:	7	18	38.9%
Branches:	22	154	14.3%

  
      Line
      Branch
      Exec
      Source
    
      #include "akhmetov_daniil_strassen_dense_double/tbb/include/ops_tbb.hpp"
    
      #include <oneapi/tbb/global_control.h>
    
      #include <oneapi/tbb/parallel_invoke.h>
    
      #include <algorithm>
    
      #include <cstddef>
    
      #include <vector>
    
      #include "akhmetov_daniil_strassen_dense_double/common/include/common.hpp"
    
      #include "util/include/util.hpp"
    
      namespace akhmetov_daniil_strassen_dense_double {
    
      namespace {
    
      constexpr std::size_t kCutoff = 256;
    
      constexpr std::size_t kBlockSize = 64;
    
      std::size_t NextPow2(std::size_t x) {
    
        if (x <= 1) {
    
          return 1;
    
        }
    
        std::size_t p = 1;
    
      ✗
        while (p < x) {
    
      ✗
          p <<= 1;
    
        }
    
        return p;
    
      }
    
      void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
    
        2/2✓ Branch 0 taken 1792 times.
✓ Branch 1 taken 12 times.

      1804
        for (std::size_t i = 0; i < n; ++i) {
    
      1792
          std::fill_n(dst + (i * stride), n, 0.0);
    
        }
    
      }
    
      ✗
      void AddToBuffer(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *dst,
    
                       std::size_t n, double b_coeff) {
    
      ✗
        for (std::size_t i = 0; i < n; ++i) {
    
      ✗
          const double *a_row = a + (i * a_stride);
    
      ✗
          const double *b_row = b + (i * b_stride);
    
      ✗
          double *dst_row = dst + (i * n);
    
      ✗
          for (std::size_t j = 0; j < n; ++j) {
    
      ✗
            dst_row[j] = a_row[j] + (b_coeff * b_row[j]);
    
          }
    
        }
    
      ✗
      }
    
      void MulMicroBlock(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                         std::size_t c_stride, std::size_t i_begin, std::size_t i_end, std::size_t k_begin, std::size_t k_end,
    
                         std::size_t j_begin, std::size_t j_end) {
    
        2/2✓ Branch 0 taken 18688 times.
✓ Branch 1 taken 292 times.

      18980
        for (std::size_t i = i_begin; i < i_end; ++i) {
    
      18688
          double *c_row = c + (i * c_stride);
    
      18688
          const double *a_row = a + (i * a_stride);
    
        2/2✓ Branch 0 taken 1196032 times.
✓ Branch 1 taken 18688 times.

      1214720
          for (std::size_t k = k_begin; k < k_end; ++k) {
    
      1196032
            const double aik = a_row[k];
    
      1196032
            const double *b_row = b + (k * b_stride);
    
        2/2✓ Branch 0 taken 76546048 times.
✓ Branch 1 taken 1196032 times.

      77742080
            for (std::size_t j = j_begin; j < j_end; ++j) {
    
      76546048
              c_row[j] += aik * b_row[j];
    
            }
    
          }
    
        }
    
      }
    
      84
      void MulJBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                      std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end, std::size_t kk,
    
                      std::size_t k_end) {
    
        2/2✓ Branch 0 taken 292 times.
✓ Branch 1 taken 84 times.

      376
        for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
    
      292
          const std::size_t j_end = std::min(jj + kBlockSize, n);
    
          MulMicroBlock(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
    
        }
    
      84
      }
    
      void MulKBlocks(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                      std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end) {
    
        2/2✓ Branch 0 taken 84 times.
✓ Branch 1 taken 28 times.

      112
        for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
    
      84
          const std::size_t k_end = std::min(kk + kBlockSize, n);
    
      84
          MulJBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end, kk, k_end);
    
        }
    
      }
    
      12
      void NaiveMulBlocked(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                           std::size_t c_stride, std::size_t n) {
    
      12
        ZeroMatrix(c, c_stride, n);
    
        2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 12 times.

      40
        for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
    
      28
          const std::size_t i_end = std::min(ii + kBlockSize, n);
    
          MulKBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end);
    
        }
    
      12
      }
    
      ✗
      void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
    
                            const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
    
                            const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
    
      ✗
        for (std::size_t i = 0; i < half; ++i) {
    
      ✗
          double *c11 = c + (i * c_stride);
    
          double *c12 = c11 + half;
    
      ✗
          double *c21 = c + ((i + half) * c_stride);
    
          double *c22 = c21 + half;
    
      ✗
          const double *m1_row = m1.data() + (i * half);
    
          const double *m2_row = m2.data() + (i * half);
    
          const double *m3_row = m3.data() + (i * half);
    
          const double *m4_row = m4.data() + (i * half);
    
          const double *m5_row = m5.data() + (i * half);
    
          const double *m6_row = m6.data() + (i * half);
    
          const double *m7_row = m7.data() + (i * half);
    
      ✗
          for (std::size_t j = 0; j < half; ++j) {
    
      ✗
            c11[j] = m1_row[j] + m4_row[j] - m5_row[j] + m7_row[j];
    
      ✗
            c12[j] = m3_row[j] + m5_row[j];
    
      ✗
            c21[j] = m2_row[j] + m4_row[j];
    
      ✗
            c22[j] = m1_row[j] - m2_row[j] + m3_row[j] + m6_row[j];
    
          }
    
        }
    
      ✗
      }
    
      using StrassenFn = void (*)(const double *, std::size_t, const double *, std::size_t, double *, std::size_t,
    
                                  std::size_t);
    
      void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                           std::size_t c_stride, std::size_t n);
    
      constexpr StrassenFn kStrassenSeqFn = &StrassenSeqImpl;
    
      ✗
      void ComputeProduct(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride, double a2_coeff,
    
                          const double *b1, std::size_t b1_stride, const double *b2, std::size_t b2_stride, double b2_coeff,
    
                          std::vector<double> &out, std::size_t n) {
    
      ✗
        std::vector<double> lhs(n * n);
    
      ✗
        std::vector<double> rhs(n * n);
    
      ✗
        AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
    
      ✗
        AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
    
      ✗
        out.assign(n * n, 0.0);
    
      ✗
        kStrassenSeqFn(lhs.data(), n, rhs.data(), n, out.data(), n, n);
    
      ✗
      }
    
      ✗
      void ComputeProductSingle(const double *a, std::size_t a_stride, const double *b1, std::size_t b1_stride,
    
                                const double *b2, std::size_t b2_stride, double b2_coeff, std::vector<double> &out,
    
                                std::size_t n) {
    
      ✗
        std::vector<double> rhs(n * n);
    
      ✗
        AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
    
      ✗
        out.assign(n * n, 0.0);
    
      ✗
        kStrassenSeqFn(a, a_stride, rhs.data(), n, out.data(), n, n);
    
      ✗
      }
    
      ✗
      void ComputeProductSingleLeft(const double *a1, std::size_t a1_stride, const double *a2, std::size_t a2_stride,
    
                                    double a2_coeff, const double *b, std::size_t b_stride, std::vector<double> &out,
    
                                    std::size_t n) {
    
      ✗
        std::vector<double> lhs(n * n);
    
      ✗
        AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
    
      ✗
        out.assign(n * n, 0.0);
    
      ✗
        kStrassenSeqFn(lhs.data(), n, b, b_stride, out.data(), n, n);
    
      ✗
      }
    
      ✗
      void StrassenTopTbb(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                          std::size_t c_stride, std::size_t n) {
    
      ✗
        if (n <= kCutoff || ppc::util::GetNumThreads() <= 1) {
    
      ✗
          kStrassenSeqFn(a, a_stride, b, b_stride, c, c_stride, n);
    
      ✗
          return;
    
        }
    
      ✗
        const std::size_t half = n / 2;
    
      ✗
        const double *a11 = a;
    
      ✗
        const double *a12 = a + half;
    
      ✗
        const double *a21 = a + (half * a_stride);
    
      ✗
        const double *a22 = a21 + half;
    
      ✗
        const double *b11 = b;
    
      ✗
        const double *b12 = b + half;
    
      ✗
        const double *b21 = b + (half * b_stride);
    
      ✗
        const double *b22 = b21 + half;
    
      ✗
        std::vector<double> m1;
    
      ✗
        std::vector<double> m2;
    
      ✗
        std::vector<double> m3;
    
      ✗
        std::vector<double> m4;
    
      ✗
        std::vector<double> m5;
    
      ✗
        std::vector<double> m6;
    
      ✗
        std::vector<double> m7;
    
        oneapi::tbb::parallel_invoke(
    
      ✗
            [&] { ComputeProduct(a11, a_stride, a22, a_stride, 1.0, b11, b_stride, b22, b_stride, 1.0, m1, half); }, [&] {
    
      ✗
          ComputeProductSingleLeft(a21, a_stride, a22, a_stride, 1.0, b11, b_stride, m2, half);
    
      ✗
        }, [&] { ComputeProductSingle(a11, a_stride, b12, b_stride, b22, b_stride, -1.0, m3, half); }, [&] {
    
      ✗
          ComputeProductSingle(a22, a_stride, b21, b_stride, b11, b_stride, -1.0, m4, half);
    
      ✗
        }, [&] { ComputeProductSingleLeft(a11, a_stride, a12, a_stride, 1.0, b22, b_stride, m5, half); }, [&] {
    
      ✗
          ComputeProduct(a21, a_stride, a11, a_stride, -1.0, b11, b_stride, b12, b_stride, 1.0, m6, half);
    
      ✗
        }, [&] { ComputeProduct(a12, a_stride, a22, a_stride, -1.0, b21, b_stride, b22, b_stride, 1.0, m7, half); });
    
      ✗
        CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
    
      }
    
      ✗
      void StrassenSeqImpl(const double *a, std::size_t a_stride, const double *b, std::size_t b_stride, double *c,
    
                           std::size_t c_stride, std::size_t n) {
    
      ✗
        if (n <= kCutoff) {
    
      ✗
          NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
    
      ✗
          return;
    
        }
    
      ✗
        const std::size_t half = n / 2;
    
        const double *a11 = a;
    
      ✗
        const double *a12 = a + half;
    
      ✗
        const double *a21 = a + (half * a_stride);
    
      ✗
        const double *a22 = a21 + half;
    
        const double *b11 = b;
    
      ✗
        const double *b12 = b + half;
    
      ✗
        const double *b21 = b + (half * b_stride);
    
      ✗
        const double *b22 = b21 + half;
    
      ✗
        std::vector<double> m1(half * half);
    
      ✗
        std::vector<double> m2(half * half);
    
      ✗
        std::vector<double> m3(half * half);
    
      ✗
        std::vector<double> m4(half * half);
    
      ✗
        std::vector<double> m5(half * half);
    
      ✗
        std::vector<double> m6(half * half);
    
      ✗
        std::vector<double> m7(half * half);
    
      ✗
        std::vector<double> lhs(half * half);
    
      ✗
        std::vector<double> rhs(half * half);
    
      ✗
        AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
    
      ✗
        AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
    
      ✗
        kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
    
      ✗
        AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
    
      ✗
        kStrassenSeqFn(lhs.data(), half, b11, b_stride, m2.data(), half, half);
    
      ✗
        AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
    
      ✗
        kStrassenSeqFn(a11, a_stride, rhs.data(), half, m3.data(), half, half);
    
      ✗
        AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
    
      ✗
        kStrassenSeqFn(a22, a_stride, rhs.data(), half, m4.data(), half, half);
    
      ✗
        AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
    
      ✗
        kStrassenSeqFn(lhs.data(), half, b22, b_stride, m5.data(), half, half);
    
      ✗
        AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
    
      ✗
        AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
    
      ✗
        kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
    
      ✗
        AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
    
      ✗
        AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
    
      ✗
        kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
    
      ✗
        CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
    
      }
    
      }  // namespace
    
        1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.

      12
      AkhmetovDStrassenDenseDoubleTBB::AkhmetovDStrassenDenseDoubleTBB(const InType &in) {
    
        SetTypeOfTask(GetStaticTypeOfTask());
    
        1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.

      12
        GetInput() = in;
    
      12
      }
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.

      12
      bool AkhmetovDStrassenDenseDoubleTBB::ValidationImpl() {
    
        const auto &input = GetInput();
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.

      12
        if (input.empty()) {
    
          return false;
    
        }
    
      12
        const size_t n = format::GetN(input);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.

      12
        if (n == 0) {
    
          return false;
    
        }
    
      12
        const size_t expected_size = 1 + (2 * n * n);
    
      12
        return input.size() == expected_size;
    
      }
    
      12
      bool AkhmetovDStrassenDenseDoubleTBB::PreProcessingImpl() {
    
        const auto &input = GetInput();
    
      12
        const size_t n = format::GetN(input);
    
      12
        GetOutput().assign(n * n, 0.0);
    
      12
        return true;
    
      }
    
      12
      bool AkhmetovDStrassenDenseDoubleTBB::RunImpl() {
    
        const auto &input = GetInput();
    
        auto &output = GetOutput();
    
      12
        const std::size_t n = format::GetN(input);
    
      12
        const Matrix a = format::GetA(input);
    
        1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.

      12
        const Matrix b = format::GetB(input);
    
        1/4✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.

      12
        if (n <= kCutoff || (n % 2U) != 0U) {
    
        1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.

      12
          output.assign(n * n, 0.0);
    
      12
          NaiveMulBlocked(a.data(), n, b.data(), n, output.data(), n, n);
    
      12
          return true;
    
        }
    
        const std::size_t padded = NextPow2(n);
    
      ✗
        Matrix a_pad(padded * padded, 0.0);
    
      ✗
        Matrix b_pad(padded * padded, 0.0);
    
      ✗
        Matrix c_pad(padded * padded, 0.0);
    
      ✗
        for (std::size_t i = 0; i < n; ++i) {
    
      ✗
          std::copy_n(a.data() + (i * n), n, a_pad.data() + (i * padded));
    
      ✗
          std::copy_n(b.data() + (i * n), n, b_pad.data() + (i * padded));
    
        }
    
        oneapi::tbb::global_control control(oneapi::tbb::global_control::max_allowed_parallelism,
    
      ✗
                                            static_cast<std::size_t>(std::max(1, ppc::util::GetNumThreads())));
    
        (void)control;
    
      ✗
        StrassenTopTbb(a_pad.data(), padded, b_pad.data(), padded, c_pad.data(), padded, padded);
    
      ✗
        output.assign(n * n, 0.0);
    
      ✗
        for (std::size_t i = 0; i < n; ++i) {
    
      ✗
          std::copy_n(c_pad.data() + (i * padded), n, output.data() + (i * n));
    
        }
    
        return true;
    
      }
    
      12
      bool AkhmetovDStrassenDenseDoubleTBB::PostProcessingImpl() {
    
        const auto &input = GetInput();
    
      12
        const size_t n = format::GetN(input);
    
      12
        return GetOutput().size() == n * n;
    
      }
    
      }  // namespace akhmetov_daniil_strassen_dense_double

Line	Branch	Exec	Source
1			#include "akhmetov_daniil_strassen_dense_double/tbb/include/ops_tbb.hpp"
2
3			#include <oneapi/tbb/global_control.h>
4			#include <oneapi/tbb/parallel_invoke.h>
5
6			#include <algorithm>
7			#include <cstddef>
8			#include <vector>
9
10			#include "akhmetov_daniil_strassen_dense_double/common/include/common.hpp"
11			#include "util/include/util.hpp"
12
13			namespace akhmetov_daniil_strassen_dense_double {
14
15			namespace {
16
17			constexpr std::size_t kCutoff = 256;
18			constexpr std::size_t kBlockSize = 64;
19
20			std::size_t NextPow2(std::size_t x) {
21			if (x <= 1) {
22			return 1;
23			}
24			std::size_t p = 1;
25		✗	while (p < x) {
26		✗	p <<= 1;
27			}
28			return p;
29			}
30
31			void ZeroMatrix(double *dst, std::size_t stride, std::size_t n) {
32	2/2 ✓ Branch 0 taken 1792 times. ✓ Branch 1 taken 12 times.	1804	for (std::size_t i = 0; i < n; ++i) {
33		1792	std::fill_n(dst + (i * stride), n, 0.0);
34			}
35			}
36
37		✗	void AddToBuffer(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *dst,
38			std::size_t n, double b_coeff) {
39		✗	for (std::size_t i = 0; i < n; ++i) {
40		✗	const double a_row = a + (i a_stride);
41		✗	const double b_row = b + (i b_stride);
42		✗	double dst_row = dst + (i n);
43		✗	for (std::size_t j = 0; j < n; ++j) {
44		✗	dst_row[j] = a_row[j] + (b_coeff * b_row[j]);
45			}
46			}
47		✗	}
48
49			void MulMicroBlock(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
50			std::size_t c_stride, std::size_t i_begin, std::size_t i_end, std::size_t k_begin, std::size_t k_end,
51			std::size_t j_begin, std::size_t j_end) {
52	2/2 ✓ Branch 0 taken 18688 times. ✓ Branch 1 taken 292 times.	18980	for (std::size_t i = i_begin; i < i_end; ++i) {
53		18688	double c_row = c + (i c_stride);
54		18688	const double a_row = a + (i a_stride);
55	2/2 ✓ Branch 0 taken 1196032 times. ✓ Branch 1 taken 18688 times.	1214720	for (std::size_t k = k_begin; k < k_end; ++k) {
56		1196032	const double aik = a_row[k];
57		1196032	const double b_row = b + (k b_stride);
58	2/2 ✓ Branch 0 taken 76546048 times. ✓ Branch 1 taken 1196032 times.	77742080	for (std::size_t j = j_begin; j < j_end; ++j) {
59		76546048	c_row[j] += aik * b_row[j];
60			}
61			}
62			}
63			}
64
65		84	void MulJBlocks(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
66			std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end, std::size_t kk,
67			std::size_t k_end) {
68	2/2 ✓ Branch 0 taken 292 times. ✓ Branch 1 taken 84 times.	376	for (std::size_t jj = 0; jj < n; jj += kBlockSize) {
69		292	const std::size_t j_end = std::min(jj + kBlockSize, n);
70			MulMicroBlock(a, a_stride, b, b_stride, c, c_stride, ii, i_end, kk, k_end, jj, j_end);
71			}
72		84	}
73
74			void MulKBlocks(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
75			std::size_t c_stride, std::size_t n, std::size_t ii, std::size_t i_end) {
76	2/2 ✓ Branch 0 taken 84 times. ✓ Branch 1 taken 28 times.	112	for (std::size_t kk = 0; kk < n; kk += kBlockSize) {
77		84	const std::size_t k_end = std::min(kk + kBlockSize, n);
78		84	MulJBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end, kk, k_end);
79			}
80			}
81
82		12	void NaiveMulBlocked(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
83			std::size_t c_stride, std::size_t n) {
84		12	ZeroMatrix(c, c_stride, n);
85	2/2 ✓ Branch 0 taken 28 times. ✓ Branch 1 taken 12 times.	40	for (std::size_t ii = 0; ii < n; ii += kBlockSize) {
86		28	const std::size_t i_end = std::min(ii + kBlockSize, n);
87			MulKBlocks(a, a_stride, b, b_stride, c, c_stride, n, ii, i_end);
88			}
89		12	}
90
91		✗	void CombineQuadrants(const std::vector<double> &m1, const std::vector<double> &m2, const std::vector<double> &m3,
92			const std::vector<double> &m4, const std::vector<double> &m5, const std::vector<double> &m6,
93			const std::vector<double> &m7, double *c, std::size_t c_stride, std::size_t half) {
94		✗	for (std::size_t i = 0; i < half; ++i) {
95		✗	double c11 = c + (i c_stride);
96			double *c12 = c11 + half;
97		✗	double c21 = c + ((i + half) c_stride);
98			double *c22 = c21 + half;
99
100		✗	const double m1_row = m1.data() + (i half);
101			const double m2_row = m2.data() + (i half);
102			const double m3_row = m3.data() + (i half);
103			const double m4_row = m4.data() + (i half);
104			const double m5_row = m5.data() + (i half);
105			const double m6_row = m6.data() + (i half);
106			const double m7_row = m7.data() + (i half);
107
108		✗	for (std::size_t j = 0; j < half; ++j) {
109		✗	c11[j] = m1_row[j] + m4_row[j] - m5_row[j] + m7_row[j];
110		✗	c12[j] = m3_row[j] + m5_row[j];
111		✗	c21[j] = m2_row[j] + m4_row[j];
112		✗	c22[j] = m1_row[j] - m2_row[j] + m3_row[j] + m6_row[j];
113			}
114			}
115		✗	}
116
117			using StrassenFn = void ()(const double , std::size_t, const double , std::size_t, double , std::size_t,
118			std::size_t);
119			void StrassenSeqImpl(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
120			std::size_t c_stride, std::size_t n);
121			constexpr StrassenFn kStrassenSeqFn = &StrassenSeqImpl;
122
123		✗	void ComputeProduct(const double a1, std::size_t a1_stride, const double a2, std::size_t a2_stride, double a2_coeff,
124			const double b1, std::size_t b1_stride, const double b2, std::size_t b2_stride, double b2_coeff,
125			std::vector<double> &out, std::size_t n) {
126		✗	std::vector<double> lhs(n * n);
127		✗	std::vector<double> rhs(n * n);
128		✗	AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
129		✗	AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
130		✗	out.assign(n * n, 0.0);
131		✗	kStrassenSeqFn(lhs.data(), n, rhs.data(), n, out.data(), n, n);
132		✗	}
133
134		✗	void ComputeProductSingle(const double a, std::size_t a_stride, const double b1, std::size_t b1_stride,
135			const double *b2, std::size_t b2_stride, double b2_coeff, std::vector<double> &out,
136			std::size_t n) {
137		✗	std::vector<double> rhs(n * n);
138		✗	AddToBuffer(b1, b1_stride, b2, b2_stride, rhs.data(), n, b2_coeff);
139		✗	out.assign(n * n, 0.0);
140		✗	kStrassenSeqFn(a, a_stride, rhs.data(), n, out.data(), n, n);
141		✗	}
142
143		✗	void ComputeProductSingleLeft(const double a1, std::size_t a1_stride, const double a2, std::size_t a2_stride,
144			double a2_coeff, const double *b, std::size_t b_stride, std::vector<double> &out,
145			std::size_t n) {
146		✗	std::vector<double> lhs(n * n);
147		✗	AddToBuffer(a1, a1_stride, a2, a2_stride, lhs.data(), n, a2_coeff);
148		✗	out.assign(n * n, 0.0);
149		✗	kStrassenSeqFn(lhs.data(), n, b, b_stride, out.data(), n, n);
150		✗	}
151
152		✗	void StrassenTopTbb(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
153			std::size_t c_stride, std::size_t n) {
154		✗	if (n <= kCutoff \|\| ppc::util::GetNumThreads() <= 1) {
155		✗	kStrassenSeqFn(a, a_stride, b, b_stride, c, c_stride, n);
156		✗	return;
157			}
158
159		✗	const std::size_t half = n / 2;
160
161		✗	const double *a11 = a;
162		✗	const double *a12 = a + half;
163		✗	const double a21 = a + (half a_stride);
164		✗	const double *a22 = a21 + half;
165
166		✗	const double *b11 = b;
167		✗	const double *b12 = b + half;
168		✗	const double b21 = b + (half b_stride);
169		✗	const double *b22 = b21 + half;
170
171		✗	std::vector<double> m1;
172		✗	std::vector<double> m2;
173		✗	std::vector<double> m3;
174		✗	std::vector<double> m4;
175		✗	std::vector<double> m5;
176		✗	std::vector<double> m6;
177		✗	std::vector<double> m7;
178
179			oneapi::tbb::parallel_invoke(
180		✗	[&] { ComputeProduct(a11, a_stride, a22, a_stride, 1.0, b11, b_stride, b22, b_stride, 1.0, m1, half); }, [&] {
181		✗	ComputeProductSingleLeft(a21, a_stride, a22, a_stride, 1.0, b11, b_stride, m2, half);
182		✗	}, [&] { ComputeProductSingle(a11, a_stride, b12, b_stride, b22, b_stride, -1.0, m3, half); }, [&] {
183		✗	ComputeProductSingle(a22, a_stride, b21, b_stride, b11, b_stride, -1.0, m4, half);
184		✗	}, [&] { ComputeProductSingleLeft(a11, a_stride, a12, a_stride, 1.0, b22, b_stride, m5, half); }, [&] {
185		✗	ComputeProduct(a21, a_stride, a11, a_stride, -1.0, b11, b_stride, b12, b_stride, 1.0, m6, half);
186		✗	}, [&] { ComputeProduct(a12, a_stride, a22, a_stride, -1.0, b21, b_stride, b22, b_stride, 1.0, m7, half); });
187
188		✗	CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
189			}
190
191		✗	void StrassenSeqImpl(const double a, std::size_t a_stride, const double b, std::size_t b_stride, double *c,
192			std::size_t c_stride, std::size_t n) {
193		✗	if (n <= kCutoff) {
194		✗	NaiveMulBlocked(a, a_stride, b, b_stride, c, c_stride, n);
195		✗	return;
196			}
197
198		✗	const std::size_t half = n / 2;
199
200			const double *a11 = a;
201		✗	const double *a12 = a + half;
202		✗	const double a21 = a + (half a_stride);
203		✗	const double *a22 = a21 + half;
204
205			const double *b11 = b;
206		✗	const double *b12 = b + half;
207		✗	const double b21 = b + (half b_stride);
208		✗	const double *b22 = b21 + half;
209
210		✗	std::vector<double> m1(half * half);
211		✗	std::vector<double> m2(half * half);
212		✗	std::vector<double> m3(half * half);
213		✗	std::vector<double> m4(half * half);
214		✗	std::vector<double> m5(half * half);
215		✗	std::vector<double> m6(half * half);
216		✗	std::vector<double> m7(half * half);
217		✗	std::vector<double> lhs(half * half);
218		✗	std::vector<double> rhs(half * half);
219
220		✗	AddToBuffer(a11, a_stride, a22, a_stride, lhs.data(), half, 1.0);
221		✗	AddToBuffer(b11, b_stride, b22, b_stride, rhs.data(), half, 1.0);
222		✗	kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m1.data(), half, half);
223
224		✗	AddToBuffer(a21, a_stride, a22, a_stride, lhs.data(), half, 1.0);
225		✗	kStrassenSeqFn(lhs.data(), half, b11, b_stride, m2.data(), half, half);
226
227		✗	AddToBuffer(b12, b_stride, b22, b_stride, rhs.data(), half, -1.0);
228		✗	kStrassenSeqFn(a11, a_stride, rhs.data(), half, m3.data(), half, half);
229
230		✗	AddToBuffer(b21, b_stride, b11, b_stride, rhs.data(), half, -1.0);
231		✗	kStrassenSeqFn(a22, a_stride, rhs.data(), half, m4.data(), half, half);
232
233		✗	AddToBuffer(a11, a_stride, a12, a_stride, lhs.data(), half, 1.0);
234		✗	kStrassenSeqFn(lhs.data(), half, b22, b_stride, m5.data(), half, half);
235
236		✗	AddToBuffer(a21, a_stride, a11, a_stride, lhs.data(), half, -1.0);
237		✗	AddToBuffer(b11, b_stride, b12, b_stride, rhs.data(), half, 1.0);
238		✗	kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m6.data(), half, half);
239
240		✗	AddToBuffer(a12, a_stride, a22, a_stride, lhs.data(), half, -1.0);
241		✗	AddToBuffer(b21, b_stride, b22, b_stride, rhs.data(), half, 1.0);
242		✗	kStrassenSeqFn(lhs.data(), half, rhs.data(), half, m7.data(), half, half);
243
244		✗	CombineQuadrants(m1, m2, m3, m4, m5, m6, m7, c, c_stride, half);
245			}
246
247			} // namespace
248
249	1/2 ✓ Branch 1 taken 12 times. ✗ Branch 2 not taken.	12	AkhmetovDStrassenDenseDoubleTBB::AkhmetovDStrassenDenseDoubleTBB(const InType &in) {
250			SetTypeOfTask(GetStaticTypeOfTask());
251	1/2 ✓ Branch 1 taken 12 times. ✗ Branch 2 not taken.	12	GetInput() = in;
252		12	}
253
254	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times.	12	bool AkhmetovDStrassenDenseDoubleTBB::ValidationImpl() {
255			const auto &input = GetInput();
256	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times.	12	if (input.empty()) {
257			return false;
258			}
259		12	const size_t n = format::GetN(input);
260	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times.	12	if (n == 0) {
261			return false;
262			}
263		12	const size_t expected_size = 1 + (2 * n * n);
264		12	return input.size() == expected_size;
265			}
266
267		12	bool AkhmetovDStrassenDenseDoubleTBB::PreProcessingImpl() {
268			const auto &input = GetInput();
269		12	const size_t n = format::GetN(input);
270		12	GetOutput().assign(n * n, 0.0);
271		12	return true;
272			}
273
274		12	bool AkhmetovDStrassenDenseDoubleTBB::RunImpl() {
275			const auto &input = GetInput();
276			auto &output = GetOutput();
277
278		12	const std::size_t n = format::GetN(input);
279		12	const Matrix a = format::GetA(input);
280	1/2 ✓ Branch 1 taken 12 times. ✗ Branch 2 not taken.	12	const Matrix b = format::GetB(input);
281
282	1/4 ✗ Branch 0 not taken. ✓ Branch 1 taken 12 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken.	12	if (n <= kCutoff \|\| (n % 2U) != 0U) {
283	1/2 ✓ Branch 1 taken 12 times. ✗ Branch 2 not taken.	12	output.assign(n * n, 0.0);
284		12	NaiveMulBlocked(a.data(), n, b.data(), n, output.data(), n, n);
285		12	return true;
286			}
287
288			const std::size_t padded = NextPow2(n);
289		✗	Matrix a_pad(padded * padded, 0.0);
290		✗	Matrix b_pad(padded * padded, 0.0);
291		✗	Matrix c_pad(padded * padded, 0.0);
292
293		✗	for (std::size_t i = 0; i < n; ++i) {
294		✗	std::copy_n(a.data() + (i * n), n, a_pad.data() + (i * padded));
295		✗	std::copy_n(b.data() + (i * n), n, b_pad.data() + (i * padded));
296			}
297
298			oneapi::tbb::global_control control(oneapi::tbb::global_control::max_allowed_parallelism,
299		✗	static_cast<std::size_t>(std::max(1, ppc::util::GetNumThreads())));
300			(void)control;
301		✗	StrassenTopTbb(a_pad.data(), padded, b_pad.data(), padded, c_pad.data(), padded, padded);
302
303		✗	output.assign(n * n, 0.0);
304		✗	for (std::size_t i = 0; i < n; ++i) {
305		✗	std::copy_n(c_pad.data() + (i * padded), n, output.data() + (i * n));
306			}
307
308			return true;
309			}
310
311		12	bool AkhmetovDStrassenDenseDoubleTBB::PostProcessingImpl() {
312			const auto &input = GetInput();
313		12	const size_t n = format::GetN(input);
314		12	return GetOutput().size() == n * n;
315			}
316
317			} // namespace akhmetov_daniil_strassen_dense_double
318