GCC Code Coverage Report

Directory:	./
File:	tasks/akhmetov_daniil_strassen_dense_double/all/src/ops_all.cpp
Date:	2026-06-04 20:25:32

	Exec	Total	Coverage
Lines:	157	159	98.7%
Functions:	13	13	100.0%
Branches:	54	98	55.1%

  
      Line
      Branch
      Exec
      Source
    
      #include "akhmetov_daniil_strassen_dense_double/all/include/ops_all.hpp"
    
      #include <mpi.h>
    
      #include <atomic>
    
      #include <cstddef>
    
      #include <thread>
    
      #include <utility>
    
      #include <vector>
    
      #include "akhmetov_daniil_strassen_dense_double/common/include/common.hpp"
    
      #include "oneapi/tbb/parallel_for.h"
    
      #include "util/include/util.hpp"
    
      namespace akhmetov_daniil_strassen_dense_double {
    
        1/2✓ Branch 1 taken 6 times.
✗ Branch 2 not taken.

      6
      AkhmetovDStrassenDenseDoubleALL::AkhmetovDStrassenDenseDoubleALL(const InType &in) {
    
        SetTypeOfTask(GetStaticTypeOfTask());
    
        1/2✓ Branch 1 taken 6 times.
✗ Branch 2 not taken.

      6
        GetInput() = in;
    
      6
      }
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.

      6
      bool AkhmetovDStrassenDenseDoubleALL::ValidationImpl() {
    
        const auto &input = GetInput();
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.

      6
        if (input.empty()) {
    
          return false;
    
        }
    
      6
        const size_t n = format::GetN(input);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.

      6
        if (n == 0) {
    
          return false;
    
        }
    
      6
        const size_t expected_size = 1 + (2 * n * n);
    
      6
        return input.size() == expected_size;
    
      }
    
      6
      bool AkhmetovDStrassenDenseDoubleALL::PreProcessingImpl() {
    
        const auto &input = GetInput();
    
      6
        const size_t n = format::GetN(input);
    
      6
        GetOutput().assign(n * n, 0.0);
    
      6
        return true;
    
      }
    
      namespace {
    
      constexpr size_t kThreshold = 64;
    
      [[maybe_unused]] constexpr size_t kParallelThreshold = 256;
    
      inline size_t NextPow2(size_t n) {
    
        size_t p = 1;
    
        2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 4 times.

      34
        while (p < n) {
    
      30
          p <<= 1;
    
        }
    
        return p;
    
      }
    
      114
      Matrix StandardMultiply(const Matrix &a, const Matrix &b, size_t size) {
    
      114
        Matrix c(size * size, 0.0);
    
      114
      #pragma omp parallel for default(none) shared(a, b, c, size) schedule(static) if (size >= kParallelThreshold)
    
        for (size_t i = 0; i < size; ++i) {
    
          for (size_t k = 0; k < size; ++k) {
    
            const double aik = a.at((i * size) + k);
    
            const size_t bk = k * size;
    
            const size_t ci = i * size;
    
            for (size_t j = 0; j < size; ++j) {
    
              c.at(ci + j) += aik * b.at(bk + j);
    
            }
    
          }
    
        }
    
      114
        return c;
    
      }
    
      36
      void Split(const Matrix &src, Matrix &a11, Matrix &a12, Matrix &a21, Matrix &a22, size_t size) {
    
      36
        const size_t half = size / 2;
    
      36
        const size_t block = half * half;
    
      36
        a11.assign(block, 0.0);
    
      36
        a12.assign(block, 0.0);
    
      36
        a21.assign(block, 0.0);
    
      36
        a22.assign(block, 0.0);
    
      36
      #pragma omp parallel for default(none) shared(src, a11, a12, a21, a22, size, half) \
    
          schedule(static) if (size >= kParallelThreshold)
    
        for (size_t i = 0; i < half; ++i) {
    
          const size_t is = i * size;
    
          const size_t ih = i * half;
    
          const size_t is2 = (i + half) * size;
    
          for (size_t j = 0; j < half; ++j) {
    
            a11.at(ih + j) = src.at(is + j);
    
            a12.at(ih + j) = src.at(is + j + half);
    
            a21.at(ih + j) = src.at(is2 + j);
    
            a22.at(ih + j) = src.at(is2 + j + half);
    
          }
    
        }
    
      36
      }
    
      void Merge(Matrix &dst, const Matrix &c11, const Matrix &c12, const Matrix &c21, const Matrix &c22, size_t size) {
    
      18
        const size_t half = size / 2;
    
      18
      #pragma omp parallel for default(none) shared(dst, c11, c12, c21, c22, size, half) \
    
          schedule(static) if (size >= kParallelThreshold)
    
        for (size_t i = 0; i < half; ++i) {
    
          const size_t is = i * size;
    
          const size_t ih = i * half;
    
          const size_t is2 = (i + half) * size;
    
          for (size_t j = 0; j < half; ++j) {
    
            dst.at(is + j) = c11.at(ih + j);
    
            dst.at(is + j + half) = c12.at(ih + j);
    
            dst.at(is2 + j) = c21.at(ih + j);
    
            dst.at(is2 + j + half) = c22.at(ih + j);
    
          }
    
        }
    
      }
    
      108
      inline void AddInto(const Matrix &a, const Matrix &b, Matrix &c) {
    
        const size_t n = a.size();
    
      108
      #pragma omp parallel for default(none) shared(a, b, c, n) \
    
          schedule(static) if (n >= (kParallelThreshold * kParallelThreshold))
    
        for (size_t i = 0; i < n; ++i) {
    
          c.at(i) = a.at(i) + b.at(i);
    
        }
    
      108
      }
    
      72
      inline void SubInto(const Matrix &a, const Matrix &b, Matrix &c) {
    
        const size_t n = a.size();
    
      72
      #pragma omp parallel for default(none) shared(a, b, c, n) \
    
          schedule(static) if (n >= (kParallelThreshold * kParallelThreshold))
    
        for (size_t i = 0; i < n; ++i) {
    
          c.at(i) = a.at(i) - b.at(i);
    
        }
    
      72
      }
    
      struct Frame {
    
        Matrix a;
    
        Matrix b;
    
        Matrix result;
    
        size_t size;
    
        size_t stage{0};
    
        Matrix a11;
    
        Matrix a12;
    
        Matrix a21;
    
        Matrix a22;
    
        Matrix b11;
    
        Matrix b12;
    
        Matrix b21;
    
        Matrix b22;
    
        Matrix m1;
    
        Matrix m2;
    
        Matrix m3;
    
        Matrix m4;
    
        Matrix m5;
    
        Matrix m6;
    
        Matrix m7;
    
        Matrix temp_a;
    
        Matrix temp_b;
    
      130
        Frame(Matrix aa, Matrix bb, size_t s) : a(std::move(aa)), b(std::move(bb)), size(s) {}
    
      };
    
        1/2✓ Branch 0 taken 274 times.
✗ Branch 1 not taken.

      274
      void ProcessTopFrame(std::vector<Frame> &stack, Matrix &final_result) {
    
        1/2✓ Branch 0 taken 274 times.
✗ Branch 1 not taken.

      274
        if (stack.empty()) {
    
          return;
    
        }
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 274 times.

      274
        const size_t current_index = stack.size() - 1;
    
        Frame &frame = stack.at(current_index);
    
        2/2✓ Branch 0 taken 112 times.
✓ Branch 1 taken 162 times.

      274
        if (frame.size <= kThreshold) {
    
      112
          Matrix base = StandardMultiply(frame.a, frame.b, frame.size);
    
          stack.pop_back();
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 112 times.

      112
          if (stack.empty()) {
    
            final_result = std::move(base);
    
          } else {
    
      112
            stack.back().temp_a = std::move(base);
    
          }
    
          return;
    
        }
    
      162
        const size_t half = frame.size / 2;
    
      162
        const size_t block_size = half * half;
    
        9/10✓ Branch 0 taken 18 times.
✓ Branch 1 taken 18 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 18 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 18 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 18 times.
✓ Branch 8 taken 18 times.
✗ Branch 9 not taken.

      162
        switch (frame.stage) {
    
      18
          case 0: {
    
      18
            Split(frame.a, frame.a11, frame.a12, frame.a21, frame.a22, frame.size);
    
      18
            Split(frame.b, frame.b11, frame.b12, frame.b21, frame.b22, frame.size);
    
      18
            frame.temp_a.assign(block_size, 0.0);
    
      18
            frame.temp_b.assign(block_size, 0.0);
    
      18
            frame.m1.assign(block_size, 0.0);
    
      18
            frame.m2.assign(block_size, 0.0);
    
      18
            frame.m3.assign(block_size, 0.0);
    
      18
            frame.m4.assign(block_size, 0.0);
    
      18
            frame.m5.assign(block_size, 0.0);
    
      18
            frame.m6.assign(block_size, 0.0);
    
      18
            frame.m7.assign(block_size, 0.0);
    
      18
            frame.stage = 1;
    
      18
            return;
    
          }
    
      18
          case 1: {
    
      18
            AddInto(frame.a11, frame.a22, frame.temp_a);
    
      18
            AddInto(frame.b11, frame.b22, frame.temp_b);
    
      18
            stack.emplace_back(frame.temp_a, frame.temp_b, half);
    
      18
            stack.at(current_index).stage = 2;
    
      18
            return;
    
          }
    
      18
          case 2: {
    
      18
            frame.m1 = frame.temp_a;
    
      18
            AddInto(frame.a21, frame.a22, frame.temp_a);
    
      18
            stack.emplace_back(frame.temp_a, frame.b11, half);
    
      18
            stack.at(current_index).stage = 3;
    
      18
            return;
    
          }
    
      18
          case 3: {
    
      18
            frame.m2 = frame.temp_a;
    
      18
            SubInto(frame.b12, frame.b22, frame.temp_b);
    
      18
            stack.emplace_back(frame.a11, frame.temp_b, half);
    
      18
            stack.at(current_index).stage = 4;
    
      18
            return;
    
          }
    
      18
          case 4: {
    
      18
            frame.m3 = frame.temp_a;
    
      18
            SubInto(frame.b21, frame.b11, frame.temp_b);
    
      18
            stack.emplace_back(frame.a22, frame.temp_b, half);
    
      18
            stack.at(current_index).stage = 5;
    
      18
            return;
    
          }
    
      18
          case 5: {
    
      18
            frame.m4 = frame.temp_a;
    
      18
            AddInto(frame.a11, frame.a12, frame.temp_a);
    
      18
            stack.emplace_back(frame.temp_a, frame.b22, half);
    
      18
            stack.at(current_index).stage = 6;
    
      18
            return;
    
          }
    
      18
          case 6: {
    
      18
            frame.m5 = frame.temp_a;
    
      18
            SubInto(frame.a21, frame.a11, frame.temp_a);
    
      18
            AddInto(frame.b11, frame.b12, frame.temp_b);
    
      18
            stack.emplace_back(frame.temp_a, frame.temp_b, half);
    
      18
            stack.at(current_index).stage = 7;
    
      18
            return;
    
          }
    
      18
          case 7: {
    
      18
            frame.m6 = frame.temp_a;
    
      18
            SubInto(frame.a12, frame.a22, frame.temp_a);
    
      18
            AddInto(frame.b21, frame.b22, frame.temp_b);
    
      18
            stack.emplace_back(frame.temp_a, frame.temp_b, half);
    
      18
            stack.at(current_index).stage = 8;
    
      18
            return;
    
          }
    
      18
          case 8: {
    
      18
            frame.m7 = frame.temp_a;
    
      18
            Matrix c11(block_size);
    
        1/4✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      18
            Matrix c12(block_size);
    
        1/4✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      18
            Matrix c21(block_size);
    
        1/4✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      18
            Matrix c22(block_size);
    
      18
      #pragma omp parallel for default(none) shared(c11, c12, c21, c22, frame, block_size) \
    
          schedule(static) if (block_size >= (kParallelThreshold * kParallelThreshold))
    
            for (size_t i = 0; i < block_size; ++i) {
    
              c11.at(i) = frame.m1.at(i) + frame.m4.at(i) - frame.m5.at(i) + frame.m7.at(i);
    
              c12.at(i) = frame.m3.at(i) + frame.m5.at(i);
    
              c21.at(i) = frame.m2.at(i) + frame.m4.at(i);
    
              c22.at(i) = frame.m1.at(i) - frame.m2.at(i) + frame.m3.at(i) + frame.m6.at(i);
    
            }
    
        1/4✓ Branch 1 taken 18 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      18
            Matrix merged(frame.size * frame.size, 0.0);
    
      18
            Merge(merged, c11, c12, c21, c22, frame.size);
    
            stack.pop_back();
    
        2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 14 times.

      18
            if (stack.empty()) {
    
              final_result = std::move(merged);
    
            } else {
    
      14
              stack.back().temp_a = std::move(merged);
    
            }
    
            return;
    
          }
    
          default:
    
            return;
    
        }
    
      }
    
      4
      Matrix StrassenMultiply(const Matrix &a_init, const Matrix &b_init, size_t size_init) {
    
      4
        std::vector<Frame> stack;
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        stack.emplace_back(a_init, b_init, size_init);
    
      4
        Matrix result;
    
        2/2✓ Branch 0 taken 274 times.
✓ Branch 1 taken 4 times.

      278
        while (!stack.empty()) {
    
        1/2✓ Branch 1 taken 274 times.
✗ Branch 2 not taken.

      274
          ProcessTopFrame(stack, result);
    
        }
    
      4
        return result;
    
      4
      }
    
      8
      Matrix PadTo(const Matrix &src, size_t n, size_t new_n) {
    
        1/2✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.

      8
        if (new_n == n) {
    
      8
          return src;
    
        }
    
      ✗
        Matrix padded(new_n * new_n, 0.0);
    
      ✗
      #pragma omp parallel for default(none) shared(padded, src, n, new_n) schedule(static) if (new_n >= kParallelThreshold)
    
        for (size_t i = 0; i < n; ++i) {
    
          const size_t is = i * n;
    
          const size_t id = i * new_n;
    
          for (size_t j = 0; j < n; ++j) {
    
            padded.at(id + j) = src.at(is + j);
    
          }
    
        }
    
        return padded;
    
      }
    
      }  // namespace
    
      6
      bool AkhmetovDStrassenDenseDoubleALL::RunImpl() {
    
        const auto &input = GetInput();
    
        auto &output = GetOutput();
    
      6
        const size_t n = format::GetN(input);
    
      6
        const Matrix a = format::GetA(input);
    
        1/2✓ Branch 1 taken 6 times.
✗ Branch 2 not taken.

      6
        const Matrix b = format::GetB(input);
    
        2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 4 times.

      6
        if (n <= kThreshold) {
    
        1/4✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      2
          output = StandardMultiply(a, b, n);
    
      2
          return true;
    
        }
    
        const size_t new_n = NextPow2(n);
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        const Matrix a_padded = PadTo(a, n, new_n);
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        const Matrix b_padded = PadTo(b, n, new_n);
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        const Matrix result_padded = StrassenMultiply(a_padded, b_padded, new_n);
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        output.assign(n * n, 0.0);
    
      4
      #pragma omp parallel for default(none) shared(output, result_padded, n, new_n) \
    
          schedule(static) if (n >= kParallelThreshold)
    
        for (size_t i = 0; i < n; ++i) {
    
          const size_t is = i * new_n;
    
          const size_t id = i * n;
    
          for (size_t j = 0; j < n; ++j) {
    
            output.at(id + j) = result_padded.at(is + j);
    
          }
    
        }
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        const int num_threads = ppc::util::GetNumThreads();
    
      4
        int rank = 0;
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    
        2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 2 times.

      4
        if (rank == 0) {
    
      2
          std::atomic<int> counter(0);
    
        1/2✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.

      2
      #pragma omp parallel default(none) shared(counter) num_threads(ppc::util::GetNumThreads())
    
          counter++;
    
        }
    
        {
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
          std::vector<std::thread> threads(num_threads);
    
      4
          std::atomic<int> counter(0);
    
        2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 4 times.

      12
          for (int i = 0; i < num_threads; i++) {
    
        2/4✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 8 times.

      8
            threads.at(i) = std::thread([&]() { counter++; });
    
        1/2✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.

      8
            threads.at(i).join();
    
          }
    
      4
        }
    
        {
    
      4
          std::atomic<int> counter(0);
    
        1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.

      12
          tbb::parallel_for(0, num_threads, [&](int /*i*/) { counter++; });
    
        }
    
        1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.

      4
        MPI_Barrier(MPI_COMM_WORLD);
    
        return true;
    
      }
    
      6
      bool AkhmetovDStrassenDenseDoubleALL::PostProcessingImpl() {
    
        const auto &input = GetInput();
    
      6
        const size_t n = format::GetN(input);
    
      6
        return GetOutput().size() == n * n;
    
      }
    
      }  // namespace akhmetov_daniil_strassen_dense_double

Line	Branch	Exec	Source
1			#include "akhmetov_daniil_strassen_dense_double/all/include/ops_all.hpp"
2
3			#include <mpi.h>
4
5			#include <atomic>
6			#include <cstddef>
7			#include <thread>
8			#include <utility>
9			#include <vector>
10
11			#include "akhmetov_daniil_strassen_dense_double/common/include/common.hpp"
12			#include "oneapi/tbb/parallel_for.h"
13			#include "util/include/util.hpp"
14
15			namespace akhmetov_daniil_strassen_dense_double {
16
17	1/2 ✓ Branch 1 taken 6 times. ✗ Branch 2 not taken.	6	AkhmetovDStrassenDenseDoubleALL::AkhmetovDStrassenDenseDoubleALL(const InType &in) {
18			SetTypeOfTask(GetStaticTypeOfTask());
19	1/2 ✓ Branch 1 taken 6 times. ✗ Branch 2 not taken.	6	GetInput() = in;
20		6	}
21
22	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 6 times.	6	bool AkhmetovDStrassenDenseDoubleALL::ValidationImpl() {
23			const auto &input = GetInput();
24	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 6 times.	6	if (input.empty()) {
25			return false;
26			}
27		6	const size_t n = format::GetN(input);
28	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 6 times.	6	if (n == 0) {
29			return false;
30			}
31		6	const size_t expected_size = 1 + (2 * n * n);
32		6	return input.size() == expected_size;
33			}
34
35		6	bool AkhmetovDStrassenDenseDoubleALL::PreProcessingImpl() {
36			const auto &input = GetInput();
37		6	const size_t n = format::GetN(input);
38		6	GetOutput().assign(n * n, 0.0);
39		6	return true;
40			}
41
42			namespace {
43
44			constexpr size_t kThreshold = 64;
45			[[maybe_unused]] constexpr size_t kParallelThreshold = 256;
46
47			inline size_t NextPow2(size_t n) {
48			size_t p = 1;
49	2/2 ✓ Branch 0 taken 30 times. ✓ Branch 1 taken 4 times.	34	while (p < n) {
50		30	p <<= 1;
51			}
52			return p;
53			}
54
55		114	Matrix StandardMultiply(const Matrix &a, const Matrix &b, size_t size) {
56		114	Matrix c(size * size, 0.0);
57		114	#pragma omp parallel for default(none) shared(a, b, c, size) schedule(static) if (size >= kParallelThreshold)
58			for (size_t i = 0; i < size; ++i) {
59			for (size_t k = 0; k < size; ++k) {
60			const double aik = a.at((i * size) + k);
61			const size_t bk = k * size;
62			const size_t ci = i * size;
63			for (size_t j = 0; j < size; ++j) {
64			c.at(ci + j) += aik * b.at(bk + j);
65			}
66			}
67			}
68		114	return c;
69			}
70
71		36	void Split(const Matrix &src, Matrix &a11, Matrix &a12, Matrix &a21, Matrix &a22, size_t size) {
72		36	const size_t half = size / 2;
73		36	const size_t block = half * half;
74		36	a11.assign(block, 0.0);
75		36	a12.assign(block, 0.0);
76		36	a21.assign(block, 0.0);
77		36	a22.assign(block, 0.0);
78
79		36	#pragma omp parallel for default(none) shared(src, a11, a12, a21, a22, size, half) \
80			schedule(static) if (size >= kParallelThreshold)
81			for (size_t i = 0; i < half; ++i) {
82			const size_t is = i * size;
83			const size_t ih = i * half;
84			const size_t is2 = (i + half) * size;
85			for (size_t j = 0; j < half; ++j) {
86			a11.at(ih + j) = src.at(is + j);
87			a12.at(ih + j) = src.at(is + j + half);
88			a21.at(ih + j) = src.at(is2 + j);
89			a22.at(ih + j) = src.at(is2 + j + half);
90			}
91			}
92		36	}
93
94			void Merge(Matrix &dst, const Matrix &c11, const Matrix &c12, const Matrix &c21, const Matrix &c22, size_t size) {
95		18	const size_t half = size / 2;
96		18	#pragma omp parallel for default(none) shared(dst, c11, c12, c21, c22, size, half) \
97			schedule(static) if (size >= kParallelThreshold)
98			for (size_t i = 0; i < half; ++i) {
99			const size_t is = i * size;
100			const size_t ih = i * half;
101			const size_t is2 = (i + half) * size;
102			for (size_t j = 0; j < half; ++j) {
103			dst.at(is + j) = c11.at(ih + j);
104			dst.at(is + j + half) = c12.at(ih + j);
105			dst.at(is2 + j) = c21.at(ih + j);
106			dst.at(is2 + j + half) = c22.at(ih + j);
107			}
108			}
109			}
110
111		108	inline void AddInto(const Matrix &a, const Matrix &b, Matrix &c) {
112			const size_t n = a.size();
113		108	#pragma omp parallel for default(none) shared(a, b, c, n) \
114			schedule(static) if (n >= (kParallelThreshold * kParallelThreshold))
115			for (size_t i = 0; i < n; ++i) {
116			c.at(i) = a.at(i) + b.at(i);
117			}
118		108	}
119
120		72	inline void SubInto(const Matrix &a, const Matrix &b, Matrix &c) {
121			const size_t n = a.size();
122		72	#pragma omp parallel for default(none) shared(a, b, c, n) \
123			schedule(static) if (n >= (kParallelThreshold * kParallelThreshold))
124			for (size_t i = 0; i < n; ++i) {
125			c.at(i) = a.at(i) - b.at(i);
126			}
127		72	}
128
129			struct Frame {
130			Matrix a;
131			Matrix b;
132			Matrix result;
133			size_t size;
134			size_t stage{0};
135
136			Matrix a11;
137			Matrix a12;
138			Matrix a21;
139			Matrix a22;
140			Matrix b11;
141			Matrix b12;
142			Matrix b21;
143			Matrix b22;
144
145			Matrix m1;
146			Matrix m2;
147			Matrix m3;
148			Matrix m4;
149			Matrix m5;
150			Matrix m6;
151			Matrix m7;
152
153			Matrix temp_a;
154			Matrix temp_b;
155
156		130	Frame(Matrix aa, Matrix bb, size_t s) : a(std::move(aa)), b(std::move(bb)), size(s) {}
157			};
158
159	1/2 ✓ Branch 0 taken 274 times. ✗ Branch 1 not taken.	274	void ProcessTopFrame(std::vector<Frame> &stack, Matrix &final_result) {
160	1/2 ✓ Branch 0 taken 274 times. ✗ Branch 1 not taken.	274	if (stack.empty()) {
161			return;
162			}
163
164	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 274 times.	274	const size_t current_index = stack.size() - 1;
165			Frame &frame = stack.at(current_index);
166
167	2/2 ✓ Branch 0 taken 112 times. ✓ Branch 1 taken 162 times.	274	if (frame.size <= kThreshold) {
168		112	Matrix base = StandardMultiply(frame.a, frame.b, frame.size);
169			stack.pop_back();
170	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 112 times.	112	if (stack.empty()) {
171			final_result = std::move(base);
172			} else {
173		112	stack.back().temp_a = std::move(base);
174			}
175			return;
176			}
177
178		162	const size_t half = frame.size / 2;
179		162	const size_t block_size = half * half;
180
181	9/10 ✓ Branch 0 taken 18 times. ✓ Branch 1 taken 18 times. ✓ Branch 2 taken 18 times. ✓ Branch 3 taken 18 times. ✓ Branch 4 taken 18 times. ✓ Branch 5 taken 18 times. ✓ Branch 6 taken 18 times. ✓ Branch 7 taken 18 times. ✓ Branch 8 taken 18 times. ✗ Branch 9 not taken.	162	switch (frame.stage) {
182		18	case 0: {
183		18	Split(frame.a, frame.a11, frame.a12, frame.a21, frame.a22, frame.size);
184		18	Split(frame.b, frame.b11, frame.b12, frame.b21, frame.b22, frame.size);
185
186		18	frame.temp_a.assign(block_size, 0.0);
187		18	frame.temp_b.assign(block_size, 0.0);
188		18	frame.m1.assign(block_size, 0.0);
189		18	frame.m2.assign(block_size, 0.0);
190		18	frame.m3.assign(block_size, 0.0);
191		18	frame.m4.assign(block_size, 0.0);
192		18	frame.m5.assign(block_size, 0.0);
193		18	frame.m6.assign(block_size, 0.0);
194		18	frame.m7.assign(block_size, 0.0);
195
196		18	frame.stage = 1;
197		18	return;
198			}
199
200		18	case 1: {
201		18	AddInto(frame.a11, frame.a22, frame.temp_a);
202		18	AddInto(frame.b11, frame.b22, frame.temp_b);
203
204		18	stack.emplace_back(frame.temp_a, frame.temp_b, half);
205		18	stack.at(current_index).stage = 2;
206		18	return;
207			}
208
209		18	case 2: {
210		18	frame.m1 = frame.temp_a;
211		18	AddInto(frame.a21, frame.a22, frame.temp_a);
212
213		18	stack.emplace_back(frame.temp_a, frame.b11, half);
214		18	stack.at(current_index).stage = 3;
215		18	return;
216			}
217
218		18	case 3: {
219		18	frame.m2 = frame.temp_a;
220		18	SubInto(frame.b12, frame.b22, frame.temp_b);
221
222		18	stack.emplace_back(frame.a11, frame.temp_b, half);
223		18	stack.at(current_index).stage = 4;
224		18	return;
225			}
226
227		18	case 4: {
228		18	frame.m3 = frame.temp_a;
229		18	SubInto(frame.b21, frame.b11, frame.temp_b);
230
231		18	stack.emplace_back(frame.a22, frame.temp_b, half);
232		18	stack.at(current_index).stage = 5;
233		18	return;
234			}
235
236		18	case 5: {
237		18	frame.m4 = frame.temp_a;
238		18	AddInto(frame.a11, frame.a12, frame.temp_a);
239
240		18	stack.emplace_back(frame.temp_a, frame.b22, half);
241		18	stack.at(current_index).stage = 6;
242		18	return;
243			}
244
245		18	case 6: {
246		18	frame.m5 = frame.temp_a;
247		18	SubInto(frame.a21, frame.a11, frame.temp_a);
248		18	AddInto(frame.b11, frame.b12, frame.temp_b);
249
250		18	stack.emplace_back(frame.temp_a, frame.temp_b, half);
251		18	stack.at(current_index).stage = 7;
252		18	return;
253			}
254
255		18	case 7: {
256		18	frame.m6 = frame.temp_a;
257		18	SubInto(frame.a12, frame.a22, frame.temp_a);
258		18	AddInto(frame.b21, frame.b22, frame.temp_b);
259
260		18	stack.emplace_back(frame.temp_a, frame.temp_b, half);
261		18	stack.at(current_index).stage = 8;
262		18	return;
263			}
264
265		18	case 8: {
266		18	frame.m7 = frame.temp_a;
267
268		18	Matrix c11(block_size);
269	1/4 ✓ Branch 1 taken 18 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	18	Matrix c12(block_size);
270	1/4 ✓ Branch 1 taken 18 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	18	Matrix c21(block_size);
271	1/4 ✓ Branch 1 taken 18 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	18	Matrix c22(block_size);
272
273		18	#pragma omp parallel for default(none) shared(c11, c12, c21, c22, frame, block_size) \
274			schedule(static) if (block_size >= (kParallelThreshold * kParallelThreshold))
275			for (size_t i = 0; i < block_size; ++i) {
276			c11.at(i) = frame.m1.at(i) + frame.m4.at(i) - frame.m5.at(i) + frame.m7.at(i);
277			c12.at(i) = frame.m3.at(i) + frame.m5.at(i);
278			c21.at(i) = frame.m2.at(i) + frame.m4.at(i);
279			c22.at(i) = frame.m1.at(i) - frame.m2.at(i) + frame.m3.at(i) + frame.m6.at(i);
280			}
281
282	1/4 ✓ Branch 1 taken 18 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	18	Matrix merged(frame.size * frame.size, 0.0);
283		18	Merge(merged, c11, c12, c21, c22, frame.size);
284
285			stack.pop_back();
286	2/2 ✓ Branch 0 taken 4 times. ✓ Branch 1 taken 14 times.	18	if (stack.empty()) {
287			final_result = std::move(merged);
288			} else {
289		14	stack.back().temp_a = std::move(merged);
290			}
291			return;
292			}
293
294			default:
295			return;
296			}
297			}
298
299		4	Matrix StrassenMultiply(const Matrix &a_init, const Matrix &b_init, size_t size_init) {
300		4	std::vector<Frame> stack;
301	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	stack.emplace_back(a_init, b_init, size_init);
302
303		4	Matrix result;
304	2/2 ✓ Branch 0 taken 274 times. ✓ Branch 1 taken 4 times.	278	while (!stack.empty()) {
305	1/2 ✓ Branch 1 taken 274 times. ✗ Branch 2 not taken.	274	ProcessTopFrame(stack, result);
306			}
307		4	return result;
308		4	}
309
310		8	Matrix PadTo(const Matrix &src, size_t n, size_t new_n) {
311	1/2 ✓ Branch 0 taken 8 times. ✗ Branch 1 not taken.	8	if (new_n == n) {
312		8	return src;
313			}
314		✗	Matrix padded(new_n * new_n, 0.0);
315		✗	#pragma omp parallel for default(none) shared(padded, src, n, new_n) schedule(static) if (new_n >= kParallelThreshold)
316			for (size_t i = 0; i < n; ++i) {
317			const size_t is = i * n;
318			const size_t id = i * new_n;
319			for (size_t j = 0; j < n; ++j) {
320			padded.at(id + j) = src.at(is + j);
321			}
322			}
323			return padded;
324			}
325
326			} // namespace
327
328		6	bool AkhmetovDStrassenDenseDoubleALL::RunImpl() {
329			const auto &input = GetInput();
330			auto &output = GetOutput();
331
332		6	const size_t n = format::GetN(input);
333		6	const Matrix a = format::GetA(input);
334	1/2 ✓ Branch 1 taken 6 times. ✗ Branch 2 not taken.	6	const Matrix b = format::GetB(input);
335
336	2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 4 times.	6	if (n <= kThreshold) {
337	1/4 ✓ Branch 1 taken 2 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	2	output = StandardMultiply(a, b, n);
338		2	return true;
339			}
340
341			const size_t new_n = NextPow2(n);
342	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	const Matrix a_padded = PadTo(a, n, new_n);
343	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	const Matrix b_padded = PadTo(b, n, new_n);
344
345	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	const Matrix result_padded = StrassenMultiply(a_padded, b_padded, new_n);
346
347	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	output.assign(n * n, 0.0);
348		4	#pragma omp parallel for default(none) shared(output, result_padded, n, new_n) \
349			schedule(static) if (n >= kParallelThreshold)
350			for (size_t i = 0; i < n; ++i) {
351			const size_t is = i * new_n;
352			const size_t id = i * n;
353			for (size_t j = 0; j < n; ++j) {
354			output.at(id + j) = result_padded.at(is + j);
355			}
356			}
357
358	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	const int num_threads = ppc::util::GetNumThreads();
359		4	int rank = 0;
360	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
361
362	2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 2 times.	4	if (rank == 0) {
363		2	std::atomic<int> counter(0);
364	1/2 ✓ Branch 1 taken 2 times. ✗ Branch 2 not taken.	2	#pragma omp parallel default(none) shared(counter) num_threads(ppc::util::GetNumThreads())
365			counter++;
366			}
367
368			{
369	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	std::vector<std::thread> threads(num_threads);
370		4	std::atomic<int> counter(0);
371	2/2 ✓ Branch 0 taken 8 times. ✓ Branch 1 taken 4 times.	12	for (int i = 0; i < num_threads; i++) {
372	2/4 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✓ Branch 4 taken 8 times.	8	threads.at(i) = std::thread([&]() { counter++; });
373	1/2 ✓ Branch 1 taken 8 times. ✗ Branch 2 not taken.	8	threads.at(i).join();
374			}
375		4	}
376
377			{
378		4	std::atomic<int> counter(0);
379	1/4 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken. ✗ Branch 3 not taken. ✗ Branch 4 not taken.	12	tbb::parallel_for(0, num_threads, [&](int /i/) { counter++; });
380			}
381
382	1/2 ✓ Branch 1 taken 4 times. ✗ Branch 2 not taken.	4	MPI_Barrier(MPI_COMM_WORLD);
383			return true;
384			}
385
386		6	bool AkhmetovDStrassenDenseDoubleALL::PostProcessingImpl() {
387			const auto &input = GetInput();
388		6	const size_t n = format::GetN(input);
389		6	return GetOutput().size() == n * n;
390			}
391
392			} // namespace akhmetov_daniil_strassen_dense_double
393