| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "cheremkhin_a_matr_mult_cannon_alg/stl/include/ops_stl.hpp" | ||
| 2 | |||
| 3 | #include <algorithm> | ||
| 4 | #include <cmath> | ||
| 5 | #include <cstddef> | ||
| 6 | #include <future> | ||
| 7 | #include <utility> | ||
| 8 | #include <vector> | ||
| 9 | |||
| 10 | #include "cheremkhin_a_matr_mult_cannon_alg/common/include/common.hpp" | ||
| 11 | #include "util/include/util.hpp" | ||
| 12 | |||
| 13 | namespace cheremkhin_a_matr_mult_cannon_alg { | ||
| 14 | |||
| 15 | namespace { | ||
| 16 | |||
| 17 | inline std::size_t Idx(std::size_t n, std::size_t r, std::size_t c) { | ||
| 18 | 6464 | return (r * n) + c; | |
| 19 | } | ||
| 20 | |||
| 21 | std::size_t ChooseQ(std::size_t n) { | ||
| 22 | 56 | if (n <= 1) { | |
| 23 | return 1; | ||
| 24 | } | ||
| 25 | |||
| 26 | 48 | const auto root = static_cast<std::size_t>(std::sqrt(static_cast<double>(n))); | |
| 27 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 48 times.
|
48 | return (root == 0) ? 1 : root; |
| 28 | } | ||
| 29 | |||
| 30 | std::size_t CeilDiv(std::size_t a, std::size_t b) { | ||
| 31 | 152 | return (a + b - 1) / b; | |
| 32 | } | ||
| 33 | |||
| 34 | 584 | void MulAddBlock(const std::vector<double> &a, const std::vector<double> &b, std::vector<double> &c, std::size_t n, | |
| 35 | std::size_t bs, std::size_t bi, std::size_t bk, std::size_t bj) { | ||
| 36 | 584 | const std::size_t i0 = bi * bs; | |
| 37 | 584 | const std::size_t k0 = bk * bs; | |
| 38 | 584 | const std::size_t j0 = bj * bs; | |
| 39 | |||
| 40 |
2/2✓ Branch 0 taken 2376 times.
✓ Branch 1 taken 584 times.
|
2960 | for (std::size_t ii = 0; ii < bs; ++ii) { |
| 41 | 2376 | const std::size_t i = i0 + ii; | |
| 42 | 2376 | const std::size_t a_row = i * n; | |
| 43 | const std::size_t c_row = i * n; | ||
| 44 | 2376 | double *c_block = c.data() + c_row + j0; | |
| 45 | |||
| 46 |
2/2✓ Branch 0 taken 10248 times.
✓ Branch 1 taken 2376 times.
|
12624 | for (std::size_t kk = 0; kk < bs; ++kk) { |
| 47 | 10248 | const std::size_t k = k0 + kk; | |
| 48 | 10248 | const double aik = a[a_row + k]; | |
| 49 | 10248 | const double *b_block = b.data() + (k * n) + j0; | |
| 50 |
2/2✓ Branch 0 taken 45720 times.
✓ Branch 1 taken 10248 times.
|
55968 | for (std::size_t jj = 0; jj < bs; ++jj) { |
| 51 | 45720 | c_block[jj] += aik * b_block[jj]; | |
| 52 | } | ||
| 53 | } | ||
| 54 | } | ||
| 55 | 584 | } | |
| 56 | |||
| 57 | template <class Func> | ||
| 58 | 336 | void ParallelFor(std::size_t count, std::size_t requested_threads, Func fn) { | |
| 59 |
1/2✓ Branch 0 taken 168 times.
✗ Branch 1 not taken.
|
336 | if (count == 0) { |
| 60 | 144 | return; | |
| 61 | } | ||
| 62 | |||
| 63 |
2/2✓ Branch 0 taken 96 times.
✓ Branch 1 taken 72 times.
|
336 | const std::size_t workers = std::max<std::size_t>(1, std::min(count, requested_threads)); |
| 64 |
2/2✓ Branch 0 taken 72 times.
✓ Branch 1 taken 96 times.
|
336 | if (workers == 1) { |
| 65 |
2/2✓ Branch 0 taken 256 times.
✓ Branch 1 taken 72 times.
|
656 | for (std::size_t idx = 0; idx < count; ++idx) { |
| 66 | 512 | fn(idx); | |
| 67 | } | ||
| 68 | return; | ||
| 69 | } | ||
| 70 | |||
| 71 | const std::size_t chunk = CeilDiv(count, workers); | ||
| 72 | 192 | std::vector<std::future<void>> tasks; | |
| 73 |
1/2✓ Branch 1 taken 96 times.
✗ Branch 2 not taken.
|
192 | tasks.reserve(workers); |
| 74 | |||
| 75 |
2/2✓ Branch 0 taken 272 times.
✓ Branch 1 taken 84 times.
|
712 | for (std::size_t worker = 0; worker < workers; ++worker) { |
| 76 | 544 | const std::size_t begin = worker * chunk; | |
| 77 |
2/2✓ Branch 0 taken 260 times.
✓ Branch 1 taken 12 times.
|
544 | const std::size_t end = std::min(begin + chunk, count); |
| 78 |
2/2✓ Branch 0 taken 260 times.
✓ Branch 1 taken 12 times.
|
544 | if (begin >= end) { |
| 79 | break; | ||
| 80 | } | ||
| 81 | |||
| 82 |
2/4✓ Branch 1 taken 260 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 260 times.
✗ Branch 5 not taken.
|
1040 | tasks.emplace_back(std::async(std::launch::async, [begin, end, &fn] { |
| 83 |
6/6✓ Branch 0 taken 246 times.
✓ Branch 1 taken 98 times.
✓ Branch 2 taken 156 times.
✓ Branch 3 taken 64 times.
✓ Branch 4 taken 246 times.
✓ Branch 5 taken 98 times.
|
908 | for (std::size_t idx = begin; idx < end; ++idx) { |
| 84 | 648 | fn(idx); | |
| 85 | } | ||
| 86 | })); | ||
| 87 | } | ||
| 88 | |||
| 89 |
2/2✓ Branch 0 taken 260 times.
✓ Branch 1 taken 96 times.
|
712 | for (auto &task : tasks) { |
| 90 |
1/2✓ Branch 1 taken 260 times.
✗ Branch 2 not taken.
|
520 | task.get(); |
| 91 | } | ||
| 92 | 192 | } | |
| 93 | |||
| 94 | } // namespace | ||
| 95 | |||
| 96 |
1/2✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
|
56 | CheremkhinAMatrMultCannonAlgSTL::CheremkhinAMatrMultCannonAlgSTL(const InType &in) { |
| 97 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 98 | GetInput() = in; | ||
| 99 | GetOutput() = {}; | ||
| 100 | 56 | } | |
| 101 | |||
| 102 | 56 | bool CheremkhinAMatrMultCannonAlgSTL::ValidationImpl() { | |
| 103 | 56 | const std::size_t n = std::get<0>(GetInput()); | |
| 104 | const auto &a = std::get<1>(GetInput()); | ||
| 105 | const auto &b = std::get<2>(GetInput()); | ||
| 106 |
3/6✓ Branch 0 taken 56 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 56 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 56 times.
|
56 | return n > 0 && a.size() == n * n && b.size() == n * n; |
| 107 | } | ||
| 108 | |||
| 109 | 56 | bool CheremkhinAMatrMultCannonAlgSTL::PreProcessingImpl() { | |
| 110 | GetOutput() = {}; | ||
| 111 | 56 | return true; | |
| 112 | } | ||
| 113 | |||
| 114 | 56 | bool CheremkhinAMatrMultCannonAlgSTL::RunImpl() { | |
| 115 | 56 | const std::size_t n = std::get<0>(GetInput()); | |
| 116 | const auto &a_in = std::get<1>(GetInput()); | ||
| 117 | const auto &b_in = std::get<2>(GetInput()); | ||
| 118 | 56 | const std::size_t threads = static_cast<std::size_t>(std::max(1, ppc::util::GetNumThreads())); | |
| 119 | |||
| 120 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 48 times.
|
56 | const std::size_t q = ChooseQ(n); |
| 121 | 56 | const std::size_t bs = CeilDiv(n, q); | |
| 122 | 56 | const std::size_t np = q * bs; | |
| 123 | |||
| 124 | 56 | std::vector<double> a(np * np, 0.0); | |
| 125 |
1/4✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
56 | std::vector<double> b(np * np, 0.0); |
| 126 |
1/4✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
56 | std::vector<double> c(np * np, 0.0); |
| 127 | |||
| 128 |
1/2✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
|
56 | ParallelFor(n, threads, [&](std::size_t i) { |
| 129 |
2/2✓ Branch 0 taken 3232 times.
✓ Branch 1 taken 336 times.
|
3568 | for (std::size_t j = 0; j < n; ++j) { |
| 130 | 3232 | a[Idx(np, i, j)] = a_in[Idx(n, i, j)]; | |
| 131 | 3232 | b[Idx(np, i, j)] = b_in[Idx(n, i, j)]; | |
| 132 | } | ||
| 133 | 336 | }); | |
| 134 | |||
| 135 |
1/2✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
|
56 | ParallelFor(q * q, threads, [&](std::size_t block_idx) { |
| 136 | 232 | const std::size_t bi = block_idx / q; | |
| 137 | 232 | const std::size_t bj = block_idx % q; | |
| 138 | |||
| 139 |
2/2✓ Branch 0 taken 584 times.
✓ Branch 1 taken 232 times.
|
816 | for (std::size_t step = 0; step < q; ++step) { |
| 140 | 584 | const std::size_t bk = (bi + bj + step) % q; | |
| 141 | 584 | MulAddBlock(a, b, c, np, bs, bi, bk, bj); | |
| 142 | } | ||
| 143 | 232 | }); | |
| 144 | |||
| 145 |
1/4✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
56 | std::vector<double> out(n * n, 0.0); |
| 146 |
1/2✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
|
56 | ParallelFor(n, threads, [&](std::size_t i) { |
| 147 |
2/2✓ Branch 0 taken 3232 times.
✓ Branch 1 taken 336 times.
|
3568 | for (std::size_t j = 0; j < n; ++j) { |
| 148 | 3232 | out[Idx(n, i, j)] = c[Idx(np, i, j)]; | |
| 149 | } | ||
| 150 | 336 | }); | |
| 151 | |||
| 152 | GetOutput() = std::move(out); | ||
| 153 | 56 | return true; | |
| 154 | } | ||
| 155 | |||
| 156 | 56 | bool CheremkhinAMatrMultCannonAlgSTL::PostProcessingImpl() { | |
| 157 | 56 | return true; | |
| 158 | } | ||
| 159 | |||
| 160 | } // namespace cheremkhin_a_matr_mult_cannon_alg | ||
| 161 |