| Line | Branch | Exec | Source |
|---|---|---|---|
#include "remizov_k_dense_matrix_multiplication_cannon_algorithm/stl/include/ops_stl.hpp"

#include <algorithm>
#include <cstddef>
#include <exception>
#include <thread>
#include <utility>
#include <vector>

#include "remizov_k_dense_matrix_multiplication_cannon_algorithm/common/include/common.hpp"
| 10 | |||
| 11 | namespace remizov_k_dense_matrix_multiplication_cannon_algorithm { | ||
| 12 | |||
| 13 | namespace { | ||
| 14 | |||
// Runs func(i) for every i in [begin, end), splitting the range into contiguous
// chunks that are each processed by their own std::thread.
//
//  - An empty or inverted range (end <= begin) is a no-op.
//  - At most max(1, std::thread::hardware_concurrency()) workers are started,
//    and never more than there are iterations.
//  - The call blocks until every started worker has been joined.
//  - If func throws inside a worker, the rest of that worker's chunk is skipped,
//    all workers are still joined, and the exception of the lowest-indexed
//    failing chunk is rethrown on the calling thread (an exception escaping a
//    thread entry point would otherwise call std::terminate).
//  - If a worker cannot be started, the workers that are already running are
//    joined before the error propagates (destroying a joinable std::thread
//    would otherwise call std::terminate).
//
// func must be safe to call concurrently for different indices.
template <typename IndexType, typename Func>
void ParallelFor(IndexType begin, IndexType end, const Func &func) {
  if (end <= begin) {
    return;
  }
  const IndexType range_length = end - begin;

  const std::size_t hardware_threads =
      std::max(static_cast<std::size_t>(1U), static_cast<std::size_t>(std::thread::hardware_concurrency()));
  // More workers than iterations are never useful; clamping also guarantees
  // that the worker count is representable in IndexType.
  const auto worker_count = static_cast<IndexType>(
      std::min(static_cast<unsigned long long>(hardware_threads), static_cast<unsigned long long>(range_length)));
  // Ceiling division written so that it cannot overflow IndexType.
  const auto chunk_size =
      static_cast<IndexType>((range_length / worker_count) + ((range_length % worker_count) != 0 ? 1 : 0));

  // One slot per potential worker; the vector is never resized after the
  // workers start, so the pointers handed to them stay valid.
  std::vector<std::exception_ptr> failures(static_cast<std::size_t>(worker_count));
  std::vector<std::thread> workers;
  workers.reserve(failures.size());

  const auto join_all = [&workers]() {
    for (auto &worker : workers) {
      if (worker.joinable()) {
        worker.join();
      }
    }
  };

  try {
    IndexType start = begin;
    for (std::size_t slot = 0; slot < failures.size() && start < end; ++slot) {
      // Compare against the remaining length so start + chunk_size cannot overflow.
      const IndexType chunk_end = ((end - start) > chunk_size) ? static_cast<IndexType>(start + chunk_size) : end;
      std::exception_ptr *failure = &failures[slot];
      workers.emplace_back([start, chunk_end, &func, failure]() {
        try {
          for (IndexType i = start; i < chunk_end; ++i) {
            func(i);
          }
        } catch (...) {
          *failure = std::current_exception();
        }
      });
      start = chunk_end;
    }
  } catch (...) {
    join_all();
    throw;
  }

  join_all();

  for (const auto &failure : failures) {
    if (failure) {
      std::rethrow_exception(failure);
    }
  }
}
| 50 | |||
// Invokes func(i, j) for every pair with i in [rows_begin, rows_end) and
// j in [cols_begin, cols_end). The 2-D index space is flattened row-major into
// one linear range and dispatched through ParallelFor, so func must be safe to
// call concurrently for different (i, j) pairs.
//
// Nothing is executed when either extent is empty or inverted.
template <typename Func>
void ParallelFor2D(int rows_begin, int rows_end, int cols_begin, int cols_end, const Func &func) {
  const int rows = rows_end - rows_begin;
  const int cols = cols_end - cols_begin;
  // Test each extent on its own: two negative extents multiply to a positive
  // product and would otherwise slip past a check on rows * cols.
  if (rows <= 0 || cols <= 0) {
    return;
  }
  const int total = rows * cols;

  ParallelFor(0, total, [&](int linear_idx) {
    // Undo the row-major flattening.
    const int i = rows_begin + (linear_idx / cols);
    const int j = cols_begin + (linear_idx % cols);
    func(i, j);
  });
}
| 66 | |||
| 67 | } // namespace | ||
| 68 | |||
| 69 | 64 | RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::RemizovKDenseMatrixMultiplicationCannonAlgorithmStl( | |
| 70 |
1/2✓ Branch 1 taken 64 times.
✗ Branch 2 not taken.
|
64 | const InType &in) { |
| 71 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 72 | GetInput() = in; | ||
| 73 | 64 | } | |
| 74 | |||
| 75 | 64 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::ValidationImpl() { | |
| 76 | const auto &input_data = GetInput(); | ||
| 77 | 64 | int block_dim = std::get<0>(input_data); | |
| 78 | const auto &mat_a = std::get<1>(input_data); | ||
| 79 | const auto &mat_b = std::get<2>(input_data); | ||
| 80 | |||
| 81 |
1/2✓ Branch 0 taken 64 times.
✗ Branch 1 not taken.
|
64 | if (block_dim <= 0) { |
| 82 | return false; | ||
| 83 | } | ||
| 84 |
2/4✓ Branch 0 taken 64 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 64 times.
✗ Branch 3 not taken.
|
64 | if (mat_a.empty() || mat_b.empty()) { |
| 85 | return false; | ||
| 86 | } | ||
| 87 | |||
| 88 | size_t n = mat_a.size(); | ||
| 89 |
1/2✓ Branch 0 taken 64 times.
✗ Branch 1 not taken.
|
64 | if (n != mat_a[0].size()) { |
| 90 | return false; | ||
| 91 | } | ||
| 92 |
2/4✓ Branch 0 taken 64 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 64 times.
✗ Branch 3 not taken.
|
64 | if (n != mat_b.size() || n != mat_b[0].size()) { |
| 93 | return false; | ||
| 94 | } | ||
| 95 | |||
| 96 | 64 | return (n % static_cast<size_t>(block_dim) == 0); | |
| 97 | } | ||
| 98 | |||
| 99 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 64 times.
|
64 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::PreProcessingImpl() { |
| 100 | GetOutput().clear(); | ||
| 101 | 64 | return true; | |
| 102 | } | ||
| 103 | |||
| 104 | 456 | void RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::MultiplyBlock(const std::vector<std::vector<double>> &a, | |
| 105 | const std::vector<std::vector<double>> &b, | ||
| 106 | std::vector<std::vector<double>> &c, | ||
| 107 | int block_size) { | ||
| 108 |
2/2✓ Branch 0 taken 984 times.
✓ Branch 1 taken 456 times.
|
1440 | for (int i = 0; i < block_size; ++i) { |
| 109 |
2/2✓ Branch 0 taken 2568 times.
✓ Branch 1 taken 984 times.
|
3552 | for (int j = 0; j < block_size; ++j) { |
| 110 | double acc = 0.0; | ||
| 111 |
2/2✓ Branch 0 taken 7704 times.
✓ Branch 1 taken 2568 times.
|
10272 | for (int k = 0; k < block_size; ++k) { |
| 112 | 7704 | acc += a[i][k] * b[k][j]; | |
| 113 | } | ||
| 114 | 2568 | c[i][j] += acc; | |
| 115 | } | ||
| 116 | } | ||
| 117 | 456 | } | |
| 118 | |||
| 119 | ✗ | void RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::ShiftBlocksLeft( | |
| 120 | std::vector<std::vector<std::vector<std::vector<double>>>> &matrix_blocks, int block_count) { | ||
| 121 | 56 | ParallelFor(0, block_count, [&](int i) { | |
| 122 | 112 | auto first = std::move(matrix_blocks[i][0]); | |
| 123 |
2/2✓ Branch 0 taken 112 times.
✓ Branch 1 taken 112 times.
|
224 | for (int j = 1; j < block_count; ++j) { |
| 124 | 112 | matrix_blocks[i][j - 1] = std::move(matrix_blocks[i][j]); | |
| 125 | } | ||
| 126 | 112 | matrix_blocks[i][block_count - 1] = std::move(first); | |
| 127 | 112 | }); | |
| 128 | ✗ | } | |
| 129 | |||
| 130 | ✗ | void RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::ShiftBlocksUp( | |
| 131 | std::vector<std::vector<std::vector<std::vector<double>>>> &matrix_blocks, int block_count) { | ||
| 132 | ✗ | ParallelFor(0, block_count, [&](int j) { | |
| 133 | 112 | auto first = std::move(matrix_blocks[0][j]); | |
| 134 |
2/2✓ Branch 0 taken 112 times.
✓ Branch 1 taken 112 times.
|
224 | for (int i = 1; i < block_count; ++i) { |
| 135 | 112 | matrix_blocks[i - 1][j] = std::move(matrix_blocks[i][j]); | |
| 136 | } | ||
| 137 | 112 | matrix_blocks[block_count - 1][j] = std::move(first); | |
| 138 | 112 | }); | |
| 139 | ✗ | } | |
| 140 | |||
| 141 | 64 | void RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::RunCannonCycle( | |
| 142 | std::vector<std::vector<std::vector<std::vector<double>>>> &a_blocks, | ||
| 143 | std::vector<std::vector<std::vector<std::vector<double>>>> &b_blocks, | ||
| 144 | std::vector<std::vector<std::vector<std::vector<double>>>> &c_blocks, int block_size, int block_count) { | ||
| 145 |
2/2✓ Branch 0 taken 120 times.
✓ Branch 1 taken 64 times.
|
184 | for (int step = 0; step < block_count; ++step) { |
| 146 | 120 | ParallelFor2D(0, block_count, 0, block_count, | |
| 147 | 576 | [&](int i, int j) { MultiplyBlock(a_blocks[i][j], b_blocks[i][j], c_blocks[i][j], block_size); }); | |
| 148 | |||
| 149 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 64 times.
|
120 | if (step < block_count - 1) { |
| 150 | 56 | ShiftBlocksLeft(a_blocks, block_count); | |
| 151 | 56 | ShiftBlocksUp(b_blocks, block_count); | |
| 152 | } | ||
| 153 | } | ||
| 154 | 64 | } | |
| 155 | |||
| 156 | ✗ | void RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::InitializeBlocks( | |
| 157 | const std::vector<std::vector<double>> &matrix_a, const std::vector<std::vector<double>> &matrix_b, | ||
| 158 | std::vector<std::vector<std::vector<std::vector<double>>>> &a_blocks, | ||
| 159 | std::vector<std::vector<std::vector<std::vector<double>>>> &b_blocks, int block_size, int block_count) { | ||
| 160 | ✗ | ParallelFor2D(0, block_count, 0, block_count, [&](int i, int j) { | |
| 161 | 232 | int shift = (i + j) % block_count; | |
| 162 |
2/2✓ Branch 0 taken 504 times.
✓ Branch 1 taken 232 times.
|
736 | for (int bi = 0; bi < block_size; ++bi) { |
| 163 |
2/2✓ Branch 0 taken 1320 times.
✓ Branch 1 taken 504 times.
|
1824 | for (int bj = 0; bj < block_size; ++bj) { |
| 164 | 1320 | a_blocks[i][j][bi][bj] = matrix_a[(i * block_size) + bi][(shift * block_size) + bj]; | |
| 165 | 1320 | b_blocks[i][j][bi][bj] = matrix_b[(shift * block_size) + bi][(j * block_size) + bj]; | |
| 166 | } | ||
| 167 | } | ||
| 168 | 232 | }); | |
| 169 | ✗ | } | |
| 170 | |||
| 171 | ✗ | void RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::AssembleOutput( | |
| 172 | std::vector<std::vector<std::vector<std::vector<double>>>> &c_blocks, std::vector<std::vector<double>> &output, | ||
| 173 | int block_size, int block_count) { | ||
| 174 | ✗ | ParallelFor2D(0, block_count, 0, block_count, [&](int i, int j) { | |
| 175 |
2/2✓ Branch 0 taken 504 times.
✓ Branch 1 taken 232 times.
|
736 | for (int bi = 0; bi < block_size; ++bi) { |
| 176 |
2/2✓ Branch 0 taken 1320 times.
✓ Branch 1 taken 504 times.
|
1824 | for (int bj = 0; bj < block_size; ++bj) { |
| 177 | 1320 | output[(i * block_size) + bi][(j * block_size) + bj] = c_blocks[i][j][bi][bj]; | |
| 178 | } | ||
| 179 | } | ||
| 180 | 232 | }); | |
| 181 | ✗ | } | |
| 182 | |||
| 183 | 64 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::RunImpl() { | |
| 184 | const auto ¶ms = GetInput(); | ||
| 185 | 64 | int block_dim = std::get<0>(params); | |
| 186 | const auto &source_a = std::get<1>(params); | ||
| 187 | const auto &source_b = std::get<2>(params); | ||
| 188 | |||
| 189 | 64 | int matrix_size = static_cast<int>(source_a.size()); | |
| 190 | 64 | int blocks_per_dim = matrix_size / block_dim; | |
| 191 | |||
| 192 | using Block4D = std::vector<std::vector<std::vector<std::vector<double>>>>; | ||
| 193 | 64 | Block4D blocks_a(blocks_per_dim, std::vector<std::vector<std::vector<double>>>( | |
| 194 | 64 | blocks_per_dim, std::vector<std::vector<double>>( | |
| 195 |
3/6✓ Branch 2 taken 64 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 64 times.
✗ Branch 6 not taken.
✓ Branch 8 taken 64 times.
✗ Branch 9 not taken.
|
64 | block_dim, std::vector<double>(block_dim, 0.0)))); |
| 196 | 64 | Block4D blocks_b(blocks_per_dim, std::vector<std::vector<std::vector<double>>>( | |
| 197 | 64 | blocks_per_dim, std::vector<std::vector<double>>( | |
| 198 |
4/8✓ Branch 1 taken 64 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 64 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 64 times.
✗ Branch 8 not taken.
✓ Branch 10 taken 64 times.
✗ Branch 11 not taken.
|
64 | block_dim, std::vector<double>(block_dim, 0.0)))); |
| 199 | 64 | Block4D blocks_c(blocks_per_dim, std::vector<std::vector<std::vector<double>>>( | |
| 200 | 64 | blocks_per_dim, std::vector<std::vector<double>>( | |
| 201 |
5/10✓ Branch 1 taken 64 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 64 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 64 times.
✗ Branch 8 not taken.
✓ Branch 10 taken 64 times.
✗ Branch 11 not taken.
✓ Branch 13 taken 64 times.
✗ Branch 14 not taken.
|
128 | block_dim, std::vector<double>(block_dim, 0.0)))); |
| 202 | |||
| 203 | 64 | InitializeBlocks(source_a, source_b, blocks_a, blocks_b, block_dim, blocks_per_dim); | |
| 204 |
1/2✓ Branch 1 taken 64 times.
✗ Branch 2 not taken.
|
64 | RunCannonCycle(blocks_a, blocks_b, blocks_c, block_dim, blocks_per_dim); |
| 205 | |||
| 206 |
3/6✓ Branch 1 taken 64 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 64 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 64 times.
✗ Branch 8 not taken.
|
128 | std::vector<std::vector<double>> result(matrix_size, std::vector<double>(matrix_size, 0.0)); |
| 207 | 64 | AssembleOutput(blocks_c, result, block_dim, blocks_per_dim); | |
| 208 | |||
| 209 | 64 | GetOutput() = std::move(result); | |
| 210 | 64 | return true; | |
| 211 | 64 | } | |
| 212 | |||
| 213 | 64 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmStl::PostProcessingImpl() { | |
| 214 | 64 | return true; | |
| 215 | } | ||
| 216 | |||
| 217 | } // namespace remizov_k_dense_matrix_multiplication_cannon_algorithm | ||
| 218 |