| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "remizov_k_dense_matrix_multiplication_cannon_algorithm/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <tbb/blocked_range2d.h> | ||
| 4 | #include <tbb/parallel_for.h> | ||
| 5 | |||
| 6 | #ifdef _OPENMP | ||
| 7 | # include <omp.h> | ||
| 8 | #endif | ||
| 9 | |||
| 10 | #include <algorithm> | ||
| 11 | #include <cstddef> | ||
| 12 | #include <thread> | ||
| 13 | #include <utility> | ||
| 14 | #include <vector> | ||
| 15 | |||
| 16 | #include "remizov_k_dense_matrix_multiplication_cannon_algorithm/common/include/common.hpp" | ||
| 17 | |||
| 18 | namespace remizov_k_dense_matrix_multiplication_cannon_algorithm { | ||
| 19 | |||
| 20 | 16 | RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::RemizovKDenseMatrixMultiplicationCannonAlgorithmAll( | |
| 21 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | const InType &in) { |
| 22 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 23 | GetInput() = in; | ||
| 24 | 16 | } | |
| 25 | |||
| 26 | 16 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::ValidationImpl() { | |
| 27 | const auto &input_data = GetInput(); | ||
| 28 | |||
| 29 | 16 | int block_dim = std::get<0>(input_data); | |
| 30 | const auto &mat_a = std::get<1>(input_data); | ||
| 31 | const auto &mat_b = std::get<2>(input_data); | ||
| 32 | |||
| 33 |
1/2✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
|
16 | if (block_dim <= 0) { |
| 34 | return false; | ||
| 35 | } | ||
| 36 |
2/4✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 16 times.
✗ Branch 3 not taken.
|
16 | if (mat_a.empty() || mat_b.empty()) { |
| 37 | return false; | ||
| 38 | } | ||
| 39 | |||
| 40 | size_t n = mat_a.size(); | ||
| 41 |
1/2✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
|
16 | if (n != mat_a[0].size()) { |
| 42 | return false; | ||
| 43 | } | ||
| 44 |
2/4✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 16 times.
✗ Branch 3 not taken.
|
16 | if (n != mat_b.size() || n != mat_b[0].size()) { |
| 45 | return false; | ||
| 46 | } | ||
| 47 | |||
| 48 | 16 | return (n % static_cast<size_t>(block_dim) == 0); | |
| 49 | } | ||
| 50 | |||
| 51 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
|
16 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::PreProcessingImpl() { |
| 52 | GetOutput().clear(); | ||
| 53 | 16 | return true; | |
| 54 | } | ||
| 55 | |||
| 56 | ✗ | void RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::MultiplyBlock(const std::vector<std::vector<double>> &a, | |
| 57 | const std::vector<std::vector<double>> &b, | ||
| 58 | std::vector<std::vector<double>> &c, | ||
| 59 | int block_size) { | ||
| 60 | #ifdef _OPENMP | ||
| 61 | 114 | # pragma omp parallel for collapse(2) schedule(static) default(none) shared(a, b, c, block_size) | |
| 62 | #endif | ||
| 63 | for (int i = 0; i < block_size; ++i) { | ||
| 64 | for (int j = 0; j < block_size; ++j) { | ||
| 65 | double acc = 0.0; | ||
| 66 | for (int k = 0; k < block_size; ++k) { | ||
| 67 | acc += a[i][k] * b[k][j]; | ||
| 68 | } | ||
| 69 | c[i][j] += acc; | ||
| 70 | } | ||
| 71 | } | ||
| 72 | ✗ | } | |
| 73 | |||
| 74 | 14 | void RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::ShiftBlocksLeft( | |
| 75 | std::vector<std::vector<std::vector<std::vector<double>>>> &matrix_blocks, int block_count) { | ||
| 76 | 14 | const unsigned int num_threads = std::max(1U, std::thread::hardware_concurrency()); | |
| 77 | 14 | std::vector<std::thread> threads; | |
| 78 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | threads.reserve(num_threads); |
| 79 | |||
| 80 | 14 | const int rows_per_thread = (block_count + static_cast<int>(num_threads) - 1) / static_cast<int>(num_threads); | |
| 81 |
1/2✓ Branch 0 taken 42 times.
✗ Branch 1 not taken.
|
42 | for (unsigned int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { |
| 82 | 42 | const int start = static_cast<int>(thread_idx) * rows_per_thread; | |
| 83 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | const int end = std::min(start + rows_per_thread, block_count); |
| 84 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | if (start >= end) { |
| 85 | break; | ||
| 86 | } | ||
| 87 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | threads.emplace_back([&matrix_blocks, block_count, start, end]() { |
| 88 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
56 | for (int i = start; i < end; ++i) { |
| 89 | 28 | auto first = std::move(matrix_blocks[i][0]); | |
| 90 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
56 | for (int j = 1; j < block_count; ++j) { |
| 91 | 28 | matrix_blocks[i][j - 1] = std::move(matrix_blocks[i][j]); | |
| 92 | } | ||
| 93 | 28 | matrix_blocks[i][block_count - 1] = std::move(first); | |
| 94 | 28 | } | |
| 95 | 28 | }); | |
| 96 | } | ||
| 97 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (auto &th : threads) { |
| 98 |
1/2✓ Branch 0 taken 28 times.
✗ Branch 1 not taken.
|
28 | if (th.joinable()) { |
| 99 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | th.join(); |
| 100 | } | ||
| 101 | } | ||
| 102 | 14 | } | |
| 103 | |||
| 104 | 14 | void RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::ShiftBlocksUp( | |
| 105 | std::vector<std::vector<std::vector<std::vector<double>>>> &matrix_blocks, int block_count) { | ||
| 106 | 14 | const unsigned int num_threads = std::max(1U, std::thread::hardware_concurrency()); | |
| 107 | 14 | std::vector<std::thread> threads; | |
| 108 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | threads.reserve(num_threads); |
| 109 | |||
| 110 | 14 | const int cols_per_thread = (block_count + static_cast<int>(num_threads) - 1) / static_cast<int>(num_threads); | |
| 111 |
1/2✓ Branch 0 taken 42 times.
✗ Branch 1 not taken.
|
42 | for (unsigned int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { |
| 112 | 42 | const int start = static_cast<int>(thread_idx) * cols_per_thread; | |
| 113 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | const int end = std::min(start + cols_per_thread, block_count); |
| 114 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | if (start >= end) { |
| 115 | break; | ||
| 116 | } | ||
| 117 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | threads.emplace_back([&matrix_blocks, block_count, start, end]() { |
| 118 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
56 | for (int j = start; j < end; ++j) { |
| 119 | 28 | auto first = std::move(matrix_blocks[0][j]); | |
| 120 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
56 | for (int i = 1; i < block_count; ++i) { |
| 121 | 28 | matrix_blocks[i - 1][j] = std::move(matrix_blocks[i][j]); | |
| 122 | } | ||
| 123 | 28 | matrix_blocks[block_count - 1][j] = std::move(first); | |
| 124 | 28 | } | |
| 125 | 28 | }); | |
| 126 | } | ||
| 127 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (auto &th : threads) { |
| 128 |
1/2✓ Branch 0 taken 28 times.
✗ Branch 1 not taken.
|
28 | if (th.joinable()) { |
| 129 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | th.join(); |
| 130 | } | ||
| 131 | } | ||
| 132 | 14 | } | |
| 133 | |||
| 134 | 16 | void RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::RunCannonCycle( | |
| 135 | std::vector<std::vector<std::vector<std::vector<double>>>> &a_blocks, | ||
| 136 | std::vector<std::vector<std::vector<std::vector<double>>>> &b_blocks, | ||
| 137 | std::vector<std::vector<std::vector<std::vector<double>>>> &c_blocks, int block_size, int block_count) { | ||
| 138 |
2/2✓ Branch 0 taken 30 times.
✓ Branch 1 taken 16 times.
|
46 | for (int step = 0; step < block_count; ++step) { |
| 139 | 30 | tbb::parallel_for(tbb::blocked_range2d<int>(0, block_count, 0, block_count), | |
| 140 | 144 | [&](const tbb::blocked_range2d<int> &r) { | |
| 141 |
2/2✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
|
228 | for (int i = r.rows().begin(); i != r.rows().end(); ++i) { |
| 142 |
2/2✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
|
228 | for (int j = r.cols().begin(); j != r.cols().end(); ++j) { |
| 143 | 114 | MultiplyBlock(a_blocks[i][j], b_blocks[i][j], c_blocks[i][j], block_size); | |
| 144 | } | ||
| 145 | } | ||
| 146 | 114 | }); | |
| 147 | |||
| 148 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 16 times.
|
30 | if (step < block_count - 1) { |
| 149 | 14 | ShiftBlocksLeft(a_blocks, block_count); | |
| 150 | 14 | ShiftBlocksUp(b_blocks, block_count); | |
| 151 | } | ||
| 152 | } | ||
| 153 | 16 | } | |
| 154 | |||
| 155 | 16 | void RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::InitializeBlocks( | |
| 156 | const std::vector<std::vector<double>> &matrix_a, const std::vector<std::vector<double>> &matrix_b, | ||
| 157 | std::vector<std::vector<std::vector<std::vector<double>>>> &a_blocks, | ||
| 158 | std::vector<std::vector<std::vector<std::vector<double>>>> &b_blocks, int block_size, int block_count) { | ||
| 159 | 74 | tbb::parallel_for(tbb::blocked_range2d<int>(0, block_count, 0, block_count), [&](const tbb::blocked_range2d<int> &r) { | |
| 160 |
2/2✓ Branch 0 taken 58 times.
✓ Branch 1 taken 58 times.
|
116 | for (int i = r.rows().begin(); i != r.rows().end(); ++i) { |
| 161 |
2/2✓ Branch 0 taken 58 times.
✓ Branch 1 taken 58 times.
|
116 | for (int j = r.cols().begin(); j != r.cols().end(); ++j) { |
| 162 | 58 | const int shift = (i + j) % block_count; | |
| 163 |
2/2✓ Branch 0 taken 126 times.
✓ Branch 1 taken 58 times.
|
184 | for (int bi = 0; bi < block_size; ++bi) { |
| 164 |
2/2✓ Branch 0 taken 330 times.
✓ Branch 1 taken 126 times.
|
456 | for (int bj = 0; bj < block_size; ++bj) { |
| 165 | 330 | a_blocks[i][j][bi][bj] = matrix_a[(i * block_size) + bi][(shift * block_size) + bj]; | |
| 166 | 330 | b_blocks[i][j][bi][bj] = matrix_b[(shift * block_size) + bi][(j * block_size) + bj]; | |
| 167 | } | ||
| 168 | } | ||
| 169 | } | ||
| 170 | } | ||
| 171 | 58 | }); | |
| 172 | 16 | } | |
| 173 | |||
| 174 | 16 | void RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::AssembleOutput( | |
| 175 | std::vector<std::vector<std::vector<std::vector<double>>>> &c_blocks, std::vector<std::vector<double>> &output, | ||
| 176 | int block_size, int block_count) { | ||
| 177 | 74 | tbb::parallel_for(tbb::blocked_range2d<int>(0, block_count, 0, block_count), [&](const tbb::blocked_range2d<int> &r) { | |
| 178 |
2/2✓ Branch 0 taken 58 times.
✓ Branch 1 taken 58 times.
|
116 | for (int i = r.rows().begin(); i != r.rows().end(); ++i) { |
| 179 |
2/2✓ Branch 0 taken 58 times.
✓ Branch 1 taken 58 times.
|
116 | for (int j = r.cols().begin(); j != r.cols().end(); ++j) { |
| 180 |
2/2✓ Branch 0 taken 126 times.
✓ Branch 1 taken 58 times.
|
184 | for (int bi = 0; bi < block_size; ++bi) { |
| 181 |
2/2✓ Branch 0 taken 330 times.
✓ Branch 1 taken 126 times.
|
456 | for (int bj = 0; bj < block_size; ++bj) { |
| 182 | 330 | output[(i * block_size) + bi][(j * block_size) + bj] = c_blocks[i][j][bi][bj]; | |
| 183 | } | ||
| 184 | } | ||
| 185 | } | ||
| 186 | } | ||
| 187 | 58 | }); | |
| 188 | 16 | } | |
| 189 | |||
| 190 | 16 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::RunImpl() { | |
| 191 | const auto ¶ms = GetInput(); | ||
| 192 | |||
| 193 | 16 | const int block_dim = std::get<0>(params); | |
| 194 | const auto &source_a = std::get<1>(params); | ||
| 195 | const auto &source_b = std::get<2>(params); | ||
| 196 | |||
| 197 | 16 | const int matrix_size = static_cast<int>(source_a.size()); | |
| 198 | 16 | const int blocks_per_dim = matrix_size / block_dim; | |
| 199 | |||
| 200 | using Block4D = std::vector<std::vector<std::vector<std::vector<double>>>>; | ||
| 201 | 16 | Block4D blocks_a(blocks_per_dim, std::vector<std::vector<std::vector<double>>>( | |
| 202 | 16 | blocks_per_dim, std::vector<std::vector<double>>( | |
| 203 |
3/6✓ Branch 2 taken 16 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 16 times.
✗ Branch 6 not taken.
✓ Branch 8 taken 16 times.
✗ Branch 9 not taken.
|
16 | block_dim, std::vector<double>(block_dim, 0.0)))); |
| 204 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | Block4D blocks_b = blocks_a; |
| 205 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | Block4D blocks_c = blocks_a; |
| 206 | |||
| 207 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | InitializeBlocks(source_a, source_b, blocks_a, blocks_b, block_dim, blocks_per_dim); |
| 208 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | RunCannonCycle(blocks_a, blocks_b, blocks_c, block_dim, blocks_per_dim); |
| 209 | |||
| 210 |
2/4✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 16 times.
✗ Branch 5 not taken.
|
16 | std::vector<std::vector<double>> result(matrix_size, std::vector<double>(matrix_size, 0.0)); |
| 211 |
1/2✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
|
16 | AssembleOutput(blocks_c, result, block_dim, blocks_per_dim); |
| 212 | |||
| 213 | 16 | GetOutput() = std::move(result); | |
| 214 | 16 | return true; | |
| 215 | 16 | } | |
| 216 | |||
| 217 | 16 | bool RemizovKDenseMatrixMultiplicationCannonAlgorithmAll::PostProcessingImpl() { | |
| 218 | 16 | return true; | |
| 219 | } | ||
| 220 | |||
| 221 | } // namespace remizov_k_dense_matrix_multiplication_cannon_algorithm | ||
| 222 |