| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "sinev_a_mult_matrix_fox_algorithm/stl/include/ops_stl.hpp" | ||
| 2 | |||
| 3 | #include <atomic> | ||
| 4 | #include <cmath> | ||
| 5 | #include <cstddef> | ||
| 6 | #include <thread> | ||
| 7 | #include <vector> | ||
| 8 | |||
| 9 | #include "sinev_a_mult_matrix_fox_algorithm/common/include/common.hpp" | ||
| 10 | |||
| 11 | namespace sinev_a_mult_matrix_fox_algorithm { | ||
| 12 | |||
| 13 |
1/2✓ Branch 1 taken 104 times.
✗ Branch 2 not taken.
|
104 | SinevAMultMatrixFoxAlgorithmSTL::SinevAMultMatrixFoxAlgorithmSTL(const InType &in) { |
| 14 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 15 | GetInput() = in; | ||
| 16 | GetOutput() = {}; | ||
| 17 | 104 | } | |
| 18 | |||
| 19 | 104 | bool SinevAMultMatrixFoxAlgorithmSTL::ValidationImpl() { | |
| 20 | const auto &[matrix_size, matrix_a, matrix_b] = GetInput(); | ||
| 21 |
3/6✓ Branch 0 taken 104 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 104 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 104 times.
|
104 | return matrix_size > 0 && matrix_a.size() == matrix_size * matrix_size && |
| 22 | 104 | matrix_b.size() == matrix_size * matrix_size; | |
| 23 | } | ||
| 24 | |||
| 25 | 104 | bool SinevAMultMatrixFoxAlgorithmSTL::PreProcessingImpl() { | |
| 26 | const auto &[matrix_size, matrix_a, matrix_b] = GetInput(); | ||
| 27 | 104 | GetOutput() = std::vector<double>(matrix_size * matrix_size, 0.0); | |
| 28 | 104 | return true; | |
| 29 | } | ||
| 30 | |||
| 31 | 104 | void SinevAMultMatrixFoxAlgorithmSTL::SimpleMultiply(size_t n, const std::vector<double> &a, | |
| 32 | const std::vector<double> &b, std::vector<double> &c) { | ||
| 33 |
2/2✓ Branch 0 taken 2592 times.
✓ Branch 1 taken 104 times.
|
2696 | for (size_t i = 0; i < n; ++i) { |
| 34 |
2/2✓ Branch 0 taken 157056 times.
✓ Branch 1 taken 2592 times.
|
159648 | for (size_t k = 0; k < n; ++k) { |
| 35 | 157056 | double tmp = a[(i * n) + k]; | |
| 36 |
2/2✓ Branch 0 taken 12599664 times.
✓ Branch 1 taken 157056 times.
|
12756720 | for (size_t j = 0; j < n; ++j) { |
| 37 | 12599664 | c[(i * n) + j] += tmp * b[(k * n) + j]; | |
| 38 | } | ||
| 39 | } | ||
| 40 | } | ||
| 41 | 104 | } | |
| 42 | |||
| 43 | ✗ | void SinevAMultMatrixFoxAlgorithmSTL::DecomposeToBlocks(const std::vector<double> &src, std::vector<double> &dst, | |
| 44 | size_t n, size_t bs, int q) { | ||
| 45 | ✗ | unsigned int num_threads = std::thread::hardware_concurrency(); | |
| 46 | ✗ | if (num_threads == 0) { | |
| 47 | num_threads = 2; | ||
| 48 | } | ||
| 49 | |||
| 50 | ✗ | std::vector<std::thread> threads; | |
| 51 | ✗ | threads.reserve(num_threads); | |
| 52 | ✗ | std::atomic<size_t> next_block(0); | |
| 53 | ✗ | size_t total_blocks = static_cast<size_t>(q) * static_cast<size_t>(q); | |
| 54 | |||
| 55 | ✗ | for (unsigned int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { | |
| 56 | ✗ | threads.emplace_back([&]() { | |
| 57 | size_t block_idx = 0; | ||
| 58 | ✗ | while ((block_idx = next_block.fetch_add(1)) < total_blocks) { | |
| 59 | ✗ | int bi = static_cast<int>(block_idx / q); | |
| 60 | ✗ | int bj = static_cast<int>(block_idx % q); | |
| 61 | |||
| 62 | ✗ | const size_t block_off = block_idx * (bs * bs); | |
| 63 | ✗ | for (size_t i = 0; i < bs; ++i) { | |
| 64 | ✗ | for (size_t j = 0; j < bs; ++j) { | |
| 65 | ✗ | const size_t src_idx = ((static_cast<size_t>(bi) * bs + i) * n) + (static_cast<size_t>(bj) * bs + j); | |
| 66 | ✗ | const size_t dst_idx = block_off + (i * bs) + j; | |
| 67 | ✗ | dst[dst_idx] = src[src_idx]; | |
| 68 | } | ||
| 69 | } | ||
| 70 | } | ||
| 71 | ✗ | }); | |
| 72 | } | ||
| 73 | |||
| 74 | ✗ | for (auto &thread : threads) { | |
| 75 | ✗ | thread.join(); | |
| 76 | } | ||
| 77 | ✗ | } | |
| 78 | |||
| 79 | ✗ | void SinevAMultMatrixFoxAlgorithmSTL::AssembleFromBlocks(const std::vector<double> &src, std::vector<double> &dst, | |
| 80 | size_t n, size_t bs, int q) { | ||
| 81 | ✗ | unsigned int num_threads = std::thread::hardware_concurrency(); | |
| 82 | ✗ | if (num_threads == 0) { | |
| 83 | num_threads = 2; | ||
| 84 | } | ||
| 85 | |||
| 86 | ✗ | std::vector<std::thread> threads; | |
| 87 | ✗ | threads.reserve(num_threads); | |
| 88 | ✗ | std::atomic<size_t> next_block(0); | |
| 89 | ✗ | size_t total_blocks = static_cast<size_t>(q) * static_cast<size_t>(q); | |
| 90 | |||
| 91 | ✗ | for (unsigned int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { | |
| 92 | ✗ | threads.emplace_back([&]() { | |
| 93 | size_t block_idx = 0; | ||
| 94 | ✗ | while ((block_idx = next_block.fetch_add(1)) < total_blocks) { | |
| 95 | ✗ | int bi = static_cast<int>(block_idx / q); | |
| 96 | ✗ | int bj = static_cast<int>(block_idx % q); | |
| 97 | |||
| 98 | ✗ | const size_t block_off = block_idx * (bs * bs); | |
| 99 | ✗ | for (size_t i = 0; i < bs; ++i) { | |
| 100 | ✗ | for (size_t j = 0; j < bs; ++j) { | |
| 101 | ✗ | const size_t src_idx = block_off + (i * bs) + j; | |
| 102 | ✗ | const size_t dst_idx = ((static_cast<size_t>(bi) * bs + i) * n) + (static_cast<size_t>(bj) * bs + j); | |
| 103 | ✗ | dst[dst_idx] = src[src_idx]; | |
| 104 | } | ||
| 105 | } | ||
| 106 | } | ||
| 107 | ✗ | }); | |
| 108 | } | ||
| 109 | |||
| 110 | ✗ | for (auto &thread : threads) { | |
| 111 | ✗ | thread.join(); | |
| 112 | } | ||
| 113 | ✗ | } | |
| 114 | |||
| 115 | ✗ | void SinevAMultMatrixFoxAlgorithmSTL::MultiplyBlocks(const std::vector<double> &blocks_a, | |
| 116 | const std::vector<double> &blocks_b, std::vector<double> &blocks_c, | ||
| 117 | size_t bs, size_t a_off, size_t b_off, size_t c_off) { | ||
| 118 | ✗ | for (size_t ii = 0; ii < bs; ++ii) { | |
| 119 | ✗ | for (size_t kk = 0; kk < bs; ++kk) { | |
| 120 | ✗ | const double val = blocks_a[a_off + (ii * bs) + kk]; | |
| 121 | ✗ | const size_t b_base = b_off + (kk * bs); | |
| 122 | ✗ | const size_t c_base = c_off + (ii * bs); | |
| 123 | ✗ | for (size_t jj = 0; jj < bs; ++jj) { | |
| 124 | ✗ | blocks_c[c_base + jj] += val * blocks_b[b_base + jj]; | |
| 125 | } | ||
| 126 | } | ||
| 127 | } | ||
| 128 | ✗ | } | |
| 129 | |||
| 130 | ✗ | void SinevAMultMatrixFoxAlgorithmSTL::FoxStep(const std::vector<double> &blocks_a, const std::vector<double> &blocks_b, | |
| 131 | std::vector<double> &blocks_c, size_t bs, int q, int step) { | ||
| 132 | ✗ | const size_t block_size = bs * bs; | |
| 133 | ✗ | unsigned int num_threads = std::thread::hardware_concurrency(); | |
| 134 | ✗ | if (num_threads == 0) { | |
| 135 | num_threads = 2; | ||
| 136 | } | ||
| 137 | |||
| 138 | ✗ | std::vector<std::thread> threads; | |
| 139 | ✗ | threads.reserve(num_threads); | |
| 140 | ✗ | std::atomic<size_t> next_cell(0); | |
| 141 | ✗ | size_t total_cells = static_cast<size_t>(q) * static_cast<size_t>(q); | |
| 142 | |||
| 143 | ✗ | for (unsigned int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { | |
| 144 | ✗ | threads.emplace_back([&]() { | |
| 145 | size_t cell_idx = 0; | ||
| 146 | ✗ | while ((cell_idx = next_cell.fetch_add(1)) < total_cells) { | |
| 147 | ✗ | int i = static_cast<int>(cell_idx / q); | |
| 148 | ✗ | int j = static_cast<int>(cell_idx % q); | |
| 149 | ✗ | const int k = (i + step) % q; | |
| 150 | |||
| 151 | ✗ | const size_t a_off = (static_cast<size_t>((i * q) + k)) * block_size; | |
| 152 | ✗ | const size_t b_off = (static_cast<size_t>((k * q) + j)) * block_size; | |
| 153 | ✗ | const size_t c_off = (static_cast<size_t>((i * q) + j)) * block_size; | |
| 154 | |||
| 155 | ✗ | MultiplyBlocks(blocks_a, blocks_b, blocks_c, bs, a_off, b_off, c_off); | |
| 156 | } | ||
| 157 | ✗ | }); | |
| 158 | } | ||
| 159 | |||
| 160 | ✗ | for (auto &thread : threads) { | |
| 161 | ✗ | thread.join(); | |
| 162 | } | ||
| 163 | ✗ | } | |
| 164 | |||
| 165 | 104 | bool SinevAMultMatrixFoxAlgorithmSTL::RunImpl() { | |
| 166 | const auto &input = GetInput(); | ||
| 167 |
2/2✓ Branch 0 taken 88 times.
✓ Branch 1 taken 16 times.
|
104 | const size_t n = std::get<0>(input); |
| 168 | const auto &a = std::get<1>(input); | ||
| 169 | const auto &b = std::get<2>(input); | ||
| 170 | auto &c = GetOutput(); | ||
| 171 | |||
| 172 |
2/2✓ Branch 0 taken 88 times.
✓ Branch 1 taken 16 times.
|
104 | if (n <= 64) { |
| 173 | 88 | SimpleMultiply(n, a, b, c); | |
| 174 | 88 | return true; | |
| 175 | } | ||
| 176 | |||
| 177 | size_t bs = 64; | ||
| 178 |
3/4✓ Branch 0 taken 48 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 32 times.
✓ Branch 3 taken 16 times.
|
48 | while (n % bs != 0 && bs > 16) { |
| 179 | 32 | bs /= 2; | |
| 180 | } | ||
| 181 | |||
| 182 |
1/2✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
|
16 | if (n % bs != 0) { |
| 183 | 16 | SimpleMultiply(n, a, b, c); | |
| 184 | 16 | return true; | |
| 185 | } | ||
| 186 | |||
| 187 | ✗ | const int actual_q = static_cast<int>(n / bs); | |
| 188 | |||
| 189 | ✗ | const auto total_blocks = static_cast<size_t>(actual_q) * static_cast<size_t>(actual_q); | |
| 190 | ✗ | const auto block_elements = bs * bs; | |
| 191 | |||
| 192 | ✗ | std::vector<double> blocks_a(total_blocks * block_elements); | |
| 193 | ✗ | std::vector<double> blocks_b(total_blocks * block_elements); | |
| 194 | ✗ | std::vector<double> blocks_c(total_blocks * block_elements, 0.0); | |
| 195 | |||
| 196 | ✗ | DecomposeToBlocks(a, blocks_a, n, bs, actual_q); | |
| 197 | ✗ | DecomposeToBlocks(b, blocks_b, n, bs, actual_q); | |
| 198 | |||
| 199 | ✗ | for (int step = 0; step < actual_q; ++step) { | |
| 200 | ✗ | FoxStep(blocks_a, blocks_b, blocks_c, bs, actual_q, step); | |
| 201 | } | ||
| 202 | |||
| 203 | ✗ | AssembleFromBlocks(blocks_c, c, n, bs, actual_q); | |
| 204 | |||
| 205 | return true; | ||
| 206 | } | ||
| 207 | |||
| 208 | ✗ | size_t SinevAMultMatrixFoxAlgorithmSTL::ChooseBlockSize(size_t n) { | |
| 209 | ✗ | if (n % 128 == 0) { | |
| 210 | return 128; | ||
| 211 | } | ||
| 212 | ✗ | if (n % 64 == 0) { | |
| 213 | return 64; | ||
| 214 | } | ||
| 215 | ✗ | if (n % 32 == 0) { | |
| 216 | return 32; | ||
| 217 | } | ||
| 218 | ✗ | if (n % 16 == 0) { | |
| 219 | ✗ | return 16; | |
| 220 | } | ||
| 221 | return 1; | ||
| 222 | } | ||
| 223 | |||
| 224 | 104 | bool SinevAMultMatrixFoxAlgorithmSTL::PostProcessingImpl() { | |
| 225 | 104 | return true; | |
| 226 | } | ||
| 227 | |||
| 228 | } // namespace sinev_a_mult_matrix_fox_algorithm | ||
| 229 |