| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "chyokotov_a_dense_matrix_mul_foxs_algorithm/stl/include/ops_stl.hpp" | ||
| 2 | |||
| 3 | #include <algorithm> | ||
| 4 | #include <cmath> | ||
| 5 | #include <cstddef> | ||
| 6 | #include <thread> | ||
| 7 | #include <utility> | ||
| 8 | #include <vector> | ||
| 9 | |||
| 10 | #include "chyokotov_a_dense_matrix_mul_foxs_algorithm/common/include/common.hpp" | ||
| 11 | #include "util/include/util.hpp" | ||
| 12 | |||
| 13 | namespace chyokotov_a_dense_matrix_mul_foxs_algorithm { | ||
| 14 | |||
| 15 |
1/2✓ Branch 1 taken 32 times.
✗ Branch 2 not taken.
|
32 | ChyokotovADenseMatMulFoxAlgorithmSTL::ChyokotovADenseMatMulFoxAlgorithmSTL(const InType &in) { |
| 16 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 17 | GetInput() = in; | ||
| 18 | GetOutput().clear(); | ||
| 19 | 32 | } | |
| 20 | |||
| 21 | 32 | bool ChyokotovADenseMatMulFoxAlgorithmSTL::ValidationImpl() { | |
| 22 | 32 | return (GetInput().first.size() == GetInput().second.size()); | |
| 23 | } | ||
| 24 | |||
| 25 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 32 times.
|
32 | bool ChyokotovADenseMatMulFoxAlgorithmSTL::PreProcessingImpl() { |
| 26 | GetOutput().clear(); | ||
| 27 | 32 | GetOutput().resize(GetInput().first.size(), 0.0); | |
| 28 | 32 | return true; | |
| 29 | } | ||
| 30 | |||
| 31 | ✗ | int ChyokotovADenseMatMulFoxAlgorithmSTL::CalculateBlockSize(int n) { | |
| 32 | 24 | return static_cast<int>(std::sqrt(static_cast<double>(n))); | |
| 33 | } | ||
| 34 | |||
| 35 | ✗ | int ChyokotovADenseMatMulFoxAlgorithmSTL::CountBlock(int n, int size) { | |
| 36 | 24 | return (n + size - 1) / size; | |
| 37 | } | ||
| 38 | |||
| 39 | 24 | std::vector<std::pair<int, int>> ChyokotovADenseMatMulFoxAlgorithmSTL::Blocks(int count_block) { | |
| 40 | 24 | std::vector<std::pair<int, int>> blocks; | |
| 41 |
2/2✓ Branch 0 taken 40 times.
✓ Branch 1 taken 24 times.
|
64 | for (int ic = 0; ic < count_block; ic++) { |
| 42 |
2/2✓ Branch 0 taken 72 times.
✓ Branch 1 taken 40 times.
|
112 | for (int jc = 0; jc < count_block; jc++) { |
| 43 |
1/2✓ Branch 1 taken 72 times.
✗ Branch 2 not taken.
|
72 | blocks.emplace_back(ic, jc); |
| 44 | } | ||
| 45 | } | ||
| 46 | 24 | return blocks; | |
| 47 | } | ||
| 48 | |||
| 49 | 136 | void ChyokotovADenseMatMulFoxAlgorithmSTL::Matmul(std::vector<double> &a, std::vector<double> &b, int n, int istart, | |
| 50 | int iend, int jstart, int jend, int kstart, int kend) { | ||
| 51 |
2/2✓ Branch 0 taken 200 times.
✓ Branch 1 taken 136 times.
|
336 | for (int i = istart; i < iend; i++) { |
| 52 |
2/2✓ Branch 0 taken 328 times.
✓ Branch 1 taken 200 times.
|
528 | for (int j = jstart; j < jend; j++) { |
| 53 | double sum = 0.0; | ||
| 54 |
2/2✓ Branch 0 taken 584 times.
✓ Branch 1 taken 328 times.
|
912 | for (int k = kstart; k < kend; k++) { |
| 55 | 584 | sum += a[(i * n) + k] * b[(k * n) + j]; | |
| 56 | } | ||
| 57 | 328 | GetOutput()[(i * n) + j] += sum; | |
| 58 | } | ||
| 59 | } | ||
| 60 | 136 | } | |
| 61 | |||
| 62 | 32 | bool ChyokotovADenseMatMulFoxAlgorithmSTL::RunImpl() { | |
| 63 | 32 | std::vector<double> a = GetInput().first; | |
| 64 |
1/2✓ Branch 1 taken 32 times.
✗ Branch 2 not taken.
|
32 | std::vector<double> b = GetInput().second; |
| 65 | 32 | int n = static_cast<int>(std::sqrt(static_cast<double>(a.size()))); | |
| 66 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 8 times.
|
32 | if (n == 0) { |
| 67 | return true; | ||
| 68 | } | ||
| 69 | |||
| 70 | 24 | int block_size = CalculateBlockSize(n); | |
| 71 | 24 | int count_block = CountBlock(n, block_size); | |
| 72 |
1/2✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
|
24 | std::vector<std::pair<int, int>> blocks = Blocks(count_block); |
| 73 | |||
| 74 |
1/2✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
|
24 | auto num_threads = static_cast<size_t>(ppc::util::GetNumThreads()); |
| 75 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 24 times.
|
24 | if (num_threads == 0) { |
| 76 | num_threads = 4; | ||
| 77 | } | ||
| 78 | |||
| 79 | 24 | std::vector<std::thread> threads; | |
| 80 | 24 | size_t blocks_per_thread = blocks.size() / num_threads; | |
| 81 | |||
| 82 |
2/2✓ Branch 0 taken 60 times.
✓ Branch 1 taken 24 times.
|
84 | for (size_t tt = 0; tt < num_threads; ++tt) { |
| 83 | 60 | size_t start_idx = tt * blocks_per_thread; | |
| 84 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 36 times.
|
60 | size_t end_idx = (tt == num_threads - 1) ? blocks.size() : start_idx + blocks_per_thread; |
| 85 | |||
| 86 |
1/2✓ Branch 1 taken 60 times.
✗ Branch 2 not taken.
|
60 | threads.emplace_back([&, start_idx, end_idx]() { |
| 87 |
2/2✓ Branch 0 taken 72 times.
✓ Branch 1 taken 60 times.
|
132 | for (size_t idx = start_idx; idx < end_idx; ++idx) { |
| 88 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 72 times.
|
72 | auto [ic, jc] = blocks[idx]; |
| 89 | |||
| 90 | 72 | int istart = ic * block_size; | |
| 91 | 72 | int jstart = jc * block_size; | |
| 92 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 72 times.
|
72 | int iend = std::min(istart + block_size, n); |
| 93 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 72 times.
|
72 | int jend = std::min(jstart + block_size, n); |
| 94 | |||
| 95 |
2/2✓ Branch 0 taken 136 times.
✓ Branch 1 taken 72 times.
|
208 | for (int kc = 0; kc < count_block; kc++) { |
| 96 | 136 | int kstart = kc * block_size; | |
| 97 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 136 times.
|
136 | int kend = std::min(kstart + block_size, n); |
| 98 | 136 | Matmul(a, b, n, istart, iend, jstart, jend, kstart, kend); | |
| 99 | } | ||
| 100 | } | ||
| 101 | 60 | }); | |
| 102 | } | ||
| 103 | |||
| 104 |
2/2✓ Branch 0 taken 60 times.
✓ Branch 1 taken 24 times.
|
84 | for (auto &thread : threads) { |
| 105 |
1/2✓ Branch 1 taken 60 times.
✗ Branch 2 not taken.
|
60 | thread.join(); |
| 106 | } | ||
| 107 | |||
| 108 | return true; | ||
| 109 | 24 | } | |
| 110 | |||
| 111 | 32 | bool ChyokotovADenseMatMulFoxAlgorithmSTL::PostProcessingImpl() { | |
| 112 | 32 | return true; | |
| 113 | } | ||
| 114 | |||
| 115 | } // namespace chyokotov_a_dense_matrix_mul_foxs_algorithm | ||
| 116 |