| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "makoveeva_matmul_double/stl/include/ops_stl.hpp" | ||
| 2 | |||
| 3 | #include <cmath> | ||
| 4 | #include <cstddef> | ||
| 5 | #include <functional> | ||
| 6 | #include <mutex> | ||
| 7 | #include <thread> | ||
| 8 | #include <vector> | ||
| 9 | |||
| 10 | #include "makoveeva_matmul_double/stl/include/common.hpp" | ||
| 11 | |||
| 12 | namespace makoveeva_matmul_double_stl { | ||
| 13 | |||
| 14 | namespace { | ||
| 15 | |||
/// Chooses the tile edge length used for an n x n matrix.
///
/// Orders up to 64 are treated as a single tile (the result equals n); larger
/// orders map to the fixed tile sizes 64, 128 and 256.
[[nodiscard]] size_t SelectBlockSize(size_t n) {
  constexpr size_t kSmallLimit = 64;
  constexpr size_t kMediumLimit = 256;
  constexpr size_t kLargeLimit = 1024;

  if (n > kLargeLimit) {
    return 256;
  }
  if (n > kMediumLimit) {
    return 128;
  }
  return (n > kSmallLimit) ? 64 : n;
}
/// Computes rows [start_row, end_row) of C = A * B for row-major n x n matrices.
///
/// Each element in the selected rows of `c` is overwritten with the dot product of
/// the matching row of `a` and column of `b`; rows outside the range are not touched.
void SimpleMultiplyThread(const std::vector<double> &a, const std::vector<double> &b, std::vector<double> &c, size_t n,
                          size_t start_row, size_t end_row) {
  for (size_t row = start_row; row < end_row; ++row) {
    const size_t row_offset = row * n;
    for (size_t col = 0; col < n; ++col) {
      double acc = 0.0;
      size_t b_index = col;  // walks down column `col` of b, one row (n elements) per step
      for (size_t inner = 0; inner < n; ++inner, b_index += n) {
        acc += a[row_offset + inner] * b[b_index];
      }
      c[row_offset + col] = acc;
    }
  }
}
| 40 | |||
| 41 | } // namespace | ||
| 42 | |||
| 43 |
1/2✓ Branch 1 taken 96 times.
✗ Branch 2 not taken.
|
96 | MatmulDoubleSTLTask::MatmulDoubleSTLTask(const InType &in) { |
| 44 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 45 | GetInput() = in; | ||
| 46 | 96 | GetOutput() = std::vector<double>(); | |
| 47 | 96 | } | |
| 48 | |||
| 49 | 96 | bool MatmulDoubleSTLTask::ValidationImpl() { | |
| 50 | const auto &input = GetInput(); | ||
| 51 | 96 | const size_t n = std::get<0>(input); | |
| 52 | const auto &a = std::get<1>(input); | ||
| 53 | const auto &b = std::get<2>(input); | ||
| 54 | |||
| 55 |
3/6✓ Branch 0 taken 96 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 96 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 96 times.
|
96 | return n > 0 && a.size() == n * n && b.size() == n * n; |
| 56 | } | ||
| 57 | |||
| 58 | 96 | bool MatmulDoubleSTLTask::PreProcessingImpl() { | |
| 59 | const auto &input = GetInput(); | ||
| 60 | 96 | n_ = std::get<0>(input); | |
| 61 | 96 | A_ = std::get<1>(input); | |
| 62 | 96 | B_ = std::get<2>(input); | |
| 63 | 96 | C_.assign(n_ * n_, 0.0); | |
| 64 | |||
| 65 | 96 | return true; | |
| 66 | } | ||
| 67 | |||
| 68 | 96 | bool MatmulDoubleSTLTask::RunImpl() { | |
| 69 |
1/2✓ Branch 0 taken 96 times.
✗ Branch 1 not taken.
|
96 | if (n_ <= 0) { |
| 70 | return false; | ||
| 71 | } | ||
| 72 | |||
| 73 | const size_t n = n_; | ||
| 74 | |||
| 75 | 96 | const size_t block_size = SelectBlockSize(n); | |
| 76 | |||
| 77 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 96 times.
|
96 | if (n % block_size != 0) { |
| 78 | ✗ | return RunSimpleMultiply(); | |
| 79 | } | ||
| 80 | |||
| 81 | 96 | const size_t grid_size = n / block_size; | |
| 82 | |||
| 83 | 96 | const size_t num_threads = std::thread::hardware_concurrency(); | |
| 84 | |||
| 85 | 96 | std::mutex write_mutex; | |
| 86 | |||
| 87 | 96 | const size_t total_iterations = grid_size * grid_size * grid_size; | |
| 88 | |||
| 89 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 96 times.
|
96 | if (total_iterations >= num_threads) { |
| 90 | ✗ | std::vector<std::thread> threads; | |
| 91 | ✗ | threads.reserve(num_threads); | |
| 92 | |||
| 93 | ✗ | const size_t iterations_per_thread = total_iterations / num_threads; | |
| 94 | |||
| 95 | // Создаём потоки | ||
| 96 | ✗ | for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { | |
| 97 | ✗ | const size_t start_step = thread_idx * iterations_per_thread; | |
| 98 | ✗ | const size_t end_step = (thread_idx == num_threads - 1) ? total_iterations : start_step + iterations_per_thread; | |
| 99 | |||
| 100 | ✗ | threads.emplace_back(&MatmulDoubleSTLTask::Worker, this, start_step, end_step, grid_size, block_size, | |
| 101 | ✗ | std::ref(write_mutex)); | |
| 102 | } | ||
| 103 | |||
| 104 | ✗ | for (auto &thread : threads) { | |
| 105 | ✗ | thread.join(); | |
| 106 | } | ||
| 107 | ✗ | } else { | |
| 108 | 96 | Worker(0, total_iterations, grid_size, block_size, write_mutex); | |
| 109 | } | ||
| 110 | |||
| 111 | 96 | GetOutput() = C_; | |
| 112 | return true; | ||
| 113 | } | ||
| 114 | |||
| 115 | 96 | void MatmulDoubleSTLTask::Worker(size_t start_step, size_t end_step, size_t grid_size, size_t block_size, | |
| 116 | std::mutex &write_mutex) { | ||
| 117 | // Обрабатываем диапазон итераций [start_step, end_step) | ||
| 118 |
2/2✓ Branch 0 taken 96 times.
✓ Branch 1 taken 96 times.
|
192 | for (size_t step_i_j = start_step; step_i_j < end_step; ++step_i_j) { |
| 119 | 96 | const size_t step = step_i_j / (grid_size * grid_size); | |
| 120 | 96 | const size_t i = (step_i_j % (grid_size * grid_size)) / grid_size; | |
| 121 | 96 | const size_t j = step_i_j % grid_size; | |
| 122 | |||
| 123 | 96 | const size_t root = (i + step) % grid_size; | |
| 124 | |||
| 125 | 96 | std::vector<double> local_block(block_size * block_size, 0.0); | |
| 126 | |||
| 127 |
2/2✓ Branch 0 taken 824 times.
✓ Branch 1 taken 96 times.
|
920 | for (size_t bi = 0; bi < block_size; ++bi) { |
| 128 |
2/2✓ Branch 0 taken 13320 times.
✓ Branch 1 taken 824 times.
|
14144 | for (size_t bj = 0; bj < block_size; ++bj) { |
| 129 | double sum = 0.0; | ||
| 130 |
2/2✓ Branch 0 taken 319112 times.
✓ Branch 1 taken 13320 times.
|
332432 | for (size_t bk = 0; bk < block_size; ++bk) { |
| 131 | 319112 | const size_t idx_a = ((i * block_size + bi) * n_) + (root * block_size + bk); | |
| 132 | 319112 | const size_t idx_b = ((root * block_size + bk) * n_) + (j * block_size + bj); | |
| 133 | 319112 | sum += A_[idx_a] * B_[idx_b]; | |
| 134 | } | ||
| 135 | 13320 | local_block[(bi * block_size) + bj] += sum; | |
| 136 | } | ||
| 137 | } | ||
| 138 | |||
| 139 | { | ||
| 140 | std::scoped_lock<std::mutex> lock(write_mutex); | ||
| 141 |
2/2✓ Branch 0 taken 824 times.
✓ Branch 1 taken 96 times.
|
920 | for (size_t bi = 0; bi < block_size; ++bi) { |
| 142 |
2/2✓ Branch 0 taken 13320 times.
✓ Branch 1 taken 824 times.
|
14144 | for (size_t bj = 0; bj < block_size; ++bj) { |
| 143 | 13320 | const size_t idx_c = ((i * block_size + bi) * n_) + (j * block_size + bj); | |
| 144 | 13320 | C_[idx_c] += local_block[(bi * block_size) + bj]; | |
| 145 | } | ||
| 146 | } | ||
| 147 | } | ||
| 148 | } | ||
| 149 | 96 | } | |
| 150 | |||
| 151 | ✗ | bool MatmulDoubleSTLTask::RunSimpleMultiply() { | |
| 152 | ✗ | const size_t n = n_; | |
| 153 | ✗ | const auto &a = A_; | |
| 154 | ✗ | const auto &b = B_; | |
| 155 | ✗ | auto &c = C_; | |
| 156 | |||
| 157 | ✗ | const size_t num_threads = std::thread::hardware_concurrency(); | |
| 158 | |||
| 159 | ✗ | if (n >= num_threads) { | |
| 160 | ✗ | std::vector<std::thread> threads; | |
| 161 | ✗ | threads.reserve(num_threads); | |
| 162 | |||
| 163 | ✗ | const size_t rows_per_thread = n / num_threads; | |
| 164 | |||
| 165 | ✗ | for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { | |
| 166 | ✗ | const size_t start_row = thread_idx * rows_per_thread; | |
| 167 | ✗ | const size_t end_row = (thread_idx == num_threads - 1) ? n : start_row + rows_per_thread; | |
| 168 | |||
| 169 | ✗ | threads.emplace_back(SimpleMultiplyThread, std::cref(a), std::cref(b), std::ref(c), n, start_row, end_row); | |
| 170 | } | ||
| 171 | |||
| 172 | ✗ | for (auto &thread : threads) { | |
| 173 | ✗ | thread.join(); | |
| 174 | } | ||
| 175 | ✗ | } else { | |
| 176 | // Простое однопоточное умножение | ||
| 177 | ✗ | SimpleMultiplyThread(a, b, c, n, 0, n); | |
| 178 | } | ||
| 179 | |||
| 180 | ✗ | return true; | |
| 181 | } | ||
| 182 | |||
| 183 | 96 | bool MatmulDoubleSTLTask::PostProcessingImpl() { | |
| 184 | 96 | return true; | |
| 185 | } | ||
| 186 | |||
| 187 | } // namespace makoveeva_matmul_double_stl | ||
| 188 |