| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "makoveeva_matmul_double/omp/include/ops_omp.hpp" | ||
| 2 | |||
| 3 | #include <omp.h> | ||
| 4 | |||
| 5 | #include <cmath> | ||
| 6 | #include <cstddef> | ||
| 7 | #include <vector> | ||
| 8 | |||
| 9 | #include "makoveeva_matmul_double/omp/include/common.hpp" | ||
| 10 | |||
| 11 | namespace makoveeva_matmul_double_omp { | ||
| 12 | |||
| 13 | namespace { | ||
| 14 | |||
| 15 | // Выбирает размер блока в зависимости от размера матрицы | ||
| 16 | [[nodiscard]] size_t SelectBlockSize(size_t n) { | ||
| 17 | // Используем степени двойки для хорошей локальности кэша | ||
| 18 | 48 | if (n <= 64) { | |
| 19 | return n; | ||
| 20 | } | ||
| 21 | ✗ | if (n <= 256) { | |
| 22 | return 64; | ||
| 23 | } | ||
| 24 | ✗ | if (n <= 1024) { | |
| 25 | ✗ | return 128; | |
| 26 | } | ||
| 27 | return 256; | ||
| 28 | } | ||
| 29 | |||
| 30 | // Декодирует одномерный индекс в трёхмерный индекс (step, i, j) | ||
| 31 | void DecodeIndex(size_t step_i_j, size_t grid_size, size_t &step, size_t &i, size_t &j) { | ||
| 32 | step = step_i_j / (grid_size * grid_size); | ||
| 33 | i = (step_i_j % (grid_size * grid_size)) / grid_size; | ||
| 34 | j = step_i_j % grid_size; | ||
| 35 | } | ||
| 36 | |||
| 37 | // Вычисляет root блок для алгоритма Фокса | ||
| 38 | [[nodiscard]] size_t ComputeRoot(size_t i, size_t step, size_t grid_size) { | ||
| 39 | return (i + step) % grid_size; | ||
| 40 | } | ||
| 41 | |||
| 42 | // Умножает блок A[i][root] на блок B[root][j] и сохраняет в local_block | ||
| 43 | 48 | void MultiplyBlocks(const std::vector<double> &a, const std::vector<double> &b, std::vector<double> &local_block, | |
| 44 | size_t i, size_t root, size_t j, size_t block_size, size_t n) { | ||
| 45 |
2/2✓ Branch 0 taken 412 times.
✓ Branch 1 taken 48 times.
|
460 | for (size_t bi = 0; bi < block_size; ++bi) { |
| 46 |
2/2✓ Branch 0 taken 6660 times.
✓ Branch 1 taken 412 times.
|
7072 | for (size_t bj = 0; bj < block_size; ++bj) { |
| 47 | double sum = 0.0; | ||
| 48 |
2/2✓ Branch 0 taken 159556 times.
✓ Branch 1 taken 6660 times.
|
166216 | for (size_t bk = 0; bk < block_size; ++bk) { |
| 49 | 159556 | const size_t idx_a = ((i * block_size + bi) * n) + (root * block_size + bk); | |
| 50 | 159556 | const size_t idx_b = ((root * block_size + bk) * n) + (j * block_size + bj); | |
| 51 | 159556 | sum += a[idx_a] * b[idx_b]; | |
| 52 | } | ||
| 53 | 6660 | local_block[(bi * block_size) + bj] += sum; | |
| 54 | } | ||
| 55 | } | ||
| 56 | 48 | } | |
| 57 | |||
| 58 | // Добавляет результат из local_block в матрицу C | ||
| 59 | 48 | void AddBlockToResult(std::vector<double> &c, const std::vector<double> &local_block, size_t i, size_t j, | |
| 60 | size_t block_size, size_t n) { | ||
| 61 |
2/2✓ Branch 0 taken 412 times.
✓ Branch 1 taken 48 times.
|
460 | for (size_t bi = 0; bi < block_size; ++bi) { |
| 62 |
2/2✓ Branch 0 taken 6660 times.
✓ Branch 1 taken 412 times.
|
7072 | for (size_t bj = 0; bj < block_size; ++bj) { |
| 63 | 6660 | const size_t idx_c = ((i * block_size + bi) * n) + (j * block_size + bj); | |
| 64 | 6660 | c[idx_c] += local_block[(bi * block_size) + bj]; | |
| 65 | } | ||
| 66 | } | ||
| 67 | 48 | } | |
| 68 | |||
| 69 | } // namespace | ||
| 70 | |||
| 71 |
1/2✓ Branch 1 taken 48 times.
✗ Branch 2 not taken.
|
48 | MatmulDoubleOMPTask::MatmulDoubleOMPTask(const InType &in) { |
| 72 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 73 | GetInput() = in; | ||
| 74 | 48 | GetOutput() = std::vector<double>(); | |
| 75 | 48 | } | |
| 76 | |||
| 77 | 48 | bool MatmulDoubleOMPTask::ValidationImpl() { | |
| 78 | const auto &input = GetInput(); | ||
| 79 | 48 | const size_t n = std::get<0>(input); | |
| 80 | const auto &a = std::get<1>(input); | ||
| 81 | const auto &b = std::get<2>(input); | ||
| 82 | |||
| 83 |
3/6✓ Branch 0 taken 48 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 48 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 48 times.
|
48 | return n > 0 && a.size() == n * n && b.size() == n * n; |
| 84 | } | ||
| 85 | |||
| 86 | 48 | bool MatmulDoubleOMPTask::PreProcessingImpl() { | |
| 87 | const auto &input = GetInput(); | ||
| 88 | 48 | n_ = std::get<0>(input); | |
| 89 | 48 | a_ = std::get<1>(input); | |
| 90 | 48 | b_ = std::get<2>(input); | |
| 91 | 48 | c_.assign(n_ * n_, 0.0); | |
| 92 | |||
| 93 | 48 | return true; | |
| 94 | } | ||
| 95 | |||
| 96 | 48 | bool MatmulDoubleOMPTask::RunImpl() { | |
| 97 |
1/2✓ Branch 0 taken 48 times.
✗ Branch 1 not taken.
|
48 | if (n_ <= 0) { |
| 98 | return false; | ||
| 99 | } | ||
| 100 | |||
| 101 | const size_t n = n_; | ||
| 102 | 48 | const auto &a = a_; | |
| 103 | 48 | const auto &b = b_; | |
| 104 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 48 times.
|
48 | auto &c = c_; |
| 105 | |||
| 106 | // Выбираем размер блока для оптимальной локальности кэша | ||
| 107 | const size_t block_size = SelectBlockSize(n); | ||
| 108 | |||
| 109 | // Проверяем что матрица делится нацело на размер блока | ||
| 110 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 48 times.
|
48 | if (n % block_size != 0) { |
| 111 | ✗ | return RunSimpleMultiply(); | |
| 112 | } | ||
| 113 | |||
| 114 | 48 | const size_t grid_size = n / block_size; | |
| 115 | |||
| 116 | // Алгоритм Фокса: все итерации параллелизируются в один цикл | ||
| 117 | 48 | #pragma omp parallel for default(none) shared(a, b, c, n, block_size, grid_size) | |
| 118 | for (size_t step_i_j = 0; step_i_j < grid_size * grid_size * grid_size; ++step_i_j) { | ||
| 119 | size_t step = 0; | ||
| 120 | size_t i = 0; | ||
| 121 | size_t j = 0; | ||
| 122 | DecodeIndex(step_i_j, grid_size, step, i, j); | ||
| 123 | |||
| 124 | // Источник блока A: диагональный сдвиг на step позиций | ||
| 125 | const size_t root = ComputeRoot(i, step, grid_size); | ||
| 126 | |||
| 127 | // Локальный буфер для накопления результатов блока C[i][j] | ||
| 128 | std::vector<double> local_block(block_size * block_size, 0.0); | ||
| 129 | |||
| 130 | // Умножение блока A[i][root] на блок B[root][j] | ||
| 131 | MultiplyBlocks(a, b, local_block, i, root, j, block_size, n); | ||
| 132 | |||
| 133 | // Безопасно добавляем результат в матрицу C | ||
| 134 | #pragma omp critical | ||
| 135 | { | ||
| 136 | AddBlockToResult(c, local_block, i, j, block_size, n); | ||
| 137 | } | ||
| 138 | } | ||
| 139 | |||
| 140 | 48 | GetOutput() = c_; | |
| 141 | 48 | return true; | |
| 142 | } | ||
| 143 | |||
| 144 | ✗ | bool MatmulDoubleOMPTask::RunSimpleMultiply() { | |
| 145 | ✗ | const size_t n = n_; | |
| 146 | ✗ | const auto &a = a_; | |
| 147 | ✗ | const auto &b = b_; | |
| 148 | ✗ | auto &c = c_; | |
| 149 | |||
| 150 | ✗ | #pragma omp parallel for collapse(2) default(none) shared(a, b, c, n) | |
| 151 | for (size_t i = 0; i < n; ++i) { | ||
| 152 | for (size_t j = 0; j < n; ++j) { | ||
| 153 | double sum = 0.0; | ||
| 154 | for (size_t k = 0; k < n; ++k) { | ||
| 155 | sum += a[(i * n) + k] * b[(k * n) + j]; | ||
| 156 | } | ||
| 157 | c[(i * n) + j] = sum; | ||
| 158 | } | ||
| 159 | } | ||
| 160 | |||
| 161 | ✗ | return true; | |
| 162 | } | ||
| 163 | |||
| 164 | 48 | bool MatmulDoubleOMPTask::PostProcessingImpl() { | |
| 165 | 48 | return true; | |
| 166 | } | ||
| 167 | |||
| 168 | } // namespace makoveeva_matmul_double_omp | ||
| 169 |