| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "sokolov_k_matrix_double_fox/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <cmath> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <vector> | ||
| 9 | |||
| 10 | #include "sokolov_k_matrix_double_fox/common/include/common.hpp" | ||
| 11 | |||
| 12 | namespace sokolov_k_matrix_double_fox { | ||
| 13 | |||
| 14 | namespace { | ||
| 15 | |||
/// Repacks a row-major matrix into q x q contiguous tiles of bs x bs elements.
///
/// Tile (bi, bj) is written row-major starting at offset ((bi * q) + bj) * bs * bs
/// of `blocks`; its element (i, j) comes from flat[((bi * bs) + i) * n + (bj * bs) + j].
///
/// @param flat   source matrix, row-major with row length n
/// @param blocks destination; must already be sized to hold q * q * bs * bs elements
/// @param n      row length of `flat`
/// @param bs     tile edge length
/// @param q      number of tiles per matrix dimension
///
/// A non-positive bs or q leaves `blocks` untouched.
void DecomposeToBlocksAll(const std::vector<double> &flat, std::vector<double> &blocks, int n, int bs, int q) {
  if (bs <= 0 || q <= 0) {
    return;
  }
  // Offsets are computed in std::size_t so that large matrices cannot overflow
  // the signed int arithmetic used for indexing.
  const auto stride = static_cast<std::size_t>(n);
  const auto tile_edge = static_cast<std::size_t>(bs);
  const auto tiles = static_cast<std::size_t>(q);
  const std::size_t tile_area = tile_edge * tile_edge;

  for (std::size_t bi = 0; bi < tiles; ++bi) {
    for (std::size_t bj = 0; bj < tiles; ++bj) {
      double *tile = blocks.data() + (((bi * tiles) + bj) * tile_area);
      const double *src = flat.data() + ((bi * tile_edge) * stride) + (bj * tile_edge);
      // Each tile row is a contiguous run of tile_edge values in both layouts.
      for (std::size_t i = 0; i < tile_edge; ++i) {
        std::copy_n(src + (i * stride), tile_edge, tile + (i * tile_edge));
      }
    }
  }
}
| 28 | |||
/// Rebuilds a row-major matrix from q x q contiguous tiles of bs x bs elements.
///
/// Tile (bi, bj) is read row-major starting at offset ((bi * q) + bj) * bs * bs
/// of `blocks`; its element (i, j) is written to flat[((bi * bs) + i) * n + (bj * bs) + j].
///
/// @param blocks source tiles; must hold q * q * bs * bs elements
/// @param flat   destination matrix, row-major with row length n; must already be sized
/// @param n      row length of `flat`
/// @param bs     tile edge length
/// @param q      number of tiles per matrix dimension
///
/// A non-positive bs or q leaves `flat` untouched.
void AssembleFromBlocksAll(const std::vector<double> &blocks, std::vector<double> &flat, int n, int bs, int q) {
  if (bs <= 0 || q <= 0) {
    return;
  }
  // Offsets are computed in std::size_t so that large matrices cannot overflow
  // the signed int arithmetic used for indexing.
  const auto stride = static_cast<std::size_t>(n);
  const auto tile_edge = static_cast<std::size_t>(bs);
  const auto tiles = static_cast<std::size_t>(q);
  const std::size_t tile_area = tile_edge * tile_edge;

  for (std::size_t bi = 0; bi < tiles; ++bi) {
    for (std::size_t bj = 0; bj < tiles; ++bj) {
      const double *tile = blocks.data() + (((bi * tiles) + bj) * tile_area);
      double *dst = flat.data() + ((bi * tile_edge) * stride) + (bj * tile_edge);
      // Each tile row is a contiguous run of tile_edge values in both layouts.
      for (std::size_t i = 0; i < tile_edge; ++i) {
        std::copy_n(tile + (i * tile_edge), tile_edge, dst + (i * stride));
      }
    }
  }
}
| 41 | |||
/// Accumulates the product of two row-major bs x bs tiles into a third: c += a * b.
///
/// `c` is not cleared here, so whatever it already holds is kept and added to.
/// The loops run in i-k-j order, which makes the innermost pass walk one row of
/// `b` and one row of `c` contiguously.
void MultiplyBlocksAll(const double *a, const double *b, double *c, int bs) {
  for (int row = 0; row < bs; ++row) {
    const double *a_row = a + (row * bs);
    double *c_row = c + (row * bs);
    for (int inner = 0; inner < bs; ++inner) {
      const double scale = a_row[inner];
      const double *b_row = b + (inner * bs);
      for (int col = 0; col < bs; ++col) {
        c_row[col] += scale * b_row[col];
      }
    }
  }
}
| 52 | |||
| 53 | void FoxStepMpiOmp(const std::vector<double> &a, const std::vector<double> &b, std::vector<double> &c, int bs, int q, | ||
| 54 | int step, int row_begin, int row_end) { | ||
| 55 | 110 | int bsq = bs * bs; | |
| 56 | 110 | #pragma omp parallel for default(none) shared(a, b, c, bs, q, bsq, step, row_begin, row_end) schedule(static) | |
| 57 | for (int i = row_begin; i < row_end; i++) { | ||
| 58 | int k = (i + step) % q; | ||
| 59 | for (int j = 0; j < q; j++) { | ||
| 60 | int a_off = ((i * q) + k) * bsq; | ||
| 61 | int b_off = ((k * q) + j) * bsq; | ||
| 62 | int c_off = ((i * q) + j) * bsq; | ||
| 63 | MultiplyBlocksAll(a.data() + a_off, b.data() + b_off, c.data() + c_off, bs); | ||
| 64 | } | ||
| 65 | } | ||
| 66 | } | ||
| 67 | |||
/// Picks the tile edge length for an n x n matrix.
///
/// Returns the largest divisor of n that does not exceed floor(sqrt(n)).
/// The result is 1 when no larger divisor exists in that range (for example
/// when n is prime) and also when n is 0.
int ChooseBlockSizeAll(int n) {
  int candidate = static_cast<int>(std::sqrt(static_cast<double>(n)));
  // Walk downwards until a divisor is found; 1 always divides, so this stops.
  while (candidate >= 1 && n % candidate != 0) {
    --candidate;
  }
  return candidate >= 1 ? candidate : 1;
}
| 76 | |||
/// Computes which contiguous band of tile rows a rank owns.
///
/// Ranks below num_procs each get rows_per rows, and the first `leftover`
/// ranks get one extra row. Ranks at or above num_procs get an empty range
/// (row_start = 0, row_count = 0).
///
/// @param[out] row_start index of the first owned tile row
/// @param[out] row_count number of owned tile rows
void ComputeRowRange(int rank, int num_procs, int rows_per, int leftover, int &row_start, int &row_count) {
  if (rank >= num_procs) {
    row_start = 0;
    row_count = 0;
    return;
  }
  // Every earlier rank that received an extra row shifts this rank's start by one.
  const int extra_before = std::min(rank, leftover);
  const bool takes_extra = rank < leftover;
  row_start = (rank * rows_per) + extra_before;
  row_count = takes_extra ? rows_per + 1 : rows_per;
}
| 86 | |||
| 87 | 24 | void GatherResults(std::vector<double> &blocks_c, int rank, int num_procs, int rows_per, int leftover, int q, int bsq) { | |
| 88 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | if (rank == 0) { |
| 89 |
2/2✓ Branch 0 taken 11 times.
✓ Branch 1 taken 12 times.
|
23 | for (int pr = 1; pr < num_procs; pr++) { |
| 90 | int pr_start = 0; | ||
| 91 | int pr_count = 0; | ||
| 92 | ComputeRowRange(pr, num_procs, rows_per, leftover, pr_start, pr_count); | ||
| 93 |
1/2✓ Branch 0 taken 11 times.
✗ Branch 1 not taken.
|
11 | if (pr_count > 0) { |
| 94 | 11 | int offset = pr_start * q * bsq; | |
| 95 | 11 | int count = pr_count * q * bsq; | |
| 96 | 11 | MPI_Recv(blocks_c.data() + offset, count, MPI_DOUBLE, pr, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); | |
| 97 | } | ||
| 98 | } | ||
| 99 |
2/2✓ Branch 0 taken 11 times.
✓ Branch 1 taken 1 times.
|
12 | } else if (rank < num_procs) { |
| 100 | int my_start = 0; | ||
| 101 | int my_count = 0; | ||
| 102 | ComputeRowRange(rank, num_procs, rows_per, leftover, my_start, my_count); | ||
| 103 |
1/2✓ Branch 0 taken 11 times.
✗ Branch 1 not taken.
|
11 | if (my_count > 0) { |
| 104 | 11 | int offset = my_start * q * bsq; | |
| 105 | 11 | int count = my_count * q * bsq; | |
| 106 | 11 | MPI_Send(blocks_c.data() + offset, count, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); | |
| 107 | } | ||
| 108 | } | ||
| 109 | 24 | } | |
| 110 | |||
| 111 | } // namespace | ||
| 112 | |||
| 113 | 24 | SokolovKMatrixDoubleFoxALL::SokolovKMatrixDoubleFoxALL(const InType &in) { | |
| 114 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 115 | 24 | GetInput() = in; | |
| 116 | GetOutput() = 0; | ||
| 117 | 24 | } | |
| 118 | |||
| 119 | 24 | bool SokolovKMatrixDoubleFoxALL::ValidationImpl() { | |
| 120 |
2/4✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 24 times.
|
24 | return (GetInput() > 0) && (GetOutput() == 0); |
| 121 | } | ||
| 122 | |||
| 123 | 24 | bool SokolovKMatrixDoubleFoxALL::PreProcessingImpl() { | |
| 124 | 24 | GetOutput() = 0; | |
| 125 | 24 | n_ = GetInput(); | |
| 126 | 24 | block_size_ = ChooseBlockSizeAll(n_); | |
| 127 | 24 | q_ = n_ / block_size_; | |
| 128 | 24 | auto sz = static_cast<std::size_t>(n_) * n_; | |
| 129 | 24 | std::vector<double> a(sz, 1.5); | |
| 130 |
1/4✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
24 | std::vector<double> b(sz, 2.0); |
| 131 |
1/2✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
|
24 | blocks_a_.resize(sz); |
| 132 |
1/2✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
|
24 | blocks_b_.resize(sz); |
| 133 |
1/4✓ Branch 1 taken 24 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
24 | blocks_c_.assign(sz, 0.0); |
| 134 | 24 | DecomposeToBlocksAll(a, blocks_a_, n_, block_size_, q_); | |
| 135 | 24 | DecomposeToBlocksAll(b, blocks_b_, n_, block_size_, q_); | |
| 136 | 24 | return true; | |
| 137 | } | ||
| 138 | |||
| 139 |
1/2✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
|
24 | bool SokolovKMatrixDoubleFoxALL::RunImpl() { |
| 140 | std::ranges::fill(blocks_c_, 0.0); | ||
| 141 | |||
| 142 | 24 | int rank = 0; | |
| 143 | 24 | int world_size = 1; | |
| 144 | 24 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 145 | 24 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); | |
| 146 | |||
| 147 | 24 | int total = static_cast<int>(blocks_a_.size()); | |
| 148 | 24 | MPI_Bcast(&n_, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 149 | 24 | MPI_Bcast(&block_size_, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 150 | 24 | MPI_Bcast(&q_, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 151 | 24 | MPI_Bcast(&total, 1, MPI_INT, 0, MPI_COMM_WORLD); | |
| 152 | |||
| 153 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | if (rank != 0) { |
| 154 | 12 | blocks_a_.resize(total); | |
| 155 | 12 | blocks_b_.resize(total); | |
| 156 | 12 | blocks_c_.assign(total, 0.0); | |
| 157 | } | ||
| 158 | |||
| 159 | 24 | MPI_Bcast(blocks_a_.data(), total, MPI_DOUBLE, 0, MPI_COMM_WORLD); | |
| 160 | 24 | MPI_Bcast(blocks_b_.data(), total, MPI_DOUBLE, 0, MPI_COMM_WORLD); | |
| 161 | |||
| 162 |
2/2✓ Branch 0 taken 23 times.
✓ Branch 1 taken 1 times.
|
24 | int num_procs = std::min(world_size, q_); |
| 163 | 24 | int rows_per = q_ / std::max(num_procs, 1); | |
| 164 | 24 | int leftover = q_ % std::max(num_procs, 1); | |
| 165 | |||
| 166 | int my_row_start = 0; | ||
| 167 | int my_row_count = 0; | ||
| 168 |
2/2✓ Branch 0 taken 23 times.
✓ Branch 1 taken 1 times.
|
24 | ComputeRowRange(rank, num_procs, rows_per, leftover, my_row_start, my_row_count); |
| 169 | |||
| 170 |
2/2✓ Branch 0 taken 110 times.
✓ Branch 1 taken 24 times.
|
134 | for (int step = 0; step < q_; step++) { |
| 171 | 110 | FoxStepMpiOmp(blocks_a_, blocks_b_, blocks_c_, block_size_, q_, step, my_row_start, my_row_start + my_row_count); | |
| 172 | } | ||
| 173 | |||
| 174 | 24 | int bsq = block_size_ * block_size_; | |
| 175 | 24 | GatherResults(blocks_c_, rank, num_procs, rows_per, leftover, q_, bsq); | |
| 176 | |||
| 177 | 24 | MPI_Bcast(blocks_c_.data(), total, MPI_DOUBLE, 0, MPI_COMM_WORLD); | |
| 178 | |||
| 179 | 24 | MPI_Barrier(MPI_COMM_WORLD); | |
| 180 | 24 | return true; | |
| 181 | } | ||
| 182 | |||
| 183 | 24 | bool SokolovKMatrixDoubleFoxALL::PostProcessingImpl() { | |
| 184 | 24 | std::vector<double> result(static_cast<std::size_t>(n_) * n_); | |
| 185 | 24 | AssembleFromBlocksAll(blocks_c_, result, n_, block_size_, q_); | |
| 186 | 24 | double expected = 3.0 * n_; | |
| 187 |
1/2✓ Branch 0 taken 25770 times.
✗ Branch 1 not taken.
|
25770 | bool ok = std::ranges::all_of(result, [expected](double v) { return std::abs(v - expected) <= 1e-9; }); |
| 188 |
2/4✓ Branch 0 taken 24 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 24 times.
✗ Branch 3 not taken.
|
24 | GetOutput() = ok ? GetInput() : -1; |
| 189 | std::vector<double>().swap(blocks_a_); | ||
| 190 | std::vector<double>().swap(blocks_b_); | ||
| 191 | std::vector<double>().swap(blocks_c_); | ||
| 192 | 24 | return true; | |
| 193 | } | ||
| 194 | |||
| 195 | } // namespace sokolov_k_matrix_double_fox | ||
| 196 |