| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "cheremkhin_a_matr_mult_cannon_alg/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | #include <omp.h> | ||
| 5 | |||
| 6 | #include <cmath> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <cstdint> | ||
| 9 | #include <utility> | ||
| 10 | #include <vector> | ||
| 11 | |||
| 12 | #include "cheremkhin_a_matr_mult_cannon_alg/common/include/common.hpp" | ||
| 13 | #include "util/include/util.hpp" | ||
| 14 | |||
| 15 | namespace cheremkhin_a_matr_mult_cannon_alg { | ||
| 16 | |||
| 17 | namespace { | ||
| 18 | |||
| 19 | inline std::size_t Idx(std::size_t n, std::size_t r, std::size_t c) { | ||
| 20 | return (r * n) + c; | ||
| 21 | } | ||
| 22 | |||
| 23 | std::size_t CeilDiv(std::size_t a, std::size_t b) { | ||
| 24 | 14 | return (a + b - 1) / b; | |
| 25 | } | ||
| 26 | |||
| 27 | 14 | int ChooseVirtualGridSize(int world_size) { | |
| 28 |
1/2✓ Branch 0 taken 14 times.
✗ Branch 1 not taken.
|
14 | if (world_size <= 1) { |
| 29 | return 1; | ||
| 30 | } | ||
| 31 | |||
| 32 | 14 | int grid_dim = static_cast<int>(std::sqrt(static_cast<double>(world_size))); | |
| 33 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 14 times.
|
28 | while ((grid_dim * grid_dim) < world_size) { |
| 34 | 14 | ++grid_dim; | |
| 35 | } | ||
| 36 | return grid_dim; | ||
| 37 | } | ||
| 38 | |||
| 39 | int MakeVirtualRank(int row, int col, int grid_dim) { | ||
| 40 | 252 | return (row * grid_dim) + col; | |
| 41 | } | ||
| 42 | |||
| 43 | int GetOwnerRank(int virtual_rank, int world_size) { | ||
| 44 | 112 | return virtual_rank % world_size; | |
| 45 | } | ||
| 46 | |||
| 47 | 14 | std::vector<int> GetOwnedVirtualRanks(int world_rank, int world_size, int grid_dim) { | |
| 48 | 14 | std::vector<int> owned_ranks; | |
| 49 | 14 | const int virtual_size = grid_dim * grid_dim; | |
| 50 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (int virtual_rank = world_rank; virtual_rank < virtual_size; virtual_rank += world_size) { |
| 51 | owned_ranks.push_back(virtual_rank); | ||
| 52 | } | ||
| 53 | 14 | return owned_ranks; | |
| 54 | } | ||
| 55 | |||
| 56 | void CopyGlobalToPadded(const std::vector<double> &src, std::vector<double> &dst, std::size_t src_n, | ||
| 57 | std::size_t dst_n) { | ||
| 58 | 7 | const auto src_n64 = static_cast<std::int64_t>(src_n); | |
| 59 | 7 | #pragma omp parallel for default(none) schedule(static) shared(src, dst, src_n, dst_n, src_n64) | |
| 60 | for (std::int64_t i = 0; i < src_n64; ++i) { | ||
| 61 | for (std::size_t j = 0; j < src_n; ++j) { | ||
| 62 | dst[Idx(dst_n, static_cast<std::size_t>(i), j)] = src[Idx(src_n, static_cast<std::size_t>(i), j)]; | ||
| 63 | } | ||
| 64 | } | ||
| 65 | 7 | } | |
| 66 | |||
| 67 | void CopyPaddedToGlobal(const std::vector<double> &src, std::vector<double> &dst, std::size_t src_n, | ||
| 68 | std::size_t dst_n) { | ||
| 69 | 14 | const auto dst_n64 = static_cast<std::int64_t>(dst_n); | |
| 70 | 14 | #pragma omp parallel for default(none) schedule(static) shared(src, dst, src_n, dst_n, dst_n64) | |
| 71 | for (std::int64_t i = 0; i < dst_n64; ++i) { | ||
| 72 | for (std::size_t j = 0; j < dst_n; ++j) { | ||
| 73 | dst[Idx(dst_n, static_cast<std::size_t>(i), j)] = src[Idx(src_n, static_cast<std::size_t>(i), j)]; | ||
| 74 | } | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | void ExtractLocalBlock(const std::vector<double> &src, std::vector<double> &block, std::size_t global_n, | ||
| 79 | std::size_t block_n, int block_row, int block_col) { | ||
| 80 | 28 | const std::size_t row0 = static_cast<std::size_t>(block_row) * block_n; | |
| 81 | 28 | const std::size_t col0 = static_cast<std::size_t>(block_col) * block_n; | |
| 82 | 28 | const auto block_n64 = static_cast<std::int64_t>(block_n); | |
| 83 | 28 | #pragma omp parallel for default(none) schedule(static) shared(src, block, global_n, block_n, row0, col0, block_n64) | |
| 84 | for (std::int64_t i = 0; i < block_n64; ++i) { | ||
| 85 | const std::size_t src_row = (row0 + static_cast<std::size_t>(i)) * global_n; | ||
| 86 | const std::size_t dst_row = static_cast<std::size_t>(i) * block_n; | ||
| 87 | for (std::size_t j = 0; j < block_n; ++j) { | ||
| 88 | block[dst_row + j] = src[src_row + col0 + j]; | ||
| 89 | } | ||
| 90 | } | ||
| 91 | } | ||
| 92 | |||
| 93 | void InsertLocalBlock(const std::vector<double> &block, std::vector<double> &dst, std::size_t global_n, | ||
| 94 | std::size_t block_n, int block_row, int block_col) { | ||
| 95 | 28 | const std::size_t row0 = static_cast<std::size_t>(block_row) * block_n; | |
| 96 | 28 | const std::size_t col0 = static_cast<std::size_t>(block_col) * block_n; | |
| 97 | 28 | const auto block_n64 = static_cast<std::int64_t>(block_n); | |
| 98 |
1/2✓ Branch 0 taken 14 times.
✗ Branch 1 not taken.
|
14 | #pragma omp parallel for default(none) schedule(static) shared(block, dst, global_n, block_n, row0, col0, block_n64) |
| 99 | for (std::int64_t i = 0; i < block_n64; ++i) { | ||
| 100 | const std::size_t src_row = static_cast<std::size_t>(i) * block_n; | ||
| 101 | const std::size_t dst_row = (row0 + static_cast<std::size_t>(i)) * global_n; | ||
| 102 | for (std::size_t j = 0; j < block_n; ++j) { | ||
| 103 | dst[dst_row + col0 + j] = block[src_row + j]; | ||
| 104 | } | ||
| 105 | } | ||
| 106 | 14 | } | |
| 107 | |||
| 108 | void MulAddLocal(const std::vector<double> &a, const std::vector<double> &b, std::vector<double> &c, | ||
| 109 | std::size_t block_n) { | ||
| 110 | 56 | const auto block_n64 = static_cast<std::int64_t>(block_n); | |
| 111 | |||
| 112 | 56 | #pragma omp parallel for default(none) schedule(static) shared(a, b, c, block_n, block_n64) | |
| 113 | for (std::int64_t ii = 0; ii < block_n64; ++ii) { | ||
| 114 | const auto row = static_cast<std::size_t>(ii); | ||
| 115 | const std::size_t a_row = row * block_n; | ||
| 116 | const std::size_t c_row = row * block_n; | ||
| 117 | double *c_block = c.data() + c_row; | ||
| 118 | for (std::size_t kk = 0; kk < block_n; ++kk) { | ||
| 119 | const double aik = a[a_row + kk]; | ||
| 120 | const double *b_block = b.data() + (kk * block_n); | ||
| 121 | for (std::int64_t jj = 0; jj < block_n64; ++jj) { | ||
| 122 | c_block[jj] += aik * b_block[jj]; | ||
| 123 | } | ||
| 124 | } | ||
| 125 | } | ||
| 126 | } | ||
| 127 | |||
| 128 | 28 | struct LocalCell { | |
| 129 | int virtual_rank = 0; | ||
| 130 | std::vector<double> a; | ||
| 131 | std::vector<double> b; | ||
| 132 | std::vector<double> c; | ||
| 133 | }; | ||
| 134 | |||
| 135 | int GetRow(int virtual_rank, int grid_dim) { | ||
| 136 | 112 | return virtual_rank / grid_dim; | |
| 137 | } | ||
| 138 | |||
| 139 | int GetCol(int virtual_rank, int grid_dim) { | ||
| 140 | 112 | return virtual_rank % grid_dim; | |
| 141 | } | ||
| 142 | |||
| 143 | struct ShiftTargets { | ||
| 144 | int source_rank = 0; | ||
| 145 | int dest_rank = 0; | ||
| 146 | int source_owner = 0; | ||
| 147 | int dest_owner = 0; | ||
| 148 | }; | ||
| 149 | |||
| 150 | 112 | ShiftTargets ComputeShiftTargets(int virtual_rank, const std::vector<int> &owner_by_rank, int grid_dim, | |
| 151 | bool horizontal_shift) { | ||
| 152 | const int row = GetRow(virtual_rank, grid_dim); | ||
| 153 | const int col = GetCol(virtual_rank, grid_dim); | ||
| 154 | |||
| 155 | ShiftTargets targets; | ||
| 156 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 56 times.
|
112 | targets.source_rank = horizontal_shift ? MakeVirtualRank(row, (col + 1) % grid_dim, grid_dim) |
| 157 | 56 | : MakeVirtualRank((row + 1) % grid_dim, col, grid_dim); | |
| 158 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 56 times.
|
112 | targets.dest_rank = horizontal_shift ? MakeVirtualRank(row, (col + grid_dim - 1) % grid_dim, grid_dim) |
| 159 | 56 | : MakeVirtualRank((row + grid_dim - 1) % grid_dim, col, grid_dim); | |
| 160 | 112 | targets.source_owner = owner_by_rank[static_cast<std::size_t>(targets.source_rank)]; | |
| 161 | 112 | targets.dest_owner = owner_by_rank[static_cast<std::size_t>(targets.dest_rank)]; | |
| 162 | 112 | return targets; | |
| 163 | } | ||
| 164 | |||
| 165 | 28 | void ExchangePhase(const std::vector<std::vector<double>> &current_buffers, | |
| 166 | std::vector<std::vector<double>> &next_buffers, const std::vector<int> &virtual_ranks, | ||
| 167 | const std::vector<int> &owner_by_rank, const std::vector<int> &local_index_by_rank, int grid_dim, | ||
| 168 | int world_rank, int tag_base, bool horizontal_shift) { | ||
| 169 | std::size_t recv_count = 0; | ||
| 170 | std::size_t send_count = 0; | ||
| 171 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 28 times.
|
84 | for (int virtual_rank : virtual_ranks) { |
| 172 | 56 | const auto targets = ComputeShiftTargets(virtual_rank, owner_by_rank, grid_dim, horizontal_shift); | |
| 173 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
56 | recv_count += (targets.source_owner != world_rank) ? 1U : 0U; |
| 174 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
84 | send_count += (targets.dest_owner != world_rank) ? 1U : 0U; |
| 175 | } | ||
| 176 | |||
| 177 | 28 | std::vector<MPI_Request> recv_requests(recv_count, MPI_REQUEST_NULL); | |
| 178 |
1/4✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
28 | std::vector<MPI_Request> send_requests(send_count, MPI_REQUEST_NULL); |
| 179 | std::size_t recv_idx = 0; | ||
| 180 | std::size_t send_idx = 0; | ||
| 181 | |||
| 182 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 28 times.
|
84 | for (std::size_t idx = 0; idx < virtual_ranks.size(); ++idx) { |
| 183 | 56 | const int virtual_rank = virtual_ranks[idx]; | |
| 184 | 56 | const auto targets = ComputeShiftTargets(virtual_rank, owner_by_rank, grid_dim, horizontal_shift); | |
| 185 | |||
| 186 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
56 | if (targets.source_owner == world_rank) { |
| 187 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | const int local_source_idx = local_index_by_rank[static_cast<std::size_t>(targets.source_rank)]; |
| 188 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | next_buffers[idx] = current_buffers[static_cast<std::size_t>(local_source_idx)]; |
| 189 | } else { | ||
| 190 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | MPI_Irecv(next_buffers[idx].data(), static_cast<int>(next_buffers[idx].size()), MPI_DOUBLE, targets.source_owner, |
| 191 | tag_base + virtual_rank, MPI_COMM_WORLD, &recv_requests[recv_idx]); | ||
| 192 | 28 | ++recv_idx; | |
| 193 | } | ||
| 194 | |||
| 195 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 28 times.
|
56 | if (targets.dest_owner != world_rank) { |
| 196 | 28 | MPI_Isend(current_buffers[idx].data(), static_cast<int>(current_buffers[idx].size()), MPI_DOUBLE, | |
| 197 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | targets.dest_owner, tag_base + targets.dest_rank, MPI_COMM_WORLD, &send_requests[send_idx]); |
| 198 | 28 | ++send_idx; | |
| 199 | } | ||
| 200 | } | ||
| 201 | |||
| 202 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 14 times.
|
28 | if (!recv_requests.empty()) { |
| 203 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | MPI_Waitall(static_cast<int>(recv_requests.size()), recv_requests.data(), MPI_STATUSES_IGNORE); |
| 204 | } | ||
| 205 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 14 times.
|
28 | if (!send_requests.empty()) { |
| 206 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | MPI_Waitall(static_cast<int>(send_requests.size()), send_requests.data(), MPI_STATUSES_IGNORE); |
| 207 | } | ||
| 208 | 28 | } | |
| 209 | |||
| 210 | 14 | void DistributeInitiallyAlignedBlocks(const std::vector<double> &a_global, const std::vector<double> &b_global, | |
| 211 | std::vector<LocalCell> &local_cells, const std::vector<int> &local_index_by_rank, | ||
| 212 | std::size_t global_n, std::size_t block_n, int grid_dim, int world_rank, | ||
| 213 | int world_size) { | ||
| 214 | constexpr int kTagA = 1000; | ||
| 215 | constexpr int kTagB = 2000; | ||
| 216 | |||
| 217 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 7 times.
|
14 | if (world_rank == 0) { |
| 218 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 7 times.
|
21 | for (int row = 0; row < grid_dim; ++row) { |
| 219 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (int col = 0; col < grid_dim; ++col) { |
| 220 | const int virtual_rank = MakeVirtualRank(row, col, grid_dim); | ||
| 221 | const int owner_rank = GetOwnerRank(virtual_rank, world_size); | ||
| 222 | 28 | const int a_col = (row + col) % grid_dim; | |
| 223 | const int b_row = (row + col) % grid_dim; | ||
| 224 | |||
| 225 | 28 | std::vector<double> a_block(block_n * block_n, 0.0); | |
| 226 |
1/4✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
28 | std::vector<double> b_block(block_n * block_n, 0.0); |
| 227 | ExtractLocalBlock(a_global, a_block, global_n, block_n, row, a_col); | ||
| 228 | ExtractLocalBlock(b_global, b_block, global_n, block_n, b_row, col); | ||
| 229 | |||
| 230 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 14 times.
|
28 | if (owner_rank == 0) { |
| 231 | 14 | const int local_idx = local_index_by_rank[static_cast<std::size_t>(virtual_rank)]; | |
| 232 | 14 | local_cells[static_cast<std::size_t>(local_idx)].a = std::move(a_block); | |
| 233 | 14 | local_cells[static_cast<std::size_t>(local_idx)].b = std::move(b_block); | |
| 234 | } else { | ||
| 235 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | MPI_Send(a_block.data(), static_cast<int>(a_block.size()), MPI_DOUBLE, owner_rank, kTagA + virtual_rank, |
| 236 | MPI_COMM_WORLD); | ||
| 237 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | MPI_Send(b_block.data(), static_cast<int>(b_block.size()), MPI_DOUBLE, owner_rank, kTagB + virtual_rank, |
| 238 | MPI_COMM_WORLD); | ||
| 239 | } | ||
| 240 | } | ||
| 241 | } | ||
| 242 | } else { | ||
| 243 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 7 times.
|
21 | for (auto &cell : local_cells) { |
| 244 | 14 | MPI_Recv(cell.a.data(), static_cast<int>(cell.a.size()), MPI_DOUBLE, 0, kTagA + cell.virtual_rank, MPI_COMM_WORLD, | |
| 245 | MPI_STATUS_IGNORE); | ||
| 246 | 14 | MPI_Recv(cell.b.data(), static_cast<int>(cell.b.size()), MPI_DOUBLE, 0, kTagB + cell.virtual_rank, MPI_COMM_WORLD, | |
| 247 | MPI_STATUS_IGNORE); | ||
| 248 | } | ||
| 249 | } | ||
| 250 | 14 | } | |
| 251 | |||
| 252 | 14 | void ShiftBlocksCannon(std::vector<LocalCell> &local_cells, const std::vector<int> &owner_by_rank, std::size_t block_n, | |
| 253 | int grid_dim, int world_rank) { | ||
| 254 | constexpr int kShiftATagBase = 3000; | ||
| 255 | constexpr int kShiftBTagBase = 5000; | ||
| 256 | |||
| 257 | 14 | std::vector<int> virtual_ranks(local_cells.size(), 0); | |
| 258 |
2/6✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 14 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
14 | std::vector<int> local_index_by_rank(static_cast<std::size_t>(grid_dim * grid_dim), -1); |
| 259 |
2/6✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 14 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
14 | std::vector<std::vector<double>> current_a(local_cells.size()); |
| 260 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | std::vector<std::vector<double>> current_b(local_cells.size()); |
| 261 |
2/4✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 14 times.
✗ Branch 5 not taken.
|
14 | std::vector<std::vector<double>> next_a(local_cells.size(), std::vector<double>(block_n * block_n, 0.0)); |
| 262 |
2/4✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 14 times.
✗ Branch 5 not taken.
|
14 | std::vector<std::vector<double>> next_b(local_cells.size(), std::vector<double>(block_n * block_n, 0.0)); |
| 263 | |||
| 264 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (std::size_t idx = 0; idx < local_cells.size(); ++idx) { |
| 265 | 28 | virtual_ranks[idx] = local_cells[idx].virtual_rank; | |
| 266 | 28 | local_index_by_rank[static_cast<std::size_t>(local_cells[idx].virtual_rank)] = static_cast<int>(idx); | |
| 267 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | current_a[idx] = local_cells[idx].a; |
| 268 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | current_b[idx] = local_cells[idx].b; |
| 269 | } | ||
| 270 | |||
| 271 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | ExchangePhase(current_a, next_a, virtual_ranks, owner_by_rank, local_index_by_rank, grid_dim, world_rank, |
| 272 | kShiftATagBase, true); | ||
| 273 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | ExchangePhase(current_b, next_b, virtual_ranks, owner_by_rank, local_index_by_rank, grid_dim, world_rank, |
| 274 | kShiftBTagBase, false); | ||
| 275 | |||
| 276 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (std::size_t idx = 0; idx < local_cells.size(); ++idx) { |
| 277 | 28 | local_cells[idx].a = std::move(next_a[idx]); | |
| 278 | 28 | local_cells[idx].b = std::move(next_b[idx]); | |
| 279 | } | ||
| 280 | 28 | } | |
| 281 | |||
| 282 | 14 | void GatherResultBlocks(const std::vector<LocalCell> &local_cells, std::vector<double> &global_matrix, | |
| 283 | const std::vector<int> &local_index_by_rank, std::size_t global_n, std::size_t block_n, | ||
| 284 | int grid_dim, int world_rank, int world_size) { | ||
| 285 | constexpr int kTagC = 7000; | ||
| 286 | |||
| 287 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 7 times.
|
14 | if (world_rank == 0) { |
| 288 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 7 times.
|
35 | for (int virtual_rank = 0; virtual_rank < grid_dim * grid_dim; ++virtual_rank) { |
| 289 | 28 | const int row = virtual_rank / grid_dim; | |
| 290 | 28 | const int col = virtual_rank % grid_dim; | |
| 291 | const int owner_rank = GetOwnerRank(virtual_rank, world_size); | ||
| 292 | |||
| 293 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 14 times.
|
28 | if (owner_rank == 0) { |
| 294 | 14 | const int local_idx = local_index_by_rank[static_cast<std::size_t>(virtual_rank)]; | |
| 295 | 14 | InsertLocalBlock(local_cells[static_cast<std::size_t>(local_idx)].c, global_matrix, global_n, block_n, row, | |
| 296 | col); | ||
| 297 | } else { | ||
| 298 | 14 | std::vector<double> block(block_n * block_n, 0.0); | |
| 299 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | MPI_Recv(block.data(), static_cast<int>(block.size()), MPI_DOUBLE, owner_rank, kTagC + virtual_rank, |
| 300 | MPI_COMM_WORLD, MPI_STATUS_IGNORE); | ||
| 301 | InsertLocalBlock(block, global_matrix, global_n, block_n, row, col); | ||
| 302 | } | ||
| 303 | } | ||
| 304 | } else { | ||
| 305 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 7 times.
|
21 | for (const auto &cell : local_cells) { |
| 306 | 14 | MPI_Send(cell.c.data(), static_cast<int>(cell.c.size()), MPI_DOUBLE, 0, kTagC + cell.virtual_rank, | |
| 307 | MPI_COMM_WORLD); | ||
| 308 | } | ||
| 309 | } | ||
| 310 | 14 | } | |
| 311 | |||
| 312 | } // namespace | ||
| 313 | |||
| 314 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | CheremkhinAMatrMultCannonAlgALL::CheremkhinAMatrMultCannonAlgALL(const InType &in) { |
| 315 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 316 | GetInput() = in; | ||
| 317 | GetOutput() = {}; | ||
| 318 | 14 | } | |
| 319 | |||
| 320 | 14 | bool CheremkhinAMatrMultCannonAlgALL::ValidationImpl() { | |
| 321 | 14 | const std::size_t n = std::get<0>(GetInput()); | |
| 322 | const auto &a = std::get<1>(GetInput()); | ||
| 323 | const auto &b = std::get<2>(GetInput()); | ||
| 324 |
3/6✓ Branch 0 taken 14 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 14 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 14 times.
|
14 | return n > 0 && a.size() == n * n && b.size() == n * n; |
| 325 | } | ||
| 326 | |||
| 327 | 14 | bool CheremkhinAMatrMultCannonAlgALL::PreProcessingImpl() { | |
| 328 | GetOutput() = {}; | ||
| 329 | 14 | return true; | |
| 330 | } | ||
| 331 | |||
| 332 | 14 | bool CheremkhinAMatrMultCannonAlgALL::RunImpl() { | |
| 333 | 14 | const std::size_t n = std::get<0>(GetInput()); | |
| 334 | const auto &a_in = std::get<1>(GetInput()); | ||
| 335 | const auto &b_in = std::get<2>(GetInput()); | ||
| 336 | 14 | const int requested_threads = ppc::util::GetNumThreads(); | |
| 337 | 14 | int world_rank = 0; | |
| 338 | 14 | int world_size = 0; | |
| 339 | 14 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); | |
| 340 | 14 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); | |
| 341 | |||
| 342 | 14 | omp_set_num_threads(requested_threads); | |
| 343 | |||
| 344 | 14 | const int q = ChooseVirtualGridSize(world_size); | |
| 345 | 14 | const int virtual_size = q * q; | |
| 346 | 14 | const std::size_t block_n = CeilDiv(n, static_cast<std::size_t>(q)); | |
| 347 | 14 | const std::size_t padded_n = block_n * static_cast<std::size_t>(q); | |
| 348 | |||
| 349 | 14 | std::vector<int> owner_by_rank(static_cast<std::size_t>(virtual_size), 0); | |
| 350 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 14 times.
|
70 | for (int virtual_rank = 0; virtual_rank < virtual_size; ++virtual_rank) { |
| 351 | 56 | owner_by_rank[static_cast<std::size_t>(virtual_rank)] = GetOwnerRank(virtual_rank, world_size); | |
| 352 | } | ||
| 353 | |||
| 354 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | const std::vector<int> owned_virtual_ranks = GetOwnedVirtualRanks(world_rank, world_size, q); |
| 355 |
1/4✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
14 | std::vector<int> local_index_by_rank(static_cast<std::size_t>(virtual_size), -1); |
| 356 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | std::vector<LocalCell> local_cells; |
| 357 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | local_cells.reserve(owned_virtual_ranks.size()); |
| 358 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (std::size_t idx = 0; idx < owned_virtual_ranks.size(); ++idx) { |
| 359 | 28 | const int virtual_rank = owned_virtual_ranks[idx]; | |
| 360 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | local_index_by_rank[static_cast<std::size_t>(virtual_rank)] = static_cast<int>(idx); |
| 361 | 28 | LocalCell cell; | |
| 362 | 28 | cell.virtual_rank = virtual_rank; | |
| 363 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | cell.a.assign(block_n * block_n, 0.0); |
| 364 |
1/2✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
|
28 | cell.b.assign(block_n * block_n, 0.0); |
| 365 |
2/4✓ Branch 1 taken 28 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 28 times.
✗ Branch 5 not taken.
|
28 | cell.c.assign(block_n * block_n, 0.0); |
| 366 | local_cells.push_back(std::move(cell)); | ||
| 367 | 28 | } | |
| 368 | |||
| 369 | 14 | std::vector<double> a_padded; | |
| 370 | 14 | std::vector<double> b_padded; | |
| 371 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 7 times.
|
14 | if (world_rank == 0) { |
| 372 |
1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
|
7 | a_padded.assign(padded_n * padded_n, 0.0); |
| 373 |
1/2✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
|
7 | b_padded.assign(padded_n * padded_n, 0.0); |
| 374 | CopyGlobalToPadded(a_in, a_padded, n, padded_n); | ||
| 375 | CopyGlobalToPadded(b_in, b_padded, n, padded_n); | ||
| 376 | } | ||
| 377 | |||
| 378 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | DistributeInitiallyAlignedBlocks(a_padded, b_padded, local_cells, local_index_by_rank, padded_n, block_n, q, |
| 379 | world_rank, world_size); | ||
| 380 | |||
| 381 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 14 times.
|
42 | for (int step = 0; step < q; ++step) { |
| 382 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 28 times.
|
84 | for (auto &cell : local_cells) { |
| 383 | 56 | MulAddLocal(cell.a, cell.b, cell.c, block_n); | |
| 384 | } | ||
| 385 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 14 times.
|
28 | if (step + 1 < q) { |
| 386 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | ShiftBlocksCannon(local_cells, owner_by_rank, block_n, q, world_rank); |
| 387 | } | ||
| 388 | } | ||
| 389 | |||
| 390 |
1/4✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
14 | std::vector<double> c_padded(padded_n * padded_n, 0.0); |
| 391 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | GatherResultBlocks(local_cells, c_padded, local_index_by_rank, padded_n, block_n, q, world_rank, world_size); |
| 392 |
1/2✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
|
14 | MPI_Bcast(c_padded.data(), static_cast<int>(c_padded.size()), MPI_DOUBLE, 0, MPI_COMM_WORLD); |
| 393 | |||
| 394 |
1/4✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
14 | std::vector<double> out(n * n, 0.0); |
| 395 | CopyPaddedToGlobal(c_padded, out, padded_n, n); | ||
| 396 | |||
| 397 | GetOutput() = std::move(out); | ||
| 398 | 14 | return true; | |
| 399 | 14 | } | |
| 400 | |||
| 401 | 14 | bool CheremkhinAMatrMultCannonAlgALL::PostProcessingImpl() { | |
| 402 | 14 | return true; | |
| 403 | } | ||
| 404 | |||
| 405 | } // namespace cheremkhin_a_matr_mult_cannon_alg | ||
| 406 |