| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "volkov_a_sparse_mat_mul_ccs/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | #include <omp.h> | ||
| 5 | |||
| 6 | #include <algorithm> | ||
| 7 | #include <cmath> | ||
| 8 | #include <tuple> | ||
| 9 | #include <utility> | ||
| 10 | #include <vector> | ||
| 11 | |||
| 12 | #include "volkov_a_sparse_mat_mul_ccs/common/include/common.hpp" | ||
| 13 | |||
| 14 | namespace volkov_a_sparse_mat_mul_ccs { | ||
| 15 | |||
| 16 | namespace { | ||
| 17 | |||
| 18 | template <typename Matrix> | ||
| 19 | 30 | void BroadcastMatrix(Matrix &mat, int root, MPI_Comm comm) { | |
| 20 | 30 | int rank = 0; | |
| 21 | 30 | MPI_Comm_rank(comm, &rank); | |
| 22 | |||
| 23 | 30 | MPI_Bcast(&mat.rows_count, 1, MPI_INT, root, comm); | |
| 24 | 30 | MPI_Bcast(&mat.cols_count, 1, MPI_INT, root, comm); | |
| 25 | 30 | MPI_Bcast(&mat.non_zeros, 1, MPI_INT, root, comm); | |
| 26 | |||
| 27 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 15 times.
|
30 | if (rank != root) { |
| 28 | 15 | mat.col_ptrs.resize(mat.cols_count + 1); | |
| 29 | 15 | mat.row_indices.resize(mat.non_zeros); | |
| 30 | 15 | mat.values.resize(mat.non_zeros); | |
| 31 | } | ||
| 32 | |||
| 33 | 30 | MPI_Bcast(mat.col_ptrs.data(), mat.cols_count + 1, MPI_INT, root, comm); | |
| 34 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 2 times.
|
30 | if (mat.non_zeros > 0) { |
| 35 | 28 | MPI_Bcast(mat.row_indices.data(), mat.non_zeros, MPI_INT, root, comm); | |
| 36 | 28 | MPI_Bcast(mat.values.data(), mat.non_zeros, MPI_DOUBLE, root, comm); | |
| 37 | } | ||
| 38 | 30 | } | |
| 39 | |||
| 40 | template <typename MatrixType> | ||
| 41 | 9 | void ProcessColumnOmp(int col_idx, const MatrixType &matrix_a, const MatrixType &matrix_b, | |
| 42 | std::vector<double> &col_accumulator, std::vector<int> &local_row_indices, | ||
| 43 | std::vector<double> &local_values) { | ||
| 44 | 9 | int b_start = matrix_b.col_ptrs[col_idx]; | |
| 45 | 9 | int b_end = matrix_b.col_ptrs[col_idx + 1]; | |
| 46 | |||
| 47 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 9 times.
|
21 | for (int k = b_start; k < b_end; ++k) { |
| 48 | 12 | int b_row = matrix_b.row_indices[k]; | |
| 49 | 12 | double b_val = matrix_b.values[k]; | |
| 50 | |||
| 51 | 12 | int a_start = matrix_a.col_ptrs[b_row]; | |
| 52 | 12 | int a_end = matrix_a.col_ptrs[b_row + 1]; | |
| 53 | |||
| 54 |
2/2✓ Branch 0 taken 14 times.
✓ Branch 1 taken 12 times.
|
26 | for (int idx = a_start; idx < a_end; ++idx) { |
| 55 | 14 | int a_row = matrix_a.row_indices[idx]; | |
| 56 | 14 | double a_val = matrix_a.values[idx]; | |
| 57 | 14 | col_accumulator[a_row] += a_val * b_val; | |
| 58 | } | ||
| 59 | } | ||
| 60 | |||
| 61 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 9 times.
|
29 | for (int i = 0; i < matrix_a.rows_count; ++i) { |
| 62 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 8 times.
|
20 | if (std::abs(col_accumulator[i]) > 1e-10) { |
| 63 | local_row_indices.push_back(i); | ||
| 64 | local_values.push_back(col_accumulator[i]); | ||
| 65 | } | ||
| 66 | 20 | col_accumulator[i] = 0.0; | |
| 67 | } | ||
| 68 | 9 | } | |
| 69 | |||
| 70 | 5 | void FlattenLocalData(int my_cols_count, const std::vector<std::vector<int>> &local_row_indices, | |
| 71 | const std::vector<std::vector<double>> &local_values, std::vector<int> &my_col_sizes, int &my_nnz, | ||
| 72 | std::vector<int> &my_rows_flat, std::vector<double> &my_vals_flat) { | ||
| 73 | 5 | my_nnz = 0; | |
| 74 | 5 | my_col_sizes.resize(my_cols_count); | |
| 75 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 5 times.
|
9 | for (int i = 0; i < my_cols_count; ++i) { |
| 76 | 4 | my_col_sizes[i] = static_cast<int>(local_row_indices[i].size()); | |
| 77 | 4 | my_nnz += my_col_sizes[i]; | |
| 78 | } | ||
| 79 | |||
| 80 | 5 | my_rows_flat.resize(my_nnz); | |
| 81 | 5 | my_vals_flat.resize(my_nnz); | |
| 82 | int offset = 0; | ||
| 83 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 5 times.
|
9 | for (int i = 0; i < my_cols_count; ++i) { |
| 84 | 4 | std::copy(local_row_indices[i].begin(), local_row_indices[i].end(), my_rows_flat.begin() + offset); | |
| 85 | 4 | std::copy(local_values[i].begin(), local_values[i].end(), my_vals_flat.begin() + offset); | |
| 86 | 4 | offset += my_col_sizes[i]; | |
| 87 | } | ||
| 88 | 5 | } | |
| 89 | |||
| 90 | 5 | void SendResultsToRoot(int my_cols_count, const std::vector<int> &my_col_sizes, int my_nnz, | |
| 91 | const std::vector<int> &my_rows_flat, const std::vector<double> &my_vals_flat) { | ||
| 92 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 1 times.
|
5 | if (my_cols_count > 0) { |
| 93 | 4 | MPI_Send(my_col_sizes.data(), my_cols_count, MPI_INT, 0, 0, MPI_COMM_WORLD); | |
| 94 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 1 times.
|
4 | if (my_nnz > 0) { |
| 95 | 3 | MPI_Send(my_rows_flat.data(), my_nnz, MPI_INT, 0, 1, MPI_COMM_WORLD); | |
| 96 | 3 | MPI_Send(my_vals_flat.data(), my_nnz, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD); | |
| 97 | } | ||
| 98 | } | ||
| 99 | 5 | } | |
| 100 | |||
| 101 | 5 | void ReceiveWorkerResults(int world_size, int chunk, int rem, std::vector<std::vector<int>> &all_row_indices, | |
| 102 | std::vector<std::vector<double>> &all_values) { | ||
| 103 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | for (int proc = 1; proc < world_size; ++proc) { |
| 104 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | int proc_start_col = (proc * chunk) + std::min(proc, rem); |
| 105 |
1/2✓ Branch 0 taken 5 times.
✗ Branch 1 not taken.
|
5 | int proc_end_col = proc_start_col + chunk + (proc < rem ? 1 : 0); |
| 106 | 5 | int proc_cols_count = proc_end_col - proc_start_col; | |
| 107 | |||
| 108 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
|
5 | if (proc_cols_count <= 0) { |
| 109 | 1 | continue; | |
| 110 | } | ||
| 111 | |||
| 112 |
1/2✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
|
4 | std::vector<int> proc_col_sizes(proc_cols_count); |
| 113 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | MPI_Recv(proc_col_sizes.data(), proc_cols_count, MPI_INT, proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); |
| 114 | |||
| 115 | int proc_nnz = 0; | ||
| 116 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | for (int s : proc_col_sizes) { |
| 117 | 4 | proc_nnz += s; | |
| 118 | } | ||
| 119 | |||
| 120 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<int> proc_rows(proc_nnz); |
| 121 |
1/4✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
4 | std::vector<double> proc_vals(proc_nnz); |
| 122 | |||
| 123 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 1 times.
|
4 | if (proc_nnz > 0) { |
| 124 |
1/2✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
|
3 | MPI_Recv(proc_rows.data(), proc_nnz, MPI_INT, proc, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); |
| 125 |
1/2✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
|
3 | MPI_Recv(proc_vals.data(), proc_nnz, MPI_DOUBLE, proc, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); |
| 126 | } | ||
| 127 | |||
| 128 | int curr_offset = 0; | ||
| 129 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 4 times.
|
8 | for (int i = 0; i < proc_cols_count; ++i) { |
| 130 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | all_row_indices[proc_start_col + i].resize(proc_col_sizes[i]); |
| 131 |
1/2✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
|
4 | all_values[proc_start_col + i].resize(proc_col_sizes[i]); |
| 132 | |||
| 133 | 4 | std::copy(proc_rows.begin() + curr_offset, proc_rows.begin() + curr_offset + proc_col_sizes[i], | |
| 134 | all_row_indices[proc_start_col + i].begin()); | ||
| 135 | 4 | std::copy(proc_vals.begin() + curr_offset, proc_vals.begin() + curr_offset + proc_col_sizes[i], | |
| 136 | all_values[proc_start_col + i].begin()); | ||
| 137 | |||
| 138 | 4 | curr_offset += proc_col_sizes[i]; | |
| 139 | } | ||
| 140 | } | ||
| 141 | 5 | } | |
| 142 | |||
| 143 | template <typename MatrixType> | ||
| 144 | 5 | void AssembleFinalMatrix(int total_cols, const std::vector<std::vector<int>> &all_row_indices, | |
| 145 | const std::vector<std::vector<double>> &all_values, MatrixType &matrix_c) { | ||
| 146 | 5 | matrix_c.col_ptrs.assign(total_cols + 1, 0); | |
| 147 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 5 times.
|
14 | for (int j = 0; j < total_cols; ++j) { |
| 148 | 9 | matrix_c.col_ptrs[j + 1] = matrix_c.col_ptrs[j] + static_cast<int>(all_row_indices[j].size()); | |
| 149 | } | ||
| 150 | |||
| 151 | 5 | matrix_c.non_zeros = matrix_c.col_ptrs.back(); | |
| 152 | 5 | matrix_c.row_indices.resize(matrix_c.non_zeros); | |
| 153 | 5 | matrix_c.values.resize(matrix_c.non_zeros); | |
| 154 | |||
| 155 | 5 | #pragma omp parallel for schedule(static) default(none) shared(total_cols, matrix_c, all_row_indices, all_values) | |
| 156 | for (int j = 0; j < total_cols; ++j) { | ||
| 157 | int offset_c = matrix_c.col_ptrs[j]; | ||
| 158 | int size = static_cast<int>(all_row_indices[j].size()); | ||
| 159 | for (int k = 0; k < size; ++k) { | ||
| 160 | matrix_c.row_indices[offset_c + k] = all_row_indices[j][k]; | ||
| 161 | matrix_c.values[offset_c + k] = all_values[j][k]; | ||
| 162 | } | ||
| 163 | } | ||
| 164 | 5 | } | |
| 165 | |||
| 166 | template <typename MatrixType> | ||
| 167 | 5 | void GatherResultsToRoot(int total_cols, int world_size, int chunk, int rem, int start_col, int my_cols_count, | |
| 168 | std::vector<std::vector<int>> &local_row_indices, | ||
| 169 | std::vector<std::vector<double>> &local_values, MatrixType &matrix_c) { | ||
| 170 | 5 | std::vector<std::vector<int>> all_row_indices(total_cols); | |
| 171 |
1/2✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
|
5 | std::vector<std::vector<double>> all_values(total_cols); |
| 172 | |||
| 173 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | for (int i = 0; i < my_cols_count; ++i) { |
| 174 | 5 | all_row_indices[start_col + i] = std::move(local_row_indices[i]); | |
| 175 | all_values[start_col + i] = std::move(local_values[i]); | ||
| 176 | } | ||
| 177 | |||
| 178 |
1/2✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
|
5 | ReceiveWorkerResults(world_size, chunk, rem, all_row_indices, all_values); |
| 179 | |||
| 180 |
1/2✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
|
5 | AssembleFinalMatrix(total_cols, all_row_indices, all_values, matrix_c); |
| 181 | 5 | } | |
| 182 | |||
| 183 | } // namespace | ||
| 184 | |||
| 185 |
1/2✓ Branch 2 taken 10 times.
✗ Branch 3 not taken.
|
10 | VolkovASparseMatMulCcsAll::VolkovASparseMatMulCcsAll(const InType &in) { |
| 186 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 187 | GetInput() = in; | ||
| 188 | 10 | } | |
| 189 | |||
| 190 | 10 | bool VolkovASparseMatMulCcsAll::ValidationImpl() { | |
| 191 | 10 | int world_rank = 0; | |
| 192 | 10 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); | |
| 193 | |||
| 194 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (world_rank == 0) { |
| 195 | const auto &matrix_a = std::get<0>(GetInput()); | ||
| 196 | const auto &matrix_b = std::get<1>(GetInput()); | ||
| 197 | 5 | return (matrix_a.cols_count == matrix_b.rows_count); | |
| 198 | } | ||
| 199 | return true; | ||
| 200 | } | ||
| 201 | |||
| 202 | 10 | bool VolkovASparseMatMulCcsAll::PreProcessingImpl() { | |
| 203 | 10 | return true; | |
| 204 | } | ||
| 205 | |||
| 206 | 10 | bool VolkovASparseMatMulCcsAll::RunImpl() { | |
| 207 | 10 | int world_rank = 0; | |
| 208 | 10 | int world_size = 0; | |
| 209 | 10 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); | |
| 210 | 10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); | |
| 211 | |||
| 212 | auto &matrix_a = std::get<0>(GetInput()); | ||
| 213 | auto &matrix_b = std::get<1>(GetInput()); | ||
| 214 | auto &matrix_c = GetOutput(); | ||
| 215 | |||
| 216 | 10 | BroadcastMatrix(matrix_a, 0, MPI_COMM_WORLD); | |
| 217 | 10 | BroadcastMatrix(matrix_b, 0, MPI_COMM_WORLD); | |
| 218 | |||
| 219 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (world_rank == 0) { |
| 220 | 5 | matrix_c.rows_count = matrix_a.rows_count; | |
| 221 | 5 | matrix_c.cols_count = matrix_b.cols_count; | |
| 222 | } | ||
| 223 | |||
| 224 | 10 | int total_cols = matrix_b.cols_count; | |
| 225 | 10 | int chunk = total_cols / world_size; | |
| 226 | 10 | int rem = total_cols % world_size; | |
| 227 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 1 times.
|
10 | int start_col = (world_rank * chunk) + std::min(world_rank, rem); |
| 228 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 1 times.
|
10 | int end_col = start_col + chunk + (world_rank < rem ? 1 : 0); |
| 229 | 10 | int my_cols_count = end_col - start_col; | |
| 230 | |||
| 231 | 10 | std::vector<std::vector<int>> local_row_indices(my_cols_count); | |
| 232 |
1/2✓ Branch 1 taken 10 times.
✗ Branch 2 not taken.
|
10 | std::vector<std::vector<double>> local_values(my_cols_count); |
| 233 | |||
| 234 | 10 | #pragma omp parallel default(none) shared(matrix_a, matrix_b, start_col, end_col, local_row_indices, local_values) | |
| 235 | { | ||
| 236 | std::vector<double> col_accumulator(matrix_a.rows_count, 0.0); | ||
| 237 | #pragma omp for schedule(dynamic) | ||
| 238 | for (int j = start_col; j < end_col; ++j) { | ||
| 239 | int local_j = j - start_col; | ||
| 240 | ProcessColumnOmp(j, matrix_a, matrix_b, col_accumulator, local_row_indices[local_j], local_values[local_j]); | ||
| 241 | } | ||
| 242 | } | ||
| 243 | |||
| 244 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 5 times.
|
10 | if (world_rank == 0) { |
| 245 |
1/2✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
|
5 | GatherResultsToRoot(total_cols, world_size, chunk, rem, start_col, my_cols_count, local_row_indices, local_values, |
| 246 | matrix_c); | ||
| 247 | } else { | ||
| 248 | 5 | int my_nnz = 0; | |
| 249 | 5 | std::vector<int> my_col_sizes; | |
| 250 | 5 | std::vector<int> my_rows_flat; | |
| 251 | 5 | std::vector<double> my_vals_flat; | |
| 252 | |||
| 253 |
1/2✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
|
5 | FlattenLocalData(my_cols_count, local_row_indices, local_values, my_col_sizes, my_nnz, my_rows_flat, my_vals_flat); |
| 254 |
1/2✓ Branch 1 taken 5 times.
✗ Branch 2 not taken.
|
5 | SendResultsToRoot(my_cols_count, my_col_sizes, my_nnz, my_rows_flat, my_vals_flat); |
| 255 | } | ||
| 256 | |||
| 257 | 10 | return true; | |
| 258 | 10 | } | |
| 259 | |||
| 260 | 10 | bool VolkovASparseMatMulCcsAll::PostProcessingImpl() { | |
| 261 | auto &matrix_c = GetOutput(); | ||
| 262 | 10 | BroadcastMatrix(matrix_c, 0, MPI_COMM_WORLD); | |
| 263 | 10 | return true; | |
| 264 | } | ||
| 265 | |||
| 266 | } // namespace volkov_a_sparse_mat_mul_ccs | ||
| 267 |