| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "ashihmin_d_mult_matr_crs/all/include/ops_all.hpp" | ||
| 2 | |||
| 3 | #include <mpi.h> | ||
| 4 | #include <omp.h> | ||
| 5 | #include <tbb/tbb.h> | ||
| 6 | |||
| 7 | #include <algorithm> | ||
| 8 | #include <cmath> | ||
| 9 | #include <map> | ||
| 10 | #include <thread> | ||
| 11 | #include <vector> | ||
| 12 | |||
| 13 | #include "ashihmin_d_mult_matr_crs/common/include/common.hpp" | ||
| 14 | #include "util/include/util.hpp" | ||
| 15 | |||
| 16 | namespace ashihmin_d_mult_matr_crs { | ||
| 17 | |||
| 18 |
1/2✓ Branch 2 taken 12 times.
✗ Branch 3 not taken.
|
12 | AshihminDMultMatrCrsALL::AshihminDMultMatrCrsALL(const InType &in) { |
| 19 | SetTypeOfTask(GetStaticTypeOfTask()); | ||
| 20 | GetInput() = in; | ||
| 21 | 12 | } | |
| 22 | |||
| 23 | 12 | bool AshihminDMultMatrCrsALL::ValidationImpl() { | |
| 24 | 12 | return GetInput().first.cols == GetInput().second.rows; | |
| 25 | } | ||
| 26 | |||
| 27 | 12 | bool AshihminDMultMatrCrsALL::PreProcessingImpl() { | |
| 28 | auto &matrix_c = GetOutput(); | ||
| 29 | |||
| 30 | 12 | matrix_c.rows = GetInput().first.rows; | |
| 31 | 12 | matrix_c.cols = GetInput().second.cols; | |
| 32 | 12 | return true; | |
| 33 | } | ||
| 34 | |||
| 35 | 12 | void AshihminDMultMatrCrsALL::MultiplyRow(int global_row_idx, int local_idx, const CRSMatrix &matrix_a, | |
| 36 | const CRSMatrix &matrix_b, std::vector<std::vector<int>> &local_cols, | ||
| 37 | std::vector<std::vector<double>> &local_vals) { | ||
| 38 | std::map<int, double> row_accumulator; | ||
| 39 |
2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 12 times.
|
25 | for (int j = matrix_a.row_ptr[global_row_idx]; j < matrix_a.row_ptr[global_row_idx + 1]; ++j) { |
| 40 | 13 | int col_a = matrix_a.col_index[j]; | |
| 41 | 13 | double val_a = matrix_a.values[j]; | |
| 42 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 13 times.
|
33 | for (int k = matrix_b.row_ptr[col_a]; k < matrix_b.row_ptr[col_a + 1]; ++k) { |
| 43 |
1/2✓ Branch 1 taken 20 times.
✗ Branch 2 not taken.
|
20 | row_accumulator[matrix_b.col_index[k]] += val_a * matrix_b.values[k]; |
| 44 | } | ||
| 45 | } | ||
| 46 | |||
| 47 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 12 times.
|
27 | for (const auto &entry : row_accumulator) { |
| 48 |
1/2✓ Branch 0 taken 15 times.
✗ Branch 1 not taken.
|
15 | if (std::abs(entry.second) > 1e-15) { |
| 49 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 15 times.
|
15 | local_cols[local_idx].push_back(entry.first); |
| 50 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 15 times.
|
15 | local_vals[local_idx].push_back(entry.second); |
| 51 | } | ||
| 52 | } | ||
| 53 | 12 | } | |
| 54 | |||
| 55 | 12 | bool AshihminDMultMatrCrsALL::RunImpl() { | |
| 56 | 12 | const auto &matrix_a = GetInput().first; | |
| 57 | 12 | const auto &matrix_b = GetInput().second; | |
| 58 | auto &matrix_c = GetOutput(); | ||
| 59 | |||
| 60 | 12 | int rank = 0; | |
| 61 | 12 | int size = 0; | |
| 62 | 12 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); | |
| 63 | 12 | MPI_Comm_size(MPI_COMM_WORLD, &size); | |
| 64 | |||
| 65 | 12 | int rows_a = matrix_a.rows; | |
| 66 | 12 | int base_rows = rows_a / size; | |
| 67 | 12 | int rem = rows_a % size; | |
| 68 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 2 times.
|
12 | int my_start = (rank * base_rows) + std::min(rank, rem); |
| 69 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 2 times.
|
12 | int my_end = my_start + base_rows + (rank < rem ? 1 : 0); |
| 70 | 12 | int my_row_count = my_end - my_start; | |
| 71 | |||
| 72 | 12 | std::vector<std::vector<int>> local_cols(my_row_count); | |
| 73 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | std::vector<std::vector<double>> local_vals(my_row_count); |
| 74 | |||
| 75 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | int thread_count = ppc::util::GetNumThreads(); |
| 76 | 12 | std::vector<std::thread> threads; | |
| 77 | |||
| 78 | 12 | auto compute_rows = [&](int start_idx, int end_idx) { | |
| 79 | tbb::parallel_for(start_idx, end_idx, | ||
| 80 | 24 | [&](int i) { MultiplyRow(my_start + i, i, matrix_a, matrix_b, local_cols, local_vals); }); | |
| 81 | 24 | }; | |
| 82 | |||
| 83 | 12 | int stl_chunk = (my_row_count + thread_count - 1) / thread_count; | |
| 84 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 12 times.
|
36 | for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) { |
| 85 | 24 | int start_chunk = thread_idx * stl_chunk; | |
| 86 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | int end_chunk = std::min(start_chunk + stl_chunk, my_row_count); |
| 87 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | if (start_chunk < end_chunk) { |
| 88 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | threads.emplace_back(compute_rows, start_chunk, end_chunk); |
| 89 | } | ||
| 90 | } | ||
| 91 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | for (auto &th : threads) { |
| 92 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | th.join(); |
| 93 | } | ||
| 94 | |||
| 95 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | std::vector<int> my_nnz_per_row(my_row_count); |
| 96 | 12 | #pragma omp parallel for default(none) shared(my_nnz_per_row, local_cols, my_row_count) | |
| 97 | for (int i = 0; i < my_row_count; ++i) { | ||
| 98 | my_nnz_per_row[i] = static_cast<int>(local_cols[i].size()); | ||
| 99 | } | ||
| 100 | |||
| 101 | 12 | std::vector<int> my_flat_cols; | |
| 102 | 12 | std::vector<double> my_flat_vals; | |
| 103 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | for (int i = 0; i < my_row_count; ++i) { |
| 104 |
2/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 12 times.
✗ Branch 5 not taken.
|
12 | my_flat_cols.insert(my_flat_cols.end(), local_cols[i].begin(), local_cols[i].end()); |
| 105 | 12 | my_flat_vals.insert(my_flat_vals.end(), local_vals[i].begin(), local_vals[i].end()); | |
| 106 | } | ||
| 107 | |||
| 108 |
1/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
12 | std::vector<int> all_nnz_per_row(rows_a); |
| 109 |
1/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
12 | std::vector<int> recv_counts(size); |
| 110 |
1/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
12 | std::vector<int> displs(size); |
| 111 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 12 times.
|
36 | for (int i = 0; i < size; ++i) { |
| 112 |
4/4✓ Branch 0 taken 20 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 12 times.
|
44 | recv_counts[i] = (rows_a / size) + (i < (rows_a % size) ? 1 : 0); |
| 113 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 12 times.
|
24 | displs[i] = (i == 0) ? 0 : displs[i - 1] + recv_counts[i - 1]; |
| 114 | } | ||
| 115 | |||
| 116 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | MPI_Allgatherv(my_nnz_per_row.data(), my_row_count, MPI_INT, all_nnz_per_row.data(), recv_counts.data(), |
| 117 | displs.data(), MPI_INT, MPI_COMM_WORLD); | ||
| 118 | |||
| 119 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | matrix_c.row_ptr.assign(rows_a + 1, 0); |
| 120 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 12 times.
|
36 | for (int i = 0; i < rows_a; ++i) { |
| 121 | 24 | matrix_c.row_ptr[i + 1] = matrix_c.row_ptr[i] + all_nnz_per_row[i]; | |
| 122 | } | ||
| 123 | |||
| 124 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | matrix_c.col_index.resize(matrix_c.row_ptr[rows_a]); |
| 125 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | matrix_c.values.resize(matrix_c.row_ptr[rows_a]); |
| 126 | |||
| 127 |
1/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
12 | std::vector<int> val_recv_counts(size); |
| 128 |
1/4✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
|
12 | std::vector<int> val_displs(size); |
| 129 |
2/2✓ Branch 0 taken 24 times.
✓ Branch 1 taken 12 times.
|
36 | for (int i = 0; i < size; ++i) { |
| 130 | 24 | val_recv_counts[i] = matrix_c.row_ptr[displs[i] + recv_counts[i]] - matrix_c.row_ptr[displs[i]]; | |
| 131 | 24 | val_displs[i] = matrix_c.row_ptr[displs[i]]; | |
| 132 | } | ||
| 133 | |||
| 134 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | MPI_Allgatherv(my_flat_cols.data(), static_cast<int>(my_flat_cols.size()), MPI_INT, matrix_c.col_index.data(), |
| 135 | val_recv_counts.data(), val_displs.data(), MPI_INT, MPI_COMM_WORLD); | ||
| 136 |
1/2✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
|
12 | MPI_Allgatherv(my_flat_vals.data(), static_cast<int>(my_flat_vals.size()), MPI_DOUBLE, matrix_c.values.data(), |
| 137 | val_recv_counts.data(), val_displs.data(), MPI_DOUBLE, MPI_COMM_WORLD); | ||
| 138 | |||
| 139 | 12 | return true; | |
| 140 | 12 | } | |
| 141 | |||
| 142 | 12 | bool AshihminDMultMatrCrsALL::PostProcessingImpl() { | |
| 143 | 12 | return true; | |
| 144 | } | ||
| 145 | |||
| 146 | } // namespace ashihmin_d_mult_matr_crs | ||
| 147 |