5 #ifndef GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
6 #define GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
9 #include <ginkgo/core/base/array.hpp>
10 #include <ginkgo/core/base/index_set.hpp>
11 #include <ginkgo/core/base/lin_op.hpp>
12 #include <ginkgo/core/base/math.hpp>
13 #include <ginkgo/core/matrix/permutation.hpp>
14 #include <ginkgo/core/matrix/scaled_permutation.hpp>
// Template forward declarations preceding the Csr class.  Judging by the
// friend declarations further below, these declare Coo, Dense, Diagonal,
// Ell, Hybrid, Sellp, SparsityCsr, Fbcsr, CsrBuilder and Csr itself.
// NOTE(review): the declared class names are missing from this extraction
// (only the template headers survive) -- restore from the upstream header
// before compiling; this region is not valid C++ as-is.
21 template <
typename ValueType>
24 template <
typename ValueType>
27 template <
typename ValueType,
typename IndexType>
30 template <
typename ValueType,
typename IndexType>
33 template <
typename ValueType,
typename IndexType>
36 template <
typename ValueType,
typename IndexType>
39 template <
typename ValueType,
typename IndexType>
42 template <
typename ValueType,
typename IndexType>
45 template <
typename ValueType,
typename IndexType>
48 template <
typename ValueType,
typename IndexType>
// Defaulted template parameters: double-ish precision values, 32-bit indices.
55 template <
typename ValueType = default_precision,
typename IndexType =
int32>
100 template <
typename ValueType = default_precision,
typename IndexType =
int32>
// Fragment of the Csr<ValueType, IndexType> class declaration: part of the
// ConvertibleTo<> base list (precision/complex variants of Csr) plus the
// friend declarations and public typedefs.
102 public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
116 remove_complex<Csr<ValueType, IndexType>>>,
// Sibling matrix formats and the builder convert to/from Csr and need
// direct access to its internal arrays.
119 friend class Coo<ValueType, IndexType>;
120 friend class Dense<ValueType>;
122 friend class Ell<ValueType, IndexType>;
123 friend class Hybrid<ValueType, IndexType>;
124 friend class Sellp<ValueType, IndexType>;
126 friend class Fbcsr<ValueType, IndexType>;
127 friend class CsrBuilder<ValueType, IndexType>;
// Public typedefs mirroring the template parameters.
151 using value_type = ValueType;
152 using index_type = IndexType;
// Abstract interface of Csr::strategy_type (fragment).
// Computes the required size of the auxiliary srow array for nnz stored
// elements.  NOTE(review): "clac_size" is a misspelling of "calc_size";
// renaming a virtual would break every subclass and external override, so
// it must stay unless deprecated properly upstream.
202 virtual int64_t
clac_size(
const int64_t nnz) = 0;
// Deep-copies this strategy; each subclass returns its own concrete type.
208 virtual std::shared_ptr<strategy_type>
copy() = 0;
// Stores a human-readable strategy name (presumably read via get_name()).
// NOTE(review): `name` is taken by value but then copied; `std::move(name)`
// would save one allocation.
211 void set_name(std::string name) { name_ = name; }
// classical::process (fragment): scans row_ptrs (on a host copy if the data
// lives on a device) and records the maximum number of stored elements in
// any single row.
233 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
// True when row_ptrs already lives in host memory, so no copy is needed.
235 const bool is_mtx_on_host{host_mtx_exec ==
237 const index_type* row_ptrs{};
238 if (is_mtx_on_host) {
241 row_ptrs_host = mtx_row_ptrs;
// row_ptrs has num_rows + 1 entries, hence the "- 1".
244 auto num_rows = mtx_row_ptrs.
get_size() - 1;
245 max_length_per_row_ = 0;
246 for (
size_type i = 0; i < num_rows; i++) {
247 max_length_per_row_ = std::max(max_length_per_row_,
248 row_ptrs[i + 1] - row_ptrs[i]);
// The classical kernel needs no srow array, so its size is always 0.
252 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Result of the scan in process(); 0 until process() has run.
254 index_type get_max_length_per_row() const noexcept
256 return max_length_per_row_;
// NOTE(review): the clone does not carry over max_length_per_row_; it
// reports 0 until its own process() runs -- confirm this is intended.
259 std::shared_ptr<strategy_type>
copy()
override
261 return std::make_shared<classical>();
265 index_type max_length_per_row_;
// merge_path strategy (fragment): like classical it needs no srow data, so
// clac_size reports 0 and copy() returns a fresh, stateless instance.
284 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
286 std::shared_ptr<strategy_type>
copy()
override
288 return std::make_shared<merge_path>();
// cusparse strategy (fragment): presumably delegates SpMV to the vendor
// library, so no srow array is needed (clac_size == 0) and the strategy
// carries no state worth cloning.
309 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
311 std::shared_ptr<strategy_type>
copy()
override
313 return std::make_shared<cusparse>();
// sparselib strategy (fragment): vendor-library-backed like cusparse; no
// srow array needed, stateless copy.
333 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
335 std::shared_ptr<strategy_type>
copy()
override
337 return std::make_shared<sparselib>();
// load_balance strategy (fragment): appears to distribute rows over nwarps
// buckets of roughly equal nonzero count, recording bucket boundaries in
// the srow array.
// Delegating constructors: derive warp count / warp size from the executor.
363 :
load_balance(exec->get_num_warps(), exec->get_warp_size())
372 :
load_balance(exec->get_num_warps(), exec->get_warp_size(), false)
// DPC++ path: subgroup count with a fixed width of 32, tagged "intel".
383 :
load_balance(exec->get_num_subgroups(), 32, false,
"intel")
// Main constructor parameters (fragment): cuda_strategy selects NVIDIA vs
// AMD tuning; strategy_name == "intel" selects the DPC++ heuristics.
398 bool cuda_strategy =
true,
399 std::string strategy_name =
"none")
402 warp_size_(warp_size),
403 cuda_strategy_(cuda_strategy),
404 strategy_name_(strategy_name)
// process (fragment): copies srow/row_ptrs to the host if needed, fills
// the srow buckets, prefix-sums them, then writes srow back to the device.
413 auto host_srow_exec = mtx_srow->
get_executor()->get_master();
414 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
415 const bool is_srow_on_host{host_srow_exec ==
417 const bool is_mtx_on_host{host_mtx_exec ==
421 const index_type* row_ptrs{};
423 if (is_srow_on_host) {
426 srow_host = *mtx_srow;
429 if (is_mtx_on_host) {
432 row_ptrs_host = mtx_row_ptrs;
438 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
439 const auto num_elems = row_ptrs[num_rows];
// Guard: an empty matrix must not divide by zero when sizing buckets.
440 const auto bucket_divider =
441 num_elems > 0 ?
ceildiv(num_elems, warp_size_) : 1;
442 for (
size_type i = 0; i < num_rows; i++) {
446 if (bucket < nwarps) {
// Prefix sum turns per-bucket counts into starting offsets.
452 srow[i] += srow[i - 1];
454 if (!is_srow_on_host) {
455 *mtx_srow = srow_host;
// clac_size (fragment): scales the warp count by a multiple chosen from
// the order of magnitude of nnz; separate thresholds for the default
// (NVIDIA), the "intel" name, and -- under HIP -- AMD hardware.
462 if (warp_size_ > 0) {
464 if (nnz >= static_cast<int64_t>(2e8)) {
466 }
else if (nnz >= static_cast<int64_t>(2e7)) {
468 }
else if (nnz >= static_cast<int64_t>(2e6)) {
470 }
else if (nnz >= static_cast<int64_t>(2e5)) {
473 if (strategy_name_ ==
"intel") {
475 if (nnz >= static_cast<int64_t>(2e8)) {
477 }
else if (nnz >= static_cast<int64_t>(2e7)) {
// HIP (AMD) builds override the multiple with their own thresholds.
481 #if GINKGO_HIP_PLATFORM_HCC
482 if (!cuda_strategy_) {
484 if (nnz >= static_cast<int64_t>(1e7)) {
486 }
else if (nnz >= static_cast<int64_t>(1e6)) {
490 #endif // GINKGO_HIP_PLATFORM_HCC
492 auto nwarps = nwarps_ * multiple;
// copy(): clone with identical configuration (no per-matrix state).
499 std::shared_ptr<strategy_type>
copy()
override
501 return std::make_shared<load_balance>(
502 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
509 std::string strategy_name_;
// automatical strategy (fragment): picks a concrete strategy at process()
// time -- load_balance for large/irregular matrices, classical otherwise --
// using per-vendor nnz and row-length thresholds below.
516 const index_type nvidia_row_len_limit = 1024;
519 const index_type nvidia_nnz_limit{static_cast<index_type>(1e6)};
522 const index_type amd_row_len_limit = 768;
525 const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
528 const index_type intel_row_len_limit = 25600;
531 const index_type intel_nnz_limit{static_cast<index_type>(3e8)};
// Delegating constructors, mirroring load_balance's executor overloads.
551 :
automatical(exec->get_num_warps(), exec->get_warp_size())
560 :
automatical(exec->get_num_warps(), exec->get_warp_size(), false)
571 :
automatical(exec->get_num_subgroups(), 32, false,
"intel")
586 bool cuda_strategy =
true,
587 std::string strategy_name =
"none")
590 warp_size_(warp_size),
591 cuda_strategy_(cuda_strategy),
592 strategy_name_(strategy_name),
593 max_length_per_row_(0)
// process (fragment): select the vendor-specific limits first.
602 index_type nnz_limit = nvidia_nnz_limit;
603 index_type row_len_limit = nvidia_row_len_limit;
604 if (strategy_name_ ==
"intel") {
605 nnz_limit = intel_nnz_limit;
606 row_len_limit = intel_row_len_limit;
608 #if GINKGO_HIP_PLATFORM_HCC
609 if (!cuda_strategy_) {
610 nnz_limit = amd_nnz_limit;
611 row_len_limit = amd_row_len_limit;
613 #endif // GINKGO_HIP_PLATFORM_HCC
614 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
615 const bool is_mtx_on_host{host_mtx_exec ==
618 const index_type* row_ptrs{};
619 if (is_mtx_on_host) {
622 row_ptrs_host = mtx_row_ptrs;
625 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
// Too many nonzeros overall: delegate to a load_balance strategy.
626 if (row_ptrs[num_rows] > nnz_limit) {
628 cuda_strategy_, strategy_name_);
629 if (is_mtx_on_host) {
630 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
632 actual_strategy.
process(row_ptrs_host, mtx_srow);
634 this->set_name(actual_strategy.
get_name());
// Otherwise check the longest row against the row-length limit.
636 index_type maxnum = 0;
637 for (
size_type i = 0; i < num_rows; i++) {
638 maxnum = std::max(maxnum, row_ptrs[i + 1] - row_ptrs[i]);
640 if (maxnum > row_len_limit) {
642 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
643 if (is_mtx_on_host) {
644 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
646 actual_strategy.
process(row_ptrs_host, mtx_srow);
648 this->set_name(actual_strategy.
get_name());
// Small, regular matrix: fall back to classical and cache its max row
// length for later queries.
651 if (is_mtx_on_host) {
652 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
653 max_length_per_row_ =
654 actual_strategy.get_max_length_per_row();
656 actual_strategy.
process(row_ptrs_host, mtx_srow);
657 max_length_per_row_ =
658 actual_strategy.get_max_length_per_row();
660 this->set_name(actual_strategy.
get_name());
// clac_size (fragment): delegate sizing to an equivalent load_balance, the
// worst case among the strategies automatical may choose.
667 return std::make_shared<load_balance>(
668 nwarps_, warp_size_, cuda_strategy_, strategy_name_)
// Only meaningful after process() picked the classical path; 0 otherwise.
672 index_type get_max_length_per_row() const noexcept
674 return max_length_per_row_;
// copy(): clone with identical configuration; cached max_length_per_row_
// is not carried over (the clone starts at 0 via its constructor).
677 std::shared_ptr<strategy_type>
copy()
override
679 return std::make_shared<automatical>(
680 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
687 std::string strategy_name_;
688 index_type max_length_per_row_;
// Csr public interface (fragment): matrix-data I/O, transposition,
// permutation, sortedness query, strategy setter, scaling assertions and
// the create() factories.
726 void read(
const mat_data& data)
override;
728 void read(
const device_mat_data& data)
override;
// Rvalue overload: may take ownership of the device data instead of copying.
730 void read(device_mat_data&& data)
override;
732 void write(mat_data& data)
const override;
734 std::unique_ptr<LinOp>
transpose()
const override;
// 'invert' presumably selects applying the inverse permutation -- the full
// parameter lists are missing from this extraction; confirm upstream.
772 bool invert =
false)
const;
804 bool invert =
false)
const;
806 std::unique_ptr<LinOp>
permute(
841 bool is_sorted_by_column_index()
const;
// NOTE(review): copy() returns a prvalue, so std::move here is redundant
// (harmless, but clang-tidy performance-move-const-arg would flag it).
955 strategy_ = std::move(strategy->copy());
// Scaling factors must be 1x1 (scalar) operators.
968 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
981 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
// Factories: empty / sized / fully populated Csr.  A null strategy
// presumably falls back to make_default_strategy() -- TODO confirm in .cpp.
993 static std::unique_ptr<Csr>
create(std::shared_ptr<const Executor> exec,
994 std::shared_ptr<strategy_type> strategy);
1007 static std::unique_ptr<Csr>
create(
1008 std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1010 std::shared_ptr<strategy_type> strategy =
nullptr);
1031 static std::unique_ptr<Csr>
create(
1032 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1033 array<value_type> values, array<index_type> col_idxs,
1034 array<index_type> row_ptrs,
1035 std::shared_ptr<strategy_type> strategy =
nullptr);
// Deprecated initializer_list factory (fragment): the deprecation message
// below tells callers to construct gko::array arguments explicitly instead.
1041 template <
typename InputValueType,
typename InputColumnIndexType,
1042 typename InputRowPtrType>
1044 "explicitly construct the gko::array argument instead of passing "
1045 "initializer lists")
1047 std::shared_ptr<const
Executor> exec, const
dim<2>& size,
1048 std::initializer_list<InputValueType> values,
1049 std::initializer_list<InputColumnIndexType> col_idxs,
1050 std::initializer_list<InputRowPtrType> row_ptrs)
// const-view factory (fragment): wraps existing immutable arrays without
// copying; the && parameters transfer the view object, not the data.
1073 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1074 gko::detail::const_array_view<ValueType>&& values,
1075 gko::detail::const_array_view<IndexType>&& col_idxs,
1076 gko::detail::const_array_view<IndexType>&& row_ptrs,
1077 std::shared_ptr<strategy_type> strategy =
nullptr);
// Protected constructors (fragment): same shapes as the create() factories;
// construction goes through create() so callers get a unique_ptr.
1134 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1136 std::shared_ptr<strategy_type> strategy =
nullptr);
1138 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size,
1139 array<value_type> values, array<index_type> col_idxs,
1140 array<index_type> row_ptrs,
1141 std::shared_ptr<strategy_type> strategy =
nullptr);
// LinOp hooks: simple apply, and the extended alpha/beta form (presumably
// x = alpha * A * b + beta * x per the LinOp convention -- confirm).
1143 void apply_impl(
const LinOp* b,
LinOp* x)
const override;
1145 void apply_impl(
const LinOp* alpha,
const LinOp* b,
const LinOp* beta,
1146 LinOp* x)
const override;
// Chooses a default strategy from the executor's dynamic type: automatical
// for CUDA/HIP/DPC++ executors, classical for everything else.
1149 static std::shared_ptr<strategy_type> make_default_strategy(
1150 std::shared_ptr<const Executor> exec)
1152 auto cuda_exec = std::dynamic_pointer_cast<const CudaExecutor>(exec);
1153 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec);
1154 auto dpcpp_exec = std::dynamic_pointer_cast<const DpcppExecutor>(exec);
1155 std::shared_ptr<strategy_type> new_strategy;
1157 new_strategy = std::make_shared<automatical>(cuda_exec);
1158 }
else if (hip_exec) {
1159 new_strategy = std::make_shared<automatical>(hip_exec);
1160 }
else if (dpcpp_exec) {
1161 new_strategy = std::make_shared<automatical>(dpcpp_exec);
1163 new_strategy = std::make_shared<classical>();
1165 return new_strategy;
// Translates this matrix's strategy into the equivalent nested strategy
// type of the destination CsrType, re-deriving device parameters from the
// destination's executor (or, failing that, from this matrix's own).
1169 template <
typename CsrType>
1170 void convert_strategy_helper(CsrType* result)
const
1173 std::shared_ptr<typename CsrType::strategy_type> new_strat;
// Stateless strategies map 1:1 onto the destination's nested types.
1174 if (dynamic_cast<classical*>(strat)) {
1175 new_strat = std::make_shared<typename CsrType::classical>();
1176 }
else if (dynamic_cast<merge_path*>(strat)) {
1177 new_strat = std::make_shared<typename CsrType::merge_path>();
1178 }
else if (dynamic_cast<cusparse*>(strat)) {
1179 new_strat = std::make_shared<typename CsrType::cusparse>();
1180 }
else if (dynamic_cast<sparselib*>(strat)) {
1181 new_strat = std::make_shared<typename CsrType::sparselib>();
// load_balance / automatical depend on the executor, so rebuild them from
// the destination executor's properties when it is a device executor.
1183 auto rexec = result->get_executor();
1185 std::dynamic_pointer_cast<const CudaExecutor>(rexec);
1186 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(rexec);
1188 std::dynamic_pointer_cast<const DpcppExecutor>(rexec);
1189 auto lb = dynamic_cast<load_balance*>(strat);
1193 std::make_shared<typename CsrType::load_balance>(
1196 new_strat = std::make_shared<typename CsrType::automatical>(
1199 }
else if (hip_exec) {
1202 std::make_shared<typename CsrType::load_balance>(
1205 new_strat = std::make_shared<typename CsrType::automatical>(
1208 }
else if (dpcpp_exec) {
1211 std::make_shared<typename CsrType::load_balance>(
1214 new_strat = std::make_shared<typename CsrType::automatical>(
// Destination is not a device executor: fall back to this matrix's own
// executor so the device tuning survives a device-to-host conversion.
1219 auto this_cuda_exec =
1220 std::dynamic_pointer_cast<const CudaExecutor>(
1222 auto this_hip_exec =
1223 std::dynamic_pointer_cast<const HipExecutor>(
1225 auto this_dpcpp_exec =
1226 std::dynamic_pointer_cast<const DpcppExecutor>(
1228 if (this_cuda_exec) {
1231 std::make_shared<typename CsrType::load_balance>(
1235 std::make_shared<typename CsrType::automatical>(
1238 }
else if (this_hip_exec) {
1241 std::make_shared<typename CsrType::load_balance>(
1245 std::make_shared<typename CsrType::automatical>(
1248 }
else if (this_dpcpp_exec) {
1251 std::make_shared<typename CsrType::load_balance>(
1255 std::make_shared<typename CsrType::automatical>(
// Neither side is a device executor: classical is the safe default.
1263 new_strat = std::make_shared<typename CsrType::classical>();
1267 result->set_strategy(new_strat);
// Fragment of the srow-rebuild helper: the current strategy regenerates the
// srow array from row_ptrs_.
1276 strategy_->process(row_ptrs_, &srow_);
// Scaling hooks: virtual so subclasses can specialize; exact semantics
// (diagonal vs. uniform scaling) live in the .cpp -- confirm there.
1285 virtual void scale_impl(
const LinOp* alpha);
1293 virtual void inv_scale_impl(
const LinOp* alpha);
// CSR storage: values / column indices / row pointers, plus the strategy
// and its auxiliary srow array.
1296 std::shared_ptr<strategy_type> strategy_;
1297 array<value_type> values_;
1298 array<index_type> col_idxs_;
1299 array<index_type> row_ptrs_;
1300 array<index_type> srow_;
1302 void add_scaled_identity_impl(
const LinOp* a,
const LinOp* b)
override;
// Rebuilds an executor-dependent strategy (load_balance / automatical) on
// the result's current executor, e.g. after moving a matrix between devices.
1315 template <
typename ValueType,
typename IndexType>
1316 void strategy_rebuild_helper(Csr<ValueType, IndexType>* result)
1318 using load_balance =
typename Csr<ValueType, IndexType>::load_balance;
1319 using automatical =
typename Csr<ValueType, IndexType>::automatical;
1320 auto strategy = result->get_strategy();
1321 auto executor = result->get_executor();
1322 if (std::dynamic_pointer_cast<load_balance>(strategy)) {
1324 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1325 result->set_strategy(std::make_shared<load_balance>(exec));
1326 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1328 result->set_strategy(std::make_shared<load_balance>(exec));
// NOTE(review): executors other than CUDA/HIP keep the previous strategy
// unchanged here -- confirm that fall-through is intended.
1330 }
else if (std::dynamic_pointer_cast<automatical>(strategy)) {
1332 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1333 result->set_strategy(std::make_shared<automatical>(exec));
1334 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1336 result->set_strategy(std::make_shared<automatical>(exec));
1347 #endif // GKO_PUBLIC_CORE_MATRIX_CSR_HPP_