5 #ifndef GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
6 #define GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
#include <utility>

9 #include <ginkgo/core/base/array.hpp>
10 #include <ginkgo/core/base/index_set.hpp>
11 #include <ginkgo/core/base/lin_op.hpp>
12 #include <ginkgo/core/base/math.hpp>
13 #include <ginkgo/core/matrix/permutation.hpp>
14 #include <ginkgo/core/matrix/scaled_permutation.hpp>
21 template <
typename ValueType>
24 template <
typename ValueType>
27 template <
typename ValueType,
typename IndexType>
30 template <
typename ValueType,
typename IndexType>
33 template <
typename ValueType,
typename IndexType>
36 template <
typename ValueType,
typename IndexType>
39 template <
typename ValueType,
typename IndexType>
42 template <
typename ValueType,
typename IndexType>
45 template <
typename ValueType,
typename IndexType>
48 template <
typename ValueType,
typename IndexType>
55 template <
typename ValueType = default_precision,
typename IndexType =
int32>
100 template <
typename ValueType = default_precision,
typename IndexType =
int32>
// NOTE(review): fragmentary view — the enclosing `class Csr` declaration is
// only partially visible; comments describe just what these lines show.
// Csr converts to the same matrix at the next value precision.
102 public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
103 #if GINKGO_ENABLE_HALF
// With half support, a two-step precision change is also a conversion target.
105 Csr<next_precision<next_precision<ValueType>>, IndexType>>,
120 remove_complex<Csr<ValueType, IndexType>>>,
// Friend formats/builders that convert to/from Csr need internal access.
123 friend class Coo<ValueType, IndexType>;
124 friend class Dense<ValueType>;
126 friend class Ell<ValueType, IndexType>;
127 friend class Hybrid<ValueType, IndexType>;
128 friend class Sellp<ValueType, IndexType>;
130 friend class Fbcsr<ValueType, IndexType>;
131 friend class CsrBuilder<ValueType, IndexType>;
// Public aliases re-exporting the template parameters.
155 using value_type = ValueType;
156 using index_type = IndexType;
// Computes the required size of the srow helper array for `nnz` nonzeros.
// NOTE(review): `clac_size` appears to be a typo of `calc_size`, but it is
// part of the public strategy interface — do not rename here; confirm the
// intended spelling upstream first.
206 virtual int64_t
clac_size(
const int64_t nnz) = 0;
// Virtual copy: creates a new strategy of the same dynamic type.
212 virtual std::shared_ptr<strategy_type>
copy() = 0;
215 void set_name(std::string name) { name_ = name; }
// NOTE(review): fragmentary view of classical::process — several original
// lines are not visible here.
// Resolve the master (host) executor so row_ptrs can be read from the CPU.
237 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
239 const bool is_mtx_on_host{host_mtx_exec ==
241 const index_type* row_ptrs{};
// Copy row pointers to the host if the matrix lives on a device.
242 if (is_mtx_on_host) {
245 row_ptrs_host = mtx_row_ptrs;
// CSR row_ptrs has num_rows + 1 entries, hence the "- 1".
248 auto num_rows = mtx_row_ptrs.
get_size() - 1;
// Track the longest row; row i spans [row_ptrs[i], row_ptrs[i + 1]).
249 max_length_per_row_ = 0;
250 for (
size_type i = 0; i < num_rows; i++) {
251 max_length_per_row_ = std::max(max_length_per_row_,
252 row_ptrs[i + 1] - row_ptrs[i]);
256 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Returns the longest row length found by the most recent process() call.
258 index_type get_max_length_per_row() const noexcept
260 return max_length_per_row_;
// Virtual copy: fresh classical strategy (max_length_per_row_ not carried).
263 std::shared_ptr<strategy_type>
copy()
override
265 return std::make_shared<classical>();
// Cached maximum number of stored entries in any single row.
269 index_type max_length_per_row_;
288 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Virtual copy returning a fresh merge_path strategy.
290 std::shared_ptr<strategy_type>
copy()
override
292 return std::make_shared<merge_path>();
313 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Virtual copy returning a fresh cusparse strategy.
315 std::shared_ptr<strategy_type>
copy()
override
317 return std::make_shared<cusparse>();
337 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Virtual copy returning a fresh sparselib strategy.
339 std::shared_ptr<strategy_type>
copy()
override
341 return std::make_shared<sparselib>();
// NOTE(review): fragmentary view of the load_balance constructors.
// Delegating ctor (CUDA): take warp count and warp size from the device.
367 :
load_balance(exec->get_num_warps(), exec->get_warp_size())
// Delegating ctor (HIP): same, but with cuda_strategy disabled.
376 :
load_balance(exec->get_num_warps(), exec->get_warp_size(), false)
// Delegating ctor (DPC++): subgroup count, fixed size 32, tagged "intel".
387 :
load_balance(exec->get_num_subgroups(), 32, false,
"intel")
// Primary ctor parameters with their defaults, then member initializers.
402 bool cuda_strategy =
true,
403 std::string strategy_name =
"none")
406 warp_size_(warp_size),
407 cuda_strategy_(cuda_strategy),
408 strategy_name_(strategy_name)
// NOTE(review): fragmentary view of load_balance::process.
// Resolve host executors so srow/row_ptrs can be manipulated from the CPU.
417 auto host_srow_exec = mtx_srow->
get_executor()->get_master();
418 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
419 const bool is_srow_on_host{host_srow_exec ==
421 const bool is_mtx_on_host{host_mtx_exec ==
425 const index_type* row_ptrs{};
// Copy device-resident data to host-side working arrays.
427 if (is_srow_on_host) {
430 srow_host = *mtx_srow;
433 if (is_mtx_on_host) {
436 row_ptrs_host = mtx_row_ptrs;
442 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
443 const auto num_elems = row_ptrs[num_rows];
// Divider of 1 for an empty matrix avoids division by zero below.
444 const auto bucket_divider =
445 num_elems > 0 ?
ceildiv(num_elems, warp_size_) : 1;
446 for (
size_type i = 0; i < num_rows; i++) {
450 if (bucket < nwarps) {
// Prefix-sum over the bucket counters yields per-warp start offsets.
456 srow[i] += srow[i - 1];
// Write the host copy back if srow originally lived on a device.
458 if (!is_srow_on_host) {
459 *mtx_srow = srow_host;
// NOTE(review): fragmentary view of load_balance::clac_size — the nnz
// thresholds select a multiplier; the branch bodies are not visible here.
466 if (warp_size_ > 0) {
468 if (nnz >= static_cast<int64_t>(2e8)) {
470 }
else if (nnz >= static_cast<int64_t>(2e7)) {
472 }
else if (nnz >= static_cast<int64_t>(2e6)) {
474 }
else if (nnz >= static_cast<int64_t>(2e5)) {
// Intel (DPC++) devices use their own, coarser thresholds.
477 if (strategy_name_ ==
"intel") {
479 if (nnz >= static_cast<int64_t>(2e8)) {
481 }
else if (nnz >= static_cast<int64_t>(2e7)) {
// AMD (HIP/HCC) builds override the thresholds when not in CUDA mode.
485 #if GINKGO_HIP_PLATFORM_HCC
486 if (!cuda_strategy_) {
488 if (nnz >= static_cast<int64_t>(1e7)) {
490 }
else if (nnz >= static_cast<int64_t>(1e6)) {
494 #endif // GINKGO_HIP_PLATFORM_HCC
// Resulting srow size scales the warp count by the chosen multiplier.
496 auto nwarps = nwarps_ * multiple;
// Virtual copy preserving the full load_balance configuration.
503 std::shared_ptr<strategy_type>
copy()
override
505 return std::make_shared<load_balance>(
506 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
// Backend tag chosen at construction ("none" or "intel").
513 std::string strategy_name_;
// Per-vendor heuristics used by automatical to pick a concrete strategy:
// *_row_len_limit — longest row above which load_balance is preferred;
// *_nnz_limit — total nonzero count above which load_balance is preferred.
520 const index_type nvidia_row_len_limit = 1024;
523 const index_type nvidia_nnz_limit{static_cast<index_type>(1e6)};
526 const index_type amd_row_len_limit = 768;
529 const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
532 const index_type intel_row_len_limit = 25600;
535 const index_type intel_nnz_limit{static_cast<index_type>(3e8)};
// NOTE(review): fragmentary view of the automatical constructors; they
// mirror the load_balance constructors for each executor type.
// Delegating ctor (CUDA): warp count and warp size from the device.
555 :
automatical(exec->get_num_warps(), exec->get_warp_size())
// Delegating ctor (HIP): cuda_strategy disabled.
564 :
automatical(exec->get_num_warps(), exec->get_warp_size(), false)
// Delegating ctor (DPC++): subgroup count, fixed size 32, tagged "intel".
575 :
automatical(exec->get_num_subgroups(), 32, false,
"intel")
// Primary ctor parameters with defaults, then member initializers.
590 bool cuda_strategy =
true,
591 std::string strategy_name =
"none")
594 warp_size_(warp_size),
595 cuda_strategy_(cuda_strategy),
596 strategy_name_(strategy_name),
597 max_length_per_row_(0)
// NOTE(review): fragmentary view of automatical::process. It selects a
// concrete strategy (load_balance vs. classical) from the vendor limits,
// delegates processing to it, and adopts its name.
// Start from the NVIDIA limits; other vendors override them below.
606 index_type nnz_limit = nvidia_nnz_limit;
607 index_type row_len_limit = nvidia_row_len_limit;
608 if (strategy_name_ ==
"intel") {
609 nnz_limit = intel_nnz_limit;
610 row_len_limit = intel_row_len_limit;
612 #if GINKGO_HIP_PLATFORM_HCC
613 if (!cuda_strategy_) {
614 nnz_limit = amd_nnz_limit;
615 row_len_limit = amd_row_len_limit;
617 #endif // GINKGO_HIP_PLATFORM_HCC
// Resolve the host executor so row_ptrs can be read from the CPU.
618 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
619 const bool is_mtx_on_host{host_mtx_exec ==
622 const index_type* row_ptrs{};
623 if (is_mtx_on_host) {
626 row_ptrs_host = mtx_row_ptrs;
629 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
// Case 1: too many nonzeros overall — delegate to load_balance.
630 if (row_ptrs[num_rows] > nnz_limit) {
632 cuda_strategy_, strategy_name_);
633 if (is_mtx_on_host) {
634 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
636 actual_strategy.
process(row_ptrs_host, mtx_srow);
638 this->set_name(actual_strategy.
get_name());
// Otherwise scan for the longest row to decide between strategies.
640 index_type maxnum = 0;
641 for (
size_type i = 0; i < num_rows; i++) {
642 maxnum = std::max(maxnum, row_ptrs[i + 1] - row_ptrs[i]);
// Case 2: one row is too long — also delegate to load_balance.
644 if (maxnum > row_len_limit) {
646 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
647 if (is_mtx_on_host) {
648 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
650 actual_strategy.
process(row_ptrs_host, mtx_srow);
652 this->set_name(actual_strategy.
get_name());
// Case 3: small/regular matrix — delegate to the classical strategy and
// cache its computed maximum row length.
655 if (is_mtx_on_host) {
656 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
657 max_length_per_row_ =
658 actual_strategy.get_max_length_per_row();
660 actual_strategy.
process(row_ptrs_host, mtx_srow);
661 max_length_per_row_ =
662 actual_strategy.get_max_length_per_row();
664 this->set_name(actual_strategy.
get_name());
// NOTE(review): fragmentary clac_size — sizes srow via an equally configured
// load_balance strategy (the call on the temporary is truncated here).
671 return std::make_shared<load_balance>(
672 nwarps_, warp_size_, cuda_strategy_, strategy_name_)
// Longest row length cached by the last process() call (0 before any call).
676 index_type get_max_length_per_row() const noexcept
678 return max_length_per_row_;
// Virtual copy preserving the full automatical configuration.
681 std::shared_ptr<strategy_type>
copy()
override
683 return std::make_shared<automatical>(
684 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
// Backend tag ("none" or "intel") and cached longest row length.
691 std::string strategy_name_;
692 index_type max_length_per_row_;
// NOTE(review): fragmentary view of Csr's member declarations.
// Cross-precision friendship so precision conversions can access internals.
695 friend class Csr<previous_precision<ValueType>, IndexType>;
702 #if GINKGO_ENABLE_HALF
703 friend class Csr<previous_precision<previous_precision<ValueType>>,
711 result)
const override;
// ReadableFromMatrixData: populate this matrix from host/device data.
745 void read(
const mat_data& data)
override;
747 void read(
const device_mat_data& data)
override;
// Rvalue overload: may take ownership of the device data's arrays.
749 void read(device_mat_data&& data)
override;
// WritableToMatrixData: export this matrix into host matrix data.
751 void write(mat_data& data)
const override;
753 std::unique_ptr<LinOp>
transpose()
const override;
// `invert = false` defaults on the permutation-related overloads below.
791 bool invert =
false)
const;
823 bool invert =
false)
const;
825 std::unique_ptr<LinOp>
permute(
860 bool is_sorted_by_column_index()
const;
// set_strategy stores a private copy so external mutation can't affect us.
974 strategy_ = std::move(strategy->copy());
// Scaling factors must be 1x1 (scalar) operators.
987 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
1000 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
// NOTE(review): fragmentary view of the Csr factory functions.
// create(exec, strategy): empty matrix with an explicit strategy.
1012 static std::unique_ptr<Csr>
create(std::shared_ptr<const Executor> exec,
1013 std::shared_ptr<strategy_type> strategy);
// create(exec, size, strategy): uninitialized matrix of the given size;
// a null strategy presumably selects the executor default — TODO confirm.
1026 static std::unique_ptr<Csr>
create(
1027 std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1029 std::shared_ptr<strategy_type> strategy =
nullptr);
// create(exec, size, values, col_idxs, row_ptrs): adopt existing arrays.
1050 static std::unique_ptr<Csr>
create(
1051 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1052 array<value_type> values, array<index_type> col_idxs,
1053 array<index_type> row_ptrs,
1054 std::shared_ptr<strategy_type> strategy =
nullptr);
// Deprecated initializer_list overload — see the deprecation message below.
1060 template <
typename InputValueType,
typename InputColumnIndexType,
1061 typename InputRowPtrType>
1063 "explicitly construct the gko::array argument instead of passing "
1064 "initializer lists")
1066 std::shared_ptr<const
Executor> exec, const
dim<2>& size,
1067 std::initializer_list<InputValueType> values,
1068 std::initializer_list<InputColumnIndexType> col_idxs,
1069 std::initializer_list<InputRowPtrType> row_ptrs)
// const view overload: wraps existing data without taking ownership.
1092 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1093 gko::detail::const_array_view<ValueType>&& values,
1094 gko::detail::const_array_view<IndexType>&& col_idxs,
1095 gko::detail::const_array_view<IndexType>&& row_ptrs,
1096 std::shared_ptr<strategy_type> strategy =
nullptr);
// Submatrix extraction by row/column spans.
1126 const span& row_span,
const span& column_span)
const;
// Protected constructors backing the create() factories above.
1153 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1155 std::shared_ptr<strategy_type> strategy =
nullptr);
1157 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size,
1158 array<value_type> values, array<index_type> col_idxs,
1159 array<index_type> row_ptrs,
1160 std::shared_ptr<strategy_type> strategy =
nullptr);
// LinOp hooks: x = A * b.
1162 void apply_impl(
const LinOp* b,
LinOp* x)
const override;
// LinOp hooks: x = alpha * A * b + beta * x.
1164 void apply_impl(
const LinOp* alpha,
const LinOp* b,
const LinOp* beta,
1165 LinOp* x)
const override;
// Picks a default strategy for the given executor: `automatical` for CUDA,
// HIP, and DPC++ executors, `classical` otherwise (host/reference).
// NOTE(review): fragmentary — the opening brace and first `if` are elided.
1168 static std::shared_ptr<strategy_type> make_default_strategy(
1169 std::shared_ptr<const Executor> exec)
1171 auto cuda_exec = std::dynamic_pointer_cast<const CudaExecutor>(exec);
1172 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec);
1173 auto dpcpp_exec = std::dynamic_pointer_cast<const DpcppExecutor>(exec);
1174 std::shared_ptr<strategy_type> new_strategy;
1176 new_strategy = std::make_shared<automatical>(cuda_exec);
1177 }
else if (hip_exec) {
1178 new_strategy = std::make_shared<automatical>(hip_exec);
1179 }
else if (dpcpp_exec) {
1180 new_strategy = std::make_shared<automatical>(dpcpp_exec);
// Fallback (non-device executors): classical strategy.
1182 new_strategy = std::make_shared<classical>();
1184 return new_strategy;
// Translates this matrix's strategy into the equivalent strategy type of a
// target CsrType (possibly different precision), preserving executor-specific
// configuration where possible.
// NOTE(review): fragmentary — many branch bodies and conditions are elided.
1188 template <
typename CsrType>
1189 void convert_strategy_helper(CsrType* result)
const
1192 std::shared_ptr<typename CsrType::strategy_type> new_strat;
// Simple strategies map one-to-one by dynamic type.
1193 if (dynamic_cast<classical*>(strat)) {
1194 new_strat = std::make_shared<typename CsrType::classical>();
1195 }
else if (dynamic_cast<merge_path*>(strat)) {
1196 new_strat = std::make_shared<typename CsrType::merge_path>();
1197 }
else if (dynamic_cast<cusparse*>(strat)) {
1198 new_strat = std::make_shared<typename CsrType::cusparse>();
1199 }
else if (dynamic_cast<sparselib*>(strat)) {
1200 new_strat = std::make_shared<typename CsrType::sparselib>();
// load_balance/automatical depend on the executor: rebuild from the
// result's executor when it is a device executor...
1202 auto rexec = result->get_executor();
1204 std::dynamic_pointer_cast<const CudaExecutor>(rexec);
1205 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(rexec);
1207 std::dynamic_pointer_cast<const DpcppExecutor>(rexec);
1208 auto lb = dynamic_cast<load_balance*>(strat);
1212 std::make_shared<typename CsrType::load_balance>(
1215 new_strat = std::make_shared<typename CsrType::automatical>(
1218 }
else if (hip_exec) {
1221 std::make_shared<typename CsrType::load_balance>(
1224 new_strat = std::make_shared<typename CsrType::automatical>(
1227 }
else if (dpcpp_exec) {
1230 std::make_shared<typename CsrType::load_balance>(
1233 new_strat = std::make_shared<typename CsrType::automatical>(
// ...otherwise fall back to this matrix's own executor to keep the
// original device configuration.
1238 auto this_cuda_exec =
1239 std::dynamic_pointer_cast<const CudaExecutor>(
1241 auto this_hip_exec =
1242 std::dynamic_pointer_cast<const HipExecutor>(
1244 auto this_dpcpp_exec =
1245 std::dynamic_pointer_cast<const DpcppExecutor>(
1247 if (this_cuda_exec) {
1250 std::make_shared<typename CsrType::load_balance>(
1254 std::make_shared<typename CsrType::automatical>(
1257 }
else if (this_hip_exec) {
1260 std::make_shared<typename CsrType::load_balance>(
1264 std::make_shared<typename CsrType::automatical>(
1267 }
else if (this_dpcpp_exec) {
1270 std::make_shared<typename CsrType::load_balance>(
1274 std::make_shared<typename CsrType::automatical>(
// Last resort: no device executor on either side — use classical.
1282 new_strat = std::make_shared<typename CsrType::classical>();
1286 result->set_strategy(new_strat);
// Rebuild srow from the current row pointers using the active strategy.
1295 strategy_->process(row_ptrs_, &srow_);
// Scaling hooks: A = diag(alpha) variants; alpha is asserted 1x1 elsewhere.
1304 virtual void scale_impl(
const LinOp* alpha);
1312 virtual void inv_scale_impl(
const LinOp* alpha);
// CSR storage: strategy + the three classic arrays, plus srow, which holds
// per-warp starting rows for the load_balance/automatical strategies.
1315 std::shared_ptr<strategy_type> strategy_;
1316 array<value_type> values_;
1317 array<index_type> col_idxs_;
1318 array<index_type> row_ptrs_;
1319 array<index_type> srow_;
// A = a * I + b * A hook (ScaledIdentityAddable interface).
1321 void add_scaled_identity_impl(
const LinOp* a,
const LinOp* b)
override;
// Rebuilds executor-bound strategies (load_balance/automatical) on the
// result's own executor after a copy/move across executors; other strategy
// types are left untouched.
// NOTE(review): fragmentary — some conditions and closing braces are elided.
1334 template <
typename ValueType,
typename IndexType>
1335 void strategy_rebuild_helper(Csr<ValueType, IndexType>* result)
1337 using load_balance =
typename Csr<ValueType, IndexType>::load_balance;
1338 using automatical =
typename Csr<ValueType, IndexType>::automatical;
1339 auto strategy = result->get_strategy();
1340 auto executor = result->get_executor();
// load_balance: re-create from the HIP or CUDA executor.
1341 if (std::dynamic_pointer_cast<load_balance>(strategy)) {
1343 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1344 result->set_strategy(std::make_shared<load_balance>(exec));
1345 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1347 result->set_strategy(std::make_shared<load_balance>(exec));
1349 }
// automatical: same rebuild, HIP first, then CUDA.
else if (std::dynamic_pointer_cast<automatical>(strategy)) {
1351 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1352 result->set_strategy(std::make_shared<automatical>(exec));
1353 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1355 result->set_strategy(std::make_shared<automatical>(exec));
1366 #endif // GKO_PUBLIC_CORE_MATRIX_CSR_HPP_