5 #ifndef GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
6 #define GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
#include <utility>

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/index_set.hpp>
#include <ginkgo/core/base/lin_op.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/permutation.hpp>
#include <ginkgo/core/matrix/scaled_permutation.hpp>
21 template <
typename ValueType>
24 template <
typename ValueType>
27 template <
typename ValueType,
typename IndexType>
30 template <
typename ValueType,
typename IndexType>
33 template <
typename ValueType,
typename IndexType>
36 template <
typename ValueType,
typename IndexType>
39 template <
typename ValueType,
typename IndexType>
42 template <
typename ValueType,
typename IndexType>
45 template <
typename ValueType,
typename IndexType>
48 template <
typename ValueType,
typename IndexType>
51 template <
typename IndexType>
58 template <
typename ValueType = default_precision,
typename IndexType =
int32>
103 template <
typename ValueType = default_precision,
typename IndexType =
int32>
105 public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
106 #if GINKGO_ENABLE_HALF || GINKGO_ENABLE_BFLOAT16
107 public ConvertibleTo<Csr<next_precision<ValueType, 2>, IndexType>>,
109 #if GINKGO_ENABLE_HALF && GINKGO_ENABLE_BFLOAT16
110 public ConvertibleTo<Csr<next_precision<ValueType, 3>, IndexType>>,
125 remove_complex<Csr<ValueType, IndexType>>>,
128 friend class Coo<ValueType, IndexType>;
129 friend class Dense<ValueType>;
131 friend class Ell<ValueType, IndexType>;
132 friend class Hybrid<ValueType, IndexType>;
133 friend class Sellp<ValueType, IndexType>;
135 friend class Fbcsr<ValueType, IndexType>;
136 friend class CsrBuilder<ValueType, IndexType>;
160 using value_type = ValueType;
161 using index_type = IndexType;
/**
 * Computes the number of auxiliary (srow) elements this strategy needs
 * for a matrix with the given number of stored nonzeros.
 *
 * NOTE(review): "clac_size" looks like a typo of "calc_size", but it is
 * part of the public virtual interface, so renaming would break every
 * derived strategy and caller.
 *
 * @param nnz  number of stored nonzeros in the matrix
 * @return the required auxiliary storage size
 */
211 virtual int64_t
clac_size(
const int64_t nnz) = 0;
/**
 * Creates a polymorphic copy of this strategy object.
 *
 * @return a newly allocated copy of the concrete strategy
 */
217 virtual std::shared_ptr<strategy_type>
copy() = 0;
220 void set_name(std::string name) { name_ = name; }
242 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
244 const bool is_mtx_on_host{host_mtx_exec ==
246 const index_type* row_ptrs{};
247 if (is_mtx_on_host) {
250 row_ptrs_host = mtx_row_ptrs;
253 auto num_rows = mtx_row_ptrs.
get_size() - 1;
254 max_length_per_row_ = 0;
255 for (
size_type i = 0; i < num_rows; i++) {
256 max_length_per_row_ = std::max(max_length_per_row_,
257 row_ptrs[i + 1] - row_ptrs[i]);
// The classical strategy assigns (sub)warps directly per row and needs
// no auxiliary srow storage, so the required size is always zero.
261 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
263 index_type get_max_length_per_row() const noexcept
265 return max_length_per_row_;
268 std::shared_ptr<strategy_type>
copy()
override
270 return std::make_shared<classical>();
274 index_type max_length_per_row_;
// merge_path partitions work at kernel launch time and needs no
// auxiliary srow storage, so the required size is always zero.
293 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
295 std::shared_ptr<strategy_type>
copy()
override
297 return std::make_shared<merge_path>();
// The vendor (cuSPARSE) library manages its own workspace, so no
// auxiliary srow storage is requested here.
318 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
320 std::shared_ptr<strategy_type>
copy()
override
322 return std::make_shared<cusparse>();
// The vendor sparse library manages its own workspace, so no auxiliary
// srow storage is requested here.
342 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
344 std::shared_ptr<strategy_type>
copy()
override
346 return std::make_shared<sparselib>();
372 :
load_balance(exec->get_num_warps(), exec->get_warp_size())
381 :
load_balance(exec->get_num_warps(), exec->get_warp_size(), false)
392 :
load_balance(exec->get_num_subgroups(), 32, false,
"intel")
407 bool cuda_strategy =
true,
408 std::string strategy_name =
"none")
411 warp_size_(warp_size),
412 cuda_strategy_(cuda_strategy),
413 strategy_name_(strategy_name)
422 auto host_srow_exec = mtx_srow->
get_executor()->get_master();
423 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
424 const bool is_srow_on_host{host_srow_exec ==
426 const bool is_mtx_on_host{host_mtx_exec ==
430 const index_type* row_ptrs{};
432 if (is_srow_on_host) {
435 srow_host = *mtx_srow;
438 if (is_mtx_on_host) {
441 row_ptrs_host = mtx_row_ptrs;
447 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
448 const auto num_elems = row_ptrs[num_rows];
449 const auto bucket_divider =
450 num_elems > 0 ?
ceildiv(num_elems, warp_size_) : 1;
451 for (
size_type i = 0; i < num_rows; i++) {
455 if (bucket < nwarps) {
461 srow[i] += srow[i - 1];
463 if (!is_srow_on_host) {
464 *mtx_srow = srow_host;
471 if (warp_size_ > 0) {
473 if (nnz >= static_cast<int64_t>(2e8)) {
475 }
else if (nnz >= static_cast<int64_t>(2e7)) {
477 }
else if (nnz >= static_cast<int64_t>(2e6)) {
479 }
else if (nnz >= static_cast<int64_t>(2e5)) {
482 if (strategy_name_ ==
"intel") {
484 if (nnz >= static_cast<int64_t>(2e8)) {
486 }
else if (nnz >= static_cast<int64_t>(2e7)) {
490 #if GINKGO_HIP_PLATFORM_HCC
491 if (!cuda_strategy_) {
493 if (nnz >= static_cast<int64_t>(1e7)) {
495 }
else if (nnz >= static_cast<int64_t>(1e6)) {
499 #endif // GINKGO_HIP_PLATFORM_HCC
501 auto nwarps = nwarps_ * multiple;
508 std::shared_ptr<strategy_type>
copy()
override
510 return std::make_shared<load_balance>(
511 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
518 std::string strategy_name_;
// Tuning thresholds used by the automatical strategy to pick between
// classical and load_balance, per device vendor. Rows longer than the
// row-length limit, or matrices with more nonzeros than the nnz limit,
// are routed to load_balance.
// Maximum row length before switching strategies on NVIDIA GPUs.
525 const index_type nvidia_row_len_limit = 1024;
// Maximum total nonzero count before switching strategies on NVIDIA GPUs.
528 const index_type nvidia_nnz_limit{static_cast<index_type>(1e6)};
// Maximum row length before switching strategies on AMD GPUs.
531 const index_type amd_row_len_limit = 768;
// Maximum total nonzero count before switching strategies on AMD GPUs.
534 const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
// Maximum row length before switching strategies on Intel devices.
537 const index_type intel_row_len_limit = 25600;
// Maximum total nonzero count before switching strategies on Intel devices.
540 const index_type intel_nnz_limit{static_cast<index_type>(3e8)};
560 :
automatical(exec->get_num_warps(), exec->get_warp_size())
569 :
automatical(exec->get_num_warps(), exec->get_warp_size(), false)
580 :
automatical(exec->get_num_subgroups(), 32, false,
"intel")
595 bool cuda_strategy =
true,
596 std::string strategy_name =
"none")
599 warp_size_(warp_size),
600 cuda_strategy_(cuda_strategy),
601 strategy_name_(strategy_name),
602 max_length_per_row_(0)
611 index_type nnz_limit = nvidia_nnz_limit;
612 index_type row_len_limit = nvidia_row_len_limit;
613 if (strategy_name_ ==
"intel") {
614 nnz_limit = intel_nnz_limit;
615 row_len_limit = intel_row_len_limit;
617 #if GINKGO_HIP_PLATFORM_HCC
618 if (!cuda_strategy_) {
619 nnz_limit = amd_nnz_limit;
620 row_len_limit = amd_row_len_limit;
622 #endif // GINKGO_HIP_PLATFORM_HCC
623 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
624 const bool is_mtx_on_host{host_mtx_exec ==
627 const index_type* row_ptrs{};
628 if (is_mtx_on_host) {
631 row_ptrs_host = mtx_row_ptrs;
634 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
635 if (row_ptrs[num_rows] > nnz_limit) {
637 cuda_strategy_, strategy_name_);
638 if (is_mtx_on_host) {
639 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
641 actual_strategy.
process(row_ptrs_host, mtx_srow);
643 this->set_name(actual_strategy.
get_name());
645 index_type maxnum = 0;
646 for (
size_type i = 0; i < num_rows; i++) {
647 maxnum = std::max(maxnum, row_ptrs[i + 1] - row_ptrs[i]);
649 if (maxnum > row_len_limit) {
651 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
652 if (is_mtx_on_host) {
653 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
655 actual_strategy.
process(row_ptrs_host, mtx_srow);
657 this->set_name(actual_strategy.
get_name());
660 if (is_mtx_on_host) {
661 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
662 max_length_per_row_ =
663 actual_strategy.get_max_length_per_row();
665 actual_strategy.
process(row_ptrs_host, mtx_srow);
666 max_length_per_row_ =
667 actual_strategy.get_max_length_per_row();
669 this->set_name(actual_strategy.
get_name());
676 return std::make_shared<load_balance>(
677 nwarps_, warp_size_, cuda_strategy_, strategy_name_)
681 index_type get_max_length_per_row() const noexcept
683 return max_length_per_row_;
686 std::shared_ptr<strategy_type>
copy()
override
688 return std::make_shared<automatical>(
689 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
696 std::string strategy_name_;
697 index_type max_length_per_row_;
707 #if GINKGO_ENABLE_HALF || GINKGO_ENABLE_BFLOAT16
719 #if GINKGO_ENABLE_HALF && GINKGO_ENABLE_BFLOAT16
759 void read(
const mat_data& data)
override;
761 void read(
const device_mat_data& data)
override;
763 void read(device_mat_data&& data)
override;
765 void write(mat_data& data)
const override;
767 std::unique_ptr<LinOp>
transpose()
const override;
793 std::unique_ptr<Permutation<IndexType>> value_permutation;
844 bool invert =
false)
const;
891 bool invert =
false)
const;
923 bool invert =
false)
const;
925 std::unique_ptr<LinOp>
permute(
960 bool is_sorted_by_column_index()
const;
1086 strategy_ = std::move(strategy->copy());
1099 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
1112 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
1124 static std::unique_ptr<Csr>
create(std::shared_ptr<const Executor> exec,
1125 std::shared_ptr<strategy_type> strategy);
1138 static std::unique_ptr<Csr>
create(
1139 std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1141 std::shared_ptr<strategy_type> strategy =
nullptr);
1162 static std::unique_ptr<Csr>
create(
1163 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1164 array<value_type> values, array<index_type> col_idxs,
1165 array<index_type> row_ptrs,
1166 std::shared_ptr<strategy_type> strategy =
nullptr);
1172 template <
typename InputValueType,
typename InputColumnIndexType,
1173 typename InputRowPtrType>
1175 "explicitly construct the gko::array argument instead of passing "
1176 "initializer lists")
1178 std::shared_ptr<const
Executor> exec, const
dim<2>& size,
1179 std::initializer_list<InputValueType> values,
1180 std::initializer_list<InputColumnIndexType> col_idxs,
1181 std::initializer_list<InputRowPtrType> row_ptrs)
1204 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1205 gko::detail::const_array_view<ValueType>&& values,
1206 gko::detail::const_array_view<IndexType>&& col_idxs,
1207 gko::detail::const_array_view<IndexType>&& row_ptrs,
1208 std::shared_ptr<strategy_type> strategy =
nullptr);
1238 const span& row_span,
const span& column_span)
const;
1265 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1267 std::shared_ptr<strategy_type> strategy =
nullptr);
1269 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size,
1270 array<value_type> values, array<index_type> col_idxs,
1271 array<index_type> row_ptrs,
1272 std::shared_ptr<strategy_type> strategy =
nullptr);
1274 void apply_impl(
const LinOp* b,
LinOp* x)
const override;
1276 void apply_impl(
const LinOp* alpha,
const LinOp* b,
const LinOp* beta,
1277 LinOp* x)
const override;
1280 static std::shared_ptr<strategy_type> make_default_strategy(
1281 std::shared_ptr<const Executor> exec)
1283 auto cuda_exec = std::dynamic_pointer_cast<const CudaExecutor>(exec);
1284 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec);
1285 auto dpcpp_exec = std::dynamic_pointer_cast<const DpcppExecutor>(exec);
1286 std::shared_ptr<strategy_type> new_strategy;
1288 new_strategy = std::make_shared<automatical>(cuda_exec);
1289 }
else if (hip_exec) {
1290 new_strategy = std::make_shared<automatical>(hip_exec);
1291 }
else if (dpcpp_exec) {
1292 new_strategy = std::make_shared<automatical>(dpcpp_exec);
1294 new_strategy = std::make_shared<classical>();
1296 return new_strategy;
1300 template <
typename CsrType>
1301 void convert_strategy_helper(CsrType* result)
const
1304 std::shared_ptr<typename CsrType::strategy_type> new_strat;
1305 if (dynamic_cast<classical*>(strat)) {
1306 new_strat = std::make_shared<typename CsrType::classical>();
1307 }
else if (dynamic_cast<merge_path*>(strat)) {
1308 new_strat = std::make_shared<typename CsrType::merge_path>();
1309 }
else if (dynamic_cast<cusparse*>(strat)) {
1310 new_strat = std::make_shared<typename CsrType::cusparse>();
1311 }
else if (dynamic_cast<sparselib*>(strat)) {
1312 new_strat = std::make_shared<typename CsrType::sparselib>();
1314 auto rexec = result->get_executor();
1316 std::dynamic_pointer_cast<const CudaExecutor>(rexec);
1317 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(rexec);
1319 std::dynamic_pointer_cast<const DpcppExecutor>(rexec);
1320 auto lb = dynamic_cast<load_balance*>(strat);
1324 std::make_shared<typename CsrType::load_balance>(
1327 new_strat = std::make_shared<typename CsrType::automatical>(
1330 }
else if (hip_exec) {
1333 std::make_shared<typename CsrType::load_balance>(
1336 new_strat = std::make_shared<typename CsrType::automatical>(
1339 }
else if (dpcpp_exec) {
1342 std::make_shared<typename CsrType::load_balance>(
1345 new_strat = std::make_shared<typename CsrType::automatical>(
1350 auto this_cuda_exec =
1351 std::dynamic_pointer_cast<const CudaExecutor>(
1353 auto this_hip_exec =
1354 std::dynamic_pointer_cast<const HipExecutor>(
1356 auto this_dpcpp_exec =
1357 std::dynamic_pointer_cast<const DpcppExecutor>(
1359 if (this_cuda_exec) {
1362 std::make_shared<typename CsrType::load_balance>(
1366 std::make_shared<typename CsrType::automatical>(
1369 }
else if (this_hip_exec) {
1372 std::make_shared<typename CsrType::load_balance>(
1376 std::make_shared<typename CsrType::automatical>(
1379 }
else if (this_dpcpp_exec) {
1382 std::make_shared<typename CsrType::load_balance>(
1386 std::make_shared<typename CsrType::automatical>(
1394 new_strat = std::make_shared<typename CsrType::classical>();
1398 result->set_strategy(new_strat);
1407 strategy_->process(row_ptrs_, &srow_);
1416 virtual void scale_impl(
const LinOp* alpha);
1424 virtual void inv_scale_impl(
const LinOp* alpha);
// Strategy object that builds/uses the srow_ array (processed via
// strategy_->process(row_ptrs_, &srow_) elsewhere in this class).
1427 std::shared_ptr<strategy_type> strategy_;
// Nonzero values of the CSR matrix, one entry per stored element.
1428 array<value_type> values_;
// Column index of each stored element, parallel to values_.
1429 array<index_type> col_idxs_;
// Row pointer array; row_ptrs_[i+1] - row_ptrs_[i] is the length of row
// i (see the loops over row_ptrs in the strategy classes above).
1430 array<index_type> row_ptrs_;
// Auxiliary array filled by the strategy — presumably per-warp starting
// rows for the load-balanced SpMV kernels; confirm against the kernels.
1431 array<index_type> srow_;
1433 void add_scaled_identity_impl(
const LinOp* a,
const LinOp* b)
override;
1446 template <
typename ValueType,
typename IndexType>
1447 void strategy_rebuild_helper(Csr<ValueType, IndexType>* result)
1449 using load_balance =
typename Csr<ValueType, IndexType>::load_balance;
1450 using automatical =
typename Csr<ValueType, IndexType>::automatical;
1451 auto strategy = result->get_strategy();
1452 auto executor = result->get_executor();
1453 if (std::dynamic_pointer_cast<load_balance>(strategy)) {
1455 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1456 result->set_strategy(std::make_shared<load_balance>(exec));
1457 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1459 result->set_strategy(std::make_shared<load_balance>(exec));
1461 }
else if (std::dynamic_pointer_cast<automatical>(strategy)) {
1463 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1464 result->set_strategy(std::make_shared<automatical>(exec));
1465 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1467 result->set_strategy(std::make_shared<automatical>(exec));
1478 #endif // GKO_PUBLIC_CORE_MATRIX_CSR_HPP_