5 #ifndef GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
6 #define GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
#include <string>
#include <utility>

9 #include <ginkgo/core/base/array.hpp>
10 #include <ginkgo/core/base/index_set.hpp>
11 #include <ginkgo/core/base/lin_op.hpp>
12 #include <ginkgo/core/base/math.hpp>
13 #include <ginkgo/core/matrix/permutation.hpp>
14 #include <ginkgo/core/matrix/scaled_permutation.hpp>
21 template <
typename ValueType>
24 template <
typename ValueType>
27 template <
typename ValueType,
typename IndexType>
30 template <
typename ValueType,
typename IndexType>
33 template <
typename ValueType,
typename IndexType>
36 template <
typename ValueType,
typename IndexType>
39 template <
typename ValueType,
typename IndexType>
42 template <
typename ValueType,
typename IndexType>
45 template <
typename ValueType,
typename IndexType>
48 template <
typename ValueType,
typename IndexType>
51 template <
typename IndexType>
58 template <
typename ValueType = default_precision,
typename IndexType =
int32>
103 template <
typename ValueType = default_precision,
typename IndexType =
int32>
105 public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
106 #if GINKGO_ENABLE_HALF
108 Csr<next_precision<next_precision<ValueType>>, IndexType>>,
123 remove_complex<Csr<ValueType, IndexType>>>,
126 friend class Coo<ValueType, IndexType>;
127 friend class Dense<ValueType>;
129 friend class Ell<ValueType, IndexType>;
130 friend class Hybrid<ValueType, IndexType>;
131 friend class Sellp<ValueType, IndexType>;
133 friend class Fbcsr<ValueType, IndexType>;
134 friend class CsrBuilder<ValueType, IndexType>;
158 using value_type = ValueType;
159 using index_type = IndexType;
// Computes how many elements of auxiliary storage this strategy needs for
// `nnz` stored entries; strategies that need no auxiliary array return 0.
// NOTE(review): `clac_size` (sic) looks like a long-standing typo kept in
// the public interface for backward compatibility — do not rename.
209 virtual int64_t
clac_size(
const int64_t nnz) = 0;
// Creates a fresh copy of this strategy object (configuration only; any
// state computed by process() is not carried over).
215 virtual std::shared_ptr<strategy_type>
copy() = 0;
218 void set_name(std::string name) { name_ = name; }
// Ensure the row pointers are readable on the host: if the matrix lives on
// a device executor, copy row_ptrs to the master (host) executor first.
240 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
242 const bool is_mtx_on_host{host_mtx_exec ==
244 const index_type* row_ptrs{};
245 if (is_mtx_on_host) {
248 row_ptrs_host = mtx_row_ptrs;
// row_ptrs holds num_rows + 1 offsets; adjacent differences are row lengths.
251 auto num_rows = mtx_row_ptrs.
get_size() - 1;
// Scan all rows and record the longest one; exposed afterwards via
// get_max_length_per_row().
252 max_length_per_row_ = 0;
253 for (
size_type i = 0; i < num_rows; i++) {
254 max_length_per_row_ = std::max(max_length_per_row_,
255 row_ptrs[i + 1] - row_ptrs[i]);
// classical needs no auxiliary (srow) storage, hence size 0.
259 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Returns the maximum number of stored entries in any row, as computed by
// the most recent process() call (0 if process() was never called).
261 index_type get_max_length_per_row() const noexcept
263 return max_length_per_row_;
// Fresh classical strategy; the cached max row length is not copied.
266 std::shared_ptr<strategy_type>
copy()
override
268 return std::make_shared<classical>();
// Longest row seen by process(); initialized there before the scan.
272 index_type max_length_per_row_;
// merge_path needs no auxiliary (srow) storage, hence size 0.
291 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Fresh merge_path strategy (stateless, so nothing to carry over).
293 std::shared_ptr<strategy_type>
copy()
override
295 return std::make_shared<merge_path>();
// NOTE(review): presumably delegates SpMV to the vendor library (cuSPARSE)
// — confirm; either way no auxiliary (srow) storage is needed, hence 0.
316 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Fresh cusparse strategy (stateless).
318 std::shared_ptr<strategy_type>
copy()
override
320 return std::make_shared<cusparse>();
// sparselib (vendor-library backend) needs no auxiliary storage, hence 0.
340 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
// Fresh sparselib strategy (stateless).
342 std::shared_ptr<strategy_type>
copy()
override
344 return std::make_shared<sparselib>();
370 :
load_balance(exec->get_num_warps(), exec->get_warp_size())
379 :
load_balance(exec->get_num_warps(), exec->get_warp_size(), false)
390 :
load_balance(exec->get_num_subgroups(), 32, false,
"intel")
405 bool cuda_strategy =
true,
406 std::string strategy_name =
"none")
409 warp_size_(warp_size),
410 cuda_strategy_(cuda_strategy),
411 strategy_name_(strategy_name)
// Build the srow array (per-warp starting-row offsets — NOTE(review):
// inferred from the bucket/prefix-sum logic below; confirm) on the host,
// copying srow and row_ptrs from the device if necessary.
420 auto host_srow_exec = mtx_srow->
get_executor()->get_master();
421 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
422 const bool is_srow_on_host{host_srow_exec ==
424 const bool is_mtx_on_host{host_mtx_exec ==
428 const index_type* row_ptrs{};
430 if (is_srow_on_host) {
433 srow_host = *mtx_srow;
436 if (is_mtx_on_host) {
439 row_ptrs_host = mtx_row_ptrs;
// Divide the nnz range into nwarps buckets of roughly
// ceil(num_elems / warp_size) entries; guard against num_elems == 0.
445 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
446 const auto num_elems = row_ptrs[num_rows];
447 const auto bucket_divider =
448 num_elems > 0 ?
ceildiv(num_elems, warp_size_) : 1;
449 for (
size_type i = 0; i < num_rows; i++) {
453 if (bucket < nwarps) {
// Prefix-sum so each srow entry becomes a cumulative row offset.
459 srow[i] += srow[i - 1];
// Copy the host-built srow back to the device if it lives there.
461 if (!is_srow_on_host) {
462 *mtx_srow = srow_host;
// clac_size: choose a warp multiple from order-of-magnitude nnz
// thresholds; the resulting srow size is based on nwarps_ * multiple.
469 if (warp_size_ > 0) {
471 if (nnz >= static_cast<int64_t>(2e8)) {
473 }
else if (nnz >= static_cast<int64_t>(2e7)) {
475 }
else if (nnz >= static_cast<int64_t>(2e6)) {
477 }
else if (nnz >= static_cast<int64_t>(2e5)) {
// Intel (DPC++) backend uses its own, coarser threshold table.
480 if (strategy_name_ ==
"intel") {
482 if (nnz >= static_cast<int64_t>(2e8)) {
484 }
else if (nnz >= static_cast<int64_t>(2e7)) {
// AMD (HIP/HCC) builds override the thresholds when not emulating CUDA.
488 #if GINKGO_HIP_PLATFORM_HCC
489 if (!cuda_strategy_) {
491 if (nnz >= static_cast<int64_t>(1e7)) {
493 }
else if (nnz >= static_cast<int64_t>(1e6)) {
497 #endif // GINKGO_HIP_PLATFORM_HCC
499 auto nwarps = nwarps_ * multiple;
// Fresh load_balance strategy with identical configuration.
506 std::shared_ptr<strategy_type>
copy()
override
508 return std::make_shared<load_balance>(
509 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
516 std::string strategy_name_;
523 const index_type nvidia_row_len_limit = 1024;
526 const index_type nvidia_nnz_limit{static_cast<index_type>(1e6)};
529 const index_type amd_row_len_limit = 768;
532 const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
535 const index_type intel_row_len_limit = 25600;
538 const index_type intel_nnz_limit{static_cast<index_type>(3e8)};
558 :
automatical(exec->get_num_warps(), exec->get_warp_size())
567 :
automatical(exec->get_num_warps(), exec->get_warp_size(), false)
578 :
automatical(exec->get_num_subgroups(), 32, false,
"intel")
593 bool cuda_strategy =
true,
594 std::string strategy_name =
"none")
597 warp_size_(warp_size),
598 cuda_strategy_(cuda_strategy),
599 strategy_name_(strategy_name),
600 max_length_per_row_(0)
// Pick backend-specific limits: NVIDIA defaults, overridden for Intel
// (by strategy name) and for AMD (HIP/HCC build, non-CUDA strategy).
609 index_type nnz_limit = nvidia_nnz_limit;
610 index_type row_len_limit = nvidia_row_len_limit;
611 if (strategy_name_ ==
"intel") {
612 nnz_limit = intel_nnz_limit;
613 row_len_limit = intel_row_len_limit;
615 #if GINKGO_HIP_PLATFORM_HCC
616 if (!cuda_strategy_) {
617 nnz_limit = amd_nnz_limit;
618 row_len_limit = amd_row_len_limit;
620 #endif // GINKGO_HIP_PLATFORM_HCC
// Make row_ptrs readable on the host (copy from device if needed).
621 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
622 const bool is_mtx_on_host{host_mtx_exec ==
625 const index_type* row_ptrs{};
626 if (is_mtx_on_host) {
629 row_ptrs_host = mtx_row_ptrs;
// Case 1: total nnz exceeds the limit — delegate to a heavier strategy
// (NOTE(review): the constructed type is elided here; presumably
// load_balance — confirm) and adopt its name.
632 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
633 if (row_ptrs[num_rows] > nnz_limit) {
635 cuda_strategy_, strategy_name_);
636 if (is_mtx_on_host) {
637 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
639 actual_strategy.
process(row_ptrs_host, mtx_srow);
641 this->set_name(actual_strategy.
get_name());
// Case 2: some row is longer than row_len_limit — same delegation.
643 index_type maxnum = 0;
644 for (
size_type i = 0; i < num_rows; i++) {
645 maxnum = std::max(maxnum, row_ptrs[i + 1] - row_ptrs[i]);
647 if (maxnum > row_len_limit) {
649 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
650 if (is_mtx_on_host) {
651 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
653 actual_strategy.
process(row_ptrs_host, mtx_srow);
655 this->set_name(actual_strategy.
get_name());
// Case 3: fall through to the lightweight strategy and cache its
// max-row-length result for get_max_length_per_row().
658 if (is_mtx_on_host) {
659 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
660 max_length_per_row_ =
661 actual_strategy.get_max_length_per_row();
663 actual_strategy.
process(row_ptrs_host, mtx_srow);
664 max_length_per_row_ =
665 actual_strategy.get_max_length_per_row();
667 this->set_name(actual_strategy.
get_name());
// clac_size delegates to an equivalently-configured load_balance strategy,
// since that is the worst case automatical may fall back to.
674 return std::make_shared<load_balance>(
675 nwarps_, warp_size_, cuda_strategy_, strategy_name_)
// Maximum row length cached by process() when it used the lightweight
// fallback path; 0 otherwise (set in the constructor).
679 index_type get_max_length_per_row() const noexcept
681 return max_length_per_row_;
// Fresh automatical strategy with identical configuration (no cached state).
684 std::shared_ptr<strategy_type>
copy()
override
686 return std::make_shared<automatical>(
687 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
694 std::string strategy_name_;
695 index_type max_length_per_row_;
698 friend class Csr<previous_precision<ValueType>, IndexType>;
705 #if GINKGO_ENABLE_HALF
706 friend class Csr<previous_precision<previous_precision<ValueType>>,
714 result)
const override;
748 void read(
const mat_data& data)
override;
750 void read(
const device_mat_data& data)
override;
752 void read(device_mat_data&& data)
override;
754 void write(mat_data& data)
const override;
756 std::unique_ptr<LinOp>
transpose()
const override;
782 std::unique_ptr<Permutation<IndexType>> value_permutation;
833 bool invert =
false)
const;
880 bool invert =
false)
const;
912 bool invert =
false)
const;
914 std::unique_ptr<LinOp>
permute(
949 bool is_sorted_by_column_index()
const;
1075 strategy_ = std::move(strategy->copy());
1088 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
1101 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
1113 static std::unique_ptr<Csr>
create(std::shared_ptr<const Executor> exec,
1114 std::shared_ptr<strategy_type> strategy);
1127 static std::unique_ptr<Csr>
create(
1128 std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1130 std::shared_ptr<strategy_type> strategy =
nullptr);
1151 static std::unique_ptr<Csr>
create(
1152 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1153 array<value_type> values, array<index_type> col_idxs,
1154 array<index_type> row_ptrs,
1155 std::shared_ptr<strategy_type> strategy =
nullptr);
1161 template <
typename InputValueType,
typename InputColumnIndexType,
1162 typename InputRowPtrType>
1164 "explicitly construct the gko::array argument instead of passing "
1165 "initializer lists")
1167 std::shared_ptr<const
Executor> exec, const
dim<2>& size,
1168 std::initializer_list<InputValueType> values,
1169 std::initializer_list<InputColumnIndexType> col_idxs,
1170 std::initializer_list<InputRowPtrType> row_ptrs)
1193 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1194 gko::detail::const_array_view<ValueType>&& values,
1195 gko::detail::const_array_view<IndexType>&& col_idxs,
1196 gko::detail::const_array_view<IndexType>&& row_ptrs,
1197 std::shared_ptr<strategy_type> strategy =
nullptr);
1227 const span& row_span,
const span& column_span)
const;
1254 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1256 std::shared_ptr<strategy_type> strategy =
nullptr);
1258 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size,
1259 array<value_type> values, array<index_type> col_idxs,
1260 array<index_type> row_ptrs,
1261 std::shared_ptr<strategy_type> strategy =
nullptr);
1263 void apply_impl(
const LinOp* b,
LinOp* x)
const override;
1265 void apply_impl(
const LinOp* alpha,
const LinOp* b,
const LinOp* beta,
1266 LinOp* x)
const override;
// Chooses the default SpMV strategy for an executor: `automatical` on
// CUDA, HIP, and DPC++ devices, `classical` for all other executors.
1269 static std::shared_ptr<strategy_type> make_default_strategy(
1270 std::shared_ptr<const Executor> exec)
// Probe the concrete executor type via checked downcasts.
1272 auto cuda_exec = std::dynamic_pointer_cast<const CudaExecutor>(exec);
1273 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec);
1274 auto dpcpp_exec = std::dynamic_pointer_cast<const DpcppExecutor>(exec);
1275 std::shared_ptr<strategy_type> new_strategy;
1277 new_strategy = std::make_shared<automatical>(cuda_exec);
1278 }
else if (hip_exec) {
1279 new_strategy = std::make_shared<automatical>(hip_exec);
1280 }
else if (dpcpp_exec) {
1281 new_strategy = std::make_shared<automatical>(dpcpp_exec);
// Fallback for host executors (and anything not matched above).
1283 new_strategy = std::make_shared<classical>();
1285 return new_strategy;
// Translates this matrix's strategy into the equivalent strategy type of
// the destination CsrType during conversion. Stateless strategies map
// one-to-one; load_balance/automatical are reconstructed against the
// destination's executor (or, failing that, this matrix's executor).
1289 template <
typename CsrType>
1290 void convert_strategy_helper(CsrType* result)
const
1293 std::shared_ptr<typename CsrType::strategy_type> new_strat;
// Stateless strategies: create the destination's counterpart directly.
1294 if (dynamic_cast<classical*>(strat)) {
1295 new_strat = std::make_shared<typename CsrType::classical>();
1296 }
else if (dynamic_cast<merge_path*>(strat)) {
1297 new_strat = std::make_shared<typename CsrType::merge_path>();
1298 }
else if (dynamic_cast<cusparse*>(strat)) {
1299 new_strat = std::make_shared<typename CsrType::cusparse>();
1300 }
else if (dynamic_cast<sparselib*>(strat)) {
1301 new_strat = std::make_shared<typename CsrType::sparselib>();
// Executor-dependent strategies: probe the destination's executor type.
1303 auto rexec = result->get_executor();
1305 std::dynamic_pointer_cast<const CudaExecutor>(rexec);
1306 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(rexec);
1308 std::dynamic_pointer_cast<const DpcppExecutor>(rexec);
1309 auto lb = dynamic_cast<load_balance*>(strat);
1313 std::make_shared<typename CsrType::load_balance>(
1316 new_strat = std::make_shared<typename CsrType::automatical>(
1319 }
else if (hip_exec) {
1322 std::make_shared<typename CsrType::load_balance>(
1325 new_strat = std::make_shared<typename CsrType::automatical>(
1328 }
else if (dpcpp_exec) {
1331 std::make_shared<typename CsrType::load_balance>(
1334 new_strat = std::make_shared<typename CsrType::automatical>(
// Destination is not a device executor: fall back to this matrix's own
// executor type to rebuild an equivalent device strategy.
1339 auto this_cuda_exec =
1340 std::dynamic_pointer_cast<const CudaExecutor>(
1342 auto this_hip_exec =
1343 std::dynamic_pointer_cast<const HipExecutor>(
1345 auto this_dpcpp_exec =
1346 std::dynamic_pointer_cast<const DpcppExecutor>(
1348 if (this_cuda_exec) {
1351 std::make_shared<typename CsrType::load_balance>(
1355 std::make_shared<typename CsrType::automatical>(
1358 }
else if (this_hip_exec) {
1361 std::make_shared<typename CsrType::load_balance>(
1365 std::make_shared<typename CsrType::automatical>(
1368 }
else if (this_dpcpp_exec) {
1371 std::make_shared<typename CsrType::load_balance>(
1375 std::make_shared<typename CsrType::automatical>(
// No device executor found anywhere: classical is the safe default.
1383 new_strat = std::make_shared<typename CsrType::classical>();
1387 result->set_strategy(new_strat);
1396 strategy_->process(row_ptrs_, &srow_);
1405 virtual void scale_impl(
const LinOp* alpha);
1413 virtual void inv_scale_impl(
const LinOp* alpha);
1416 std::shared_ptr<strategy_type> strategy_;
1417 array<value_type> values_;
1418 array<index_type> col_idxs_;
1419 array<index_type> row_ptrs_;
1420 array<index_type> srow_;
1422 void add_scaled_identity_impl(
const LinOp* a,
const LinOp* b)
override;
// Rebuilds executor-dependent strategies (load_balance, automatical) of a
// Csr matrix against its current executor, e.g. after the matrix moved to
// a different device. Stateless strategies are left untouched.
1435 template <
typename ValueType,
typename IndexType>
1436 void strategy_rebuild_helper(Csr<ValueType, IndexType>* result)
1438 using load_balance =
typename Csr<ValueType, IndexType>::load_balance;
1439 using automatical =
typename Csr<ValueType, IndexType>::automatical;
1440 auto strategy = result->get_strategy();
1441 auto executor = result->get_executor();
// load_balance: re-create it bound to the HIP or CUDA executor.
1442 if (std::dynamic_pointer_cast<load_balance>(strategy)) {
1444 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1445 result->set_strategy(std::make_shared<load_balance>(exec));
1446 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1448 result->set_strategy(std::make_shared<load_balance>(exec));
1450 }
// automatical: same rebuild against the HIP or CUDA executor.
else if (std::dynamic_pointer_cast<automatical>(strategy)) {
1452 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1453 result->set_strategy(std::make_shared<automatical>(exec));
1454 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1456 result->set_strategy(std::make_shared<automatical>(exec));
1467 #endif // GKO_PUBLIC_CORE_MATRIX_CSR_HPP_