#ifndef GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
#define GKO_PUBLIC_CORE_MATRIX_CSR_HPP_


#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/index_set.hpp>
#include <ginkgo/core/base/lin_op.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/permutation.hpp>
#include <ginkgo/core/matrix/scaled_permutation.hpp>
template <typename ValueType>
class Dense;

template <typename ValueType>
class Diagonal;

template <typename ValueType, typename IndexType>
class Coo;

template <typename ValueType, typename IndexType>
class Ell;

template <typename ValueType, typename IndexType>
class Hybrid;

template <typename ValueType, typename IndexType>
class Sellp;

template <typename ValueType, typename IndexType>
class SparsityCsr;

template <typename ValueType, typename IndexType>
class Csr;

template <typename ValueType, typename IndexType>
class Fbcsr;

template <typename ValueType, typename IndexType>
class CsrBuilder;
template <typename ValueType = default_precision, typename IndexType = int32>
void strategy_rebuild_helper(Csr<ValueType, IndexType>* result);
 
template <typename ValueType = default_precision, typename IndexType = int32>
class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
            public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
            public EnableAbsoluteComputation<
                remove_complex<Csr<ValueType, IndexType>>>,
            public ScaledIdentityAddable {
 
    friend class Coo<ValueType, IndexType>;
    friend class Dense<ValueType>;
    friend class Ell<ValueType, IndexType>;
    friend class Hybrid<ValueType, IndexType>;
    friend class Sellp<ValueType, IndexType>;
    friend class Fbcsr<ValueType, IndexType>;
    friend class CsrBuilder<ValueType, IndexType>;
 
public:
    using value_type = ValueType;
    using index_type = IndexType;
    using mat_data = matrix_data<ValueType, IndexType>;
    using device_mat_data = device_matrix_data<ValueType, IndexType>;
 
    class strategy_type {
    public:
        strategy_type(std::string name) : name_(name) {}

        virtual ~strategy_type() = default;

        std::string get_name() { return name_; }

        // computes the srow partition from the row pointers
        virtual void process(const array<index_type>& mtx_row_ptrs,
                             array<index_type>* mtx_srow) = 0;

        // computes the size of srow from the number of nonzeros
        // ("clac_size" [sic] is the spelling used throughout Ginkgo)
        virtual int64_t clac_size(const int64_t nnz) = 0;

        virtual std::shared_ptr<strategy_type> copy() = 0;

    protected:
        void set_name(std::string name) { name_ = name; }

    private:
        std::string name_;
    };
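    /*
     * Sketch: a concrete strategy only needs process(), clac_size() and
     * copy(). Assuming the strategy_type interface reconstructed above, a
     * minimal do-nothing strategy (hypothetical, for illustration) could
     * look like:
     *
     *     class noop : public strategy_type {
     *     public:
     *         noop() : strategy_type("noop") {}
     *
     *         void process(const array<index_type>& mtx_row_ptrs,
     *                      array<index_type>* mtx_srow) override
     *         {}
     *
     *         int64_t clac_size(const int64_t nnz) override { return 0; }
     *
     *         std::shared_ptr<strategy_type> copy() override
     *         {
     *             return std::make_shared<noop>();
     *         }
     *     };
     */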
 
    class classical : public strategy_type {
    public:
        classical() : strategy_type("classical"), max_length_per_row_(0) {}

        void process(const array<index_type>& mtx_row_ptrs,
                     array<index_type>* mtx_srow) override
        {
            // obtain a host-accessible view of the row pointers
            auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
            array<index_type> row_ptrs_host(host_mtx_exec);
            const bool is_mtx_on_host{host_mtx_exec ==
                                      mtx_row_ptrs.get_executor()};
            const index_type* row_ptrs{};
            if (is_mtx_on_host) {
                row_ptrs = mtx_row_ptrs.get_const_data();
            } else {
                row_ptrs_host = mtx_row_ptrs;
                row_ptrs = row_ptrs_host.get_const_data();
            }
            // track the longest row, which the classical kernels use to
            // pick their per-row parallelism
            auto num_rows = mtx_row_ptrs.get_size() - 1;
            max_length_per_row_ = 0;
            for (size_type i = 0; i < num_rows; i++) {
                max_length_per_row_ = std::max(max_length_per_row_,
                                               row_ptrs[i + 1] - row_ptrs[i]);
            }
        }

        int64_t clac_size(const int64_t nnz) override { return 0; }

        index_type get_max_length_per_row() const noexcept
        {
            return max_length_per_row_;
        }

        std::shared_ptr<strategy_type> copy() override
        {
            return std::make_shared<classical>();
        }

    private:
        index_type max_length_per_row_;
    };
 
    class merge_path : public strategy_type {
    public:
        merge_path() : strategy_type("merge_path") {}

        void process(const array<index_type>& mtx_row_ptrs,
                     array<index_type>* mtx_srow) override
        {}

        int64_t clac_size(const int64_t nnz) override { return 0; }

        std::shared_ptr<strategy_type> copy() override
        {
            return std::make_shared<merge_path>();
        }
    };
 
    class cusparse : public strategy_type {
    public:
        cusparse() : strategy_type("cusparse") {}

        void process(const array<index_type>& mtx_row_ptrs,
                     array<index_type>* mtx_srow) override
        {}

        int64_t clac_size(const int64_t nnz) override { return 0; }

        std::shared_ptr<strategy_type> copy() override
        {
            return std::make_shared<cusparse>();
        }
    };
 
    class sparselib : public strategy_type {
    public:
        sparselib() : strategy_type("sparselib") {}

        void process(const array<index_type>& mtx_row_ptrs,
                     array<index_type>* mtx_srow) override
        {}

        int64_t clac_size(const int64_t nnz) override { return 0; }

        std::shared_ptr<strategy_type> copy() override
        {
            return std::make_shared<sparselib>();
        }
    };
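    /*
     * Sketch: classical, merge_path, cusparse and sparselib carry no
     * device-specific state (their clac_size() returns 0), so they can be
     * attached to an existing matrix directly. Assuming an executor `exec`
     * is available (hypothetical setup, for illustration):
     *
     *     using Csr = gko::matrix::Csr<double, gko::int32>;
     *     auto mtx = Csr::create(exec);
     *     mtx->set_strategy(std::make_shared<Csr::merge_path>());
     */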
 
    class load_balance : public strategy_type {
    public:
        // creates the strategy from a CUDA executor's hardware parameters
        load_balance(std::shared_ptr<const CudaExecutor> exec)
            : load_balance(exec->get_num_warps(), exec->get_warp_size())
        {}

        // creates the strategy from a HIP executor's hardware parameters
        load_balance(std::shared_ptr<const HipExecutor> exec)
            : load_balance(exec->get_num_warps(), exec->get_warp_size(), false)
        {}

        // creates the strategy from a DPC++ executor's hardware parameters
        load_balance(std::shared_ptr<const DpcppExecutor> exec)
            : load_balance(exec->get_num_subgroups(), 32, false, "intel")
        {}

        load_balance(int64_t nwarps, int warp_size = 32,
                     bool cuda_strategy = true,
                     std::string strategy_name = "none")
            : strategy_type("load_balance"),
              nwarps_(nwarps),
              warp_size_(warp_size),
              cuda_strategy_(cuda_strategy),
              strategy_name_(strategy_name)
        {}
 
        void process(const array<index_type>& mtx_row_ptrs,
                     array<index_type>* mtx_srow) override
        {
            auto nwarps = mtx_srow->get_size();

            if (nwarps > 0) {
                auto host_srow_exec = mtx_srow->get_executor()->get_master();
                auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
                const bool is_srow_on_host{host_srow_exec ==
                                           mtx_srow->get_executor()};
                const bool is_mtx_on_host{host_mtx_exec ==
                                          mtx_row_ptrs.get_executor()};
                array<index_type> srow_host(host_srow_exec);
                array<index_type> row_ptrs_host(host_mtx_exec);
                const index_type* row_ptrs{};
                index_type* srow{};
                if (is_srow_on_host) {
                    srow = mtx_srow->get_data();
                } else {
                    srow_host = *mtx_srow;
                    srow = srow_host.get_data();
                }
                if (is_mtx_on_host) {
                    row_ptrs = mtx_row_ptrs.get_const_data();
                } else {
                    row_ptrs_host = mtx_row_ptrs;
                    row_ptrs = row_ptrs_host.get_const_data();
                }
                for (size_type i = 0; i < nwarps; i++) {
                    srow[i] = 0;
                }
                const auto num_rows = mtx_row_ptrs.get_size() - 1;
                const auto num_elems = row_ptrs[num_rows];
                const auto bucket_divider =
                    num_elems > 0 ? ceildiv(num_elems, warp_size_) : 1;
                // count the rows ending in each bucket
                for (size_type i = 0; i < num_rows; i++) {
                    auto bucket = ceildiv(
                        (ceildiv(row_ptrs[i + 1], warp_size_) * nwarps),
                        bucket_divider);
                    if (bucket < nwarps) {
                        srow[bucket]++;
                    }
                }
                // build a prefix sum: srow[i] is the starting row of warp i
                for (size_type i = 1; i < nwarps; i++) {
                    srow[i] += srow[i - 1];
                }
                if (!is_srow_on_host) {
                    *mtx_srow = srow_host;
                }
            }
        }
 
        int64_t clac_size(const int64_t nnz) override
        {
            if (warp_size_ > 0) {
                int multiple = 8;
                if (nnz >= static_cast<int64_t>(2e8)) {
                    multiple = 2048;
                } else if (nnz >= static_cast<int64_t>(2e7)) {
                    multiple = 512;
                } else if (nnz >= static_cast<int64_t>(2e6)) {
                    multiple = 128;
                } else if (nnz >= static_cast<int64_t>(2e5)) {
                    multiple = 32;
                }
                if (strategy_name_ == "intel") {
                    multiple = 8;
                    if (nnz >= static_cast<int64_t>(2e8)) {
                        multiple = 256;
                    } else if (nnz >= static_cast<int64_t>(2e7)) {
                        multiple = 32;
                    }
                }
#if GINKGO_HIP_PLATFORM_HCC
                if (!cuda_strategy_) {
                    multiple = 8;
                    if (nnz >= static_cast<int64_t>(1e7)) {
                        multiple = 64;
                    } else if (nnz >= static_cast<int64_t>(1e6)) {
                        multiple = 16;
                    }
                }
#endif  // GINKGO_HIP_PLATFORM_HCC
                auto nwarps = nwarps_ * multiple;
                return min(ceildiv(nnz, warp_size_), nwarps);
            } else {
                return 0;
            }
        }
 
        std::shared_ptr<strategy_type> copy() override
        {
            return std::make_shared<load_balance>(
                nwarps_, warp_size_, cuda_strategy_, strategy_name_);
        }

    private:
        int64_t nwarps_;
        int warp_size_;
        bool cuda_strategy_;
        std::string strategy_name_;
    };
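    /*
     * Sketch: load_balance is usually constructed from the executor the
     * matrix lives on, so that the warp count and warp size match the
     * hardware. Assuming a CUDA executor `cuda_exec` (hypothetical, for
     * illustration):
     *
     *     using Csr = gko::matrix::Csr<double, gko::int32>;
     *     mtx->set_strategy(std::make_shared<Csr::load_balance>(cuda_exec));
     */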
 
    class automatical : public strategy_type {
    public:
        // use load_balance if the max row length exceeds this on NVIDIA
        const index_type nvidia_row_len_limit = 1024;
        // use load_balance if the nnz count exceeds this on NVIDIA
        const index_type nvidia_nnz_limit{static_cast<index_type>(1e6)};
        // use load_balance if the max row length exceeds this on AMD
        const index_type amd_row_len_limit = 768;
        // use load_balance if the nnz count exceeds this on AMD
        const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
        // use load_balance if the max row length exceeds this on Intel
        const index_type intel_row_len_limit = 25600;
        // use load_balance if the nnz count exceeds this on Intel
        const index_type intel_nnz_limit{static_cast<index_type>(3e8)};
 
        // creates the strategy from a CUDA executor's hardware parameters
        automatical(std::shared_ptr<const CudaExecutor> exec)
            : automatical(exec->get_num_warps(), exec->get_warp_size())
        {}

        // creates the strategy from a HIP executor's hardware parameters
        automatical(std::shared_ptr<const HipExecutor> exec)
            : automatical(exec->get_num_warps(), exec->get_warp_size(), false)
        {}

        // creates the strategy from a DPC++ executor's hardware parameters
        automatical(std::shared_ptr<const DpcppExecutor> exec)
            : automatical(exec->get_num_subgroups(), 32, false, "intel")
        {}

        automatical(int64_t nwarps, int warp_size = 32,
                    bool cuda_strategy = true,
                    std::string strategy_name = "none")
            : strategy_type("automatical"),
              nwarps_(nwarps),
              warp_size_(warp_size),
              cuda_strategy_(cuda_strategy),
              strategy_name_(strategy_name),
              max_length_per_row_(0)
        {}
 
        void process(const array<index_type>& mtx_row_ptrs,
                     array<index_type>* mtx_srow) override
        {
            // if the matrix exceeds the hardware's nnz or row-length limits,
            // use load_balance, otherwise use classical
            index_type nnz_limit = nvidia_nnz_limit;
            index_type row_len_limit = nvidia_row_len_limit;
            if (strategy_name_ == "intel") {
                nnz_limit = intel_nnz_limit;
                row_len_limit = intel_row_len_limit;
            }
#if GINKGO_HIP_PLATFORM_HCC
            if (!cuda_strategy_) {
                nnz_limit = amd_nnz_limit;
                row_len_limit = amd_row_len_limit;
            }
#endif  // GINKGO_HIP_PLATFORM_HCC
            auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
            const bool is_mtx_on_host{host_mtx_exec ==
                                      mtx_row_ptrs.get_executor()};
            array<index_type> row_ptrs_host(host_mtx_exec);
            const index_type* row_ptrs{};
            if (is_mtx_on_host) {
                row_ptrs = mtx_row_ptrs.get_const_data();
            } else {
                row_ptrs_host = mtx_row_ptrs;
                row_ptrs = row_ptrs_host.get_const_data();
            }
            const auto num_rows = mtx_row_ptrs.get_size() - 1;
            if (row_ptrs[num_rows] > nnz_limit) {
                load_balance actual_strategy(nwarps_, warp_size_,
                                             cuda_strategy_, strategy_name_);
                if (is_mtx_on_host) {
                    actual_strategy.process(mtx_row_ptrs, mtx_srow);
                } else {
                    actual_strategy.process(row_ptrs_host, mtx_srow);
                }
                this->set_name(actual_strategy.get_name());
            } else {
                index_type maxnum = 0;
                for (size_type i = 0; i < num_rows; i++) {
                    maxnum = std::max(maxnum, row_ptrs[i + 1] - row_ptrs[i]);
                }
                if (maxnum > row_len_limit) {
                    load_balance actual_strategy(
                        nwarps_, warp_size_, cuda_strategy_, strategy_name_);
                    if (is_mtx_on_host) {
                        actual_strategy.process(mtx_row_ptrs, mtx_srow);
                    } else {
                        actual_strategy.process(row_ptrs_host, mtx_srow);
                    }
                    this->set_name(actual_strategy.get_name());
                } else {
                    classical actual_strategy;
                    if (is_mtx_on_host) {
                        actual_strategy.process(mtx_row_ptrs, mtx_srow);
                        max_length_per_row_ =
                            actual_strategy.get_max_length_per_row();
                    } else {
                        actual_strategy.process(row_ptrs_host, mtx_srow);
                        max_length_per_row_ =
                            actual_strategy.get_max_length_per_row();
                    }
                    this->set_name(actual_strategy.get_name());
                }
            }
        }
 
        int64_t clac_size(const int64_t nnz) override
        {
            return std::make_shared<load_balance>(
                       nwarps_, warp_size_, cuda_strategy_, strategy_name_)
                ->clac_size(nnz);
        }
 
        index_type get_max_length_per_row() const noexcept
        {
            return max_length_per_row_;
        }
 
        std::shared_ptr<strategy_type> copy() override
        {
            return std::make_shared<automatical>(
                nwarps_, warp_size_, cuda_strategy_, strategy_name_);
        }

    private:
        int64_t nwarps_;
        int warp_size_;
        bool cuda_strategy_;
        std::string strategy_name_;
        index_type max_length_per_row_;
    };
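    /*
     * Sketch: automatical inspects the row pointers once and then delegates
     * to load_balance (above the nnz or row-length limits) or classical
     * (below them). Assuming a HIP executor `hip_exec` (hypothetical, for
     * illustration):
     *
     *     using Csr = gko::matrix::Csr<double, gko::int32>;
     *     mtx->set_strategy(std::make_shared<Csr::automatical>(hip_exec));
     */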
 
    void read(const mat_data& data) override;

    void read(const device_mat_data& data) override;

    void read(device_mat_data&& data) override;

    void write(mat_data& data) const override;

    std::unique_ptr<LinOp> transpose() const override;
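    /*
     * Sketch of the read/write round trip, assuming an executor `exec`
     * (hypothetical, for illustration):
     *
     *     gko::matrix_data<double, gko::int32> data{
     *         gko::dim<2>{2, 2}, {{0, 0, 1.0}, {0, 1, 2.0}, {1, 1, 3.0}}};
     *     auto mtx = gko::matrix::Csr<double, gko::int32>::create(exec);
     *     mtx->read(data);   // fills values_, col_idxs_ and row_ptrs_
     *     auto trans = mtx->transpose();
     *     mtx->write(data);  // converts back to coordinate format
     */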
 
    std::unique_ptr<Csr> permute(
        ptr_param<const Permutation<index_type>> row_permutation,
        ptr_param<const Permutation<index_type>> column_permutation,
        bool invert = false) const;

    std::unique_ptr<Csr> scale_permute(
        ptr_param<const ScaledPermutation<value_type, index_type>>
            row_permutation,
        ptr_param<const ScaledPermutation<value_type, index_type>>
            column_permutation,
        bool invert = false) const;

    std::unique_ptr<LinOp> permute(
        const array<IndexType>* permutation_indices) const override;

    bool is_sorted_by_column_index() const;
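    /*
     * Sketch: applying and undoing a row/column permutation, assuming a
     * square matrix `mtx` and a matching gko::matrix::Permutation `perm`
     * (hypothetical, for illustration):
     *
     *     auto permuted = mtx->permute(perm, perm);
     *     auto restored = permuted->permute(perm, perm, true);  // invert
     */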
 
    void set_strategy(std::shared_ptr<strategy_type> strategy)
    {
        strategy_ = std::move(strategy->copy());
        this->make_srow();
    }
 
    void scale(ptr_param<const LinOp> alpha)
    {
        auto exec = this->get_executor();
        GKO_ASSERT_EQUAL_DIMENSIONS(alpha, dim<2>(1, 1));
        this->scale_impl(make_temporary_clone(exec, alpha).get());
    }

    void inv_scale(ptr_param<const LinOp> alpha)
    {
        auto exec = this->get_executor();
        GKO_ASSERT_EQUAL_DIMENSIONS(alpha, dim<2>(1, 1));
        this->inv_scale_impl(make_temporary_clone(exec, alpha).get());
    }
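    /*
     * Sketch: alpha must be 1x1, as checked by the assertions above.
     * Assuming an executor `exec` (hypothetical, for illustration):
     *
     *     auto two = gko::initialize<gko::matrix::Dense<double>>({2.0}, exec);
     *     mtx->scale(two);      // A = 2 * A
     *     mtx->inv_scale(two);  // A = A / 2
     */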
 
    static std::unique_ptr<Csr> create(std::shared_ptr<const Executor> exec,
                                       std::shared_ptr<strategy_type> strategy);
 
    static std::unique_ptr<Csr> create(
        std::shared_ptr<const Executor> exec, const dim<2>& size = {},
        size_type num_nonzeros = {},
        std::shared_ptr<strategy_type> strategy = nullptr);
 
    static std::unique_ptr<Csr> create(
        std::shared_ptr<const Executor> exec, const dim<2>& size,
        array<value_type> values, array<index_type> col_idxs,
        array<index_type> row_ptrs,
        std::shared_ptr<strategy_type> strategy = nullptr);
 
    template <typename InputValueType, typename InputColumnIndexType,
              typename InputRowPtrType>
    GKO_DEPRECATED(
        "explicitly construct the gko::array argument instead of passing "
        "initializer lists")
    static std::unique_ptr<Csr> create(
        std::shared_ptr<const Executor> exec, const dim<2>& size,
        std::initializer_list<InputValueType> values,
        std::initializer_list<InputColumnIndexType> col_idxs,
        std::initializer_list<InputRowPtrType> row_ptrs)
    {
        return create(exec, size, array<value_type>{exec, values},
                      array<index_type>{exec, col_idxs},
                      array<index_type>{exec, row_ptrs});
    }
 
    static std::unique_ptr<const Csr> create_const(
        std::shared_ptr<const Executor> exec, const dim<2>& size,
        gko::detail::const_array_view<ValueType>&& values,
        gko::detail::const_array_view<IndexType>&& col_idxs,
        gko::detail::const_array_view<IndexType>&& row_ptrs,
        std::shared_ptr<strategy_type> strategy = nullptr);
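    /*
     * Sketch: building the 2x2 matrix [1 2; 0 3] directly from its CSR
     * arrays, assuming an executor `exec` (hypothetical, for illustration):
     *
     *     auto mtx = gko::matrix::Csr<double, gko::int32>::create(
     *         exec, gko::dim<2>{2, 2},
     *         gko::array<double>{exec, {1.0, 2.0, 3.0}},  // values
     *         gko::array<gko::int32>{exec, {0, 1, 1}},    // column indices
     *         gko::array<gko::int32>{exec, {0, 2, 3}});   // row pointers
     */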
 
    std::unique_ptr<Csr<ValueType, IndexType>> create_submatrix(
        const span& row_span, const span& column_span) const;
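    /*
     * Sketch: extracting the leading 2x2 block of `mtx`; spans are
     * half-open intervals [begin, end):
     *
     *     auto block = mtx->create_submatrix(gko::span{0, 2},
     *                                        gko::span{0, 2});
     */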
 
    Csr(std::shared_ptr<const Executor> exec, const dim<2>& size = {},
        size_type num_nonzeros = {},
        std::shared_ptr<strategy_type> strategy = nullptr);

    Csr(std::shared_ptr<const Executor> exec, const dim<2>& size,
        array<value_type> values, array<index_type> col_idxs,
        array<index_type> row_ptrs,
        std::shared_ptr<strategy_type> strategy = nullptr);
 
    void apply_impl(const LinOp* b, LinOp* x) const override;

    void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta,
                    LinOp* x) const override;
 
    static std::shared_ptr<strategy_type> make_default_strategy(
        std::shared_ptr<const Executor> exec)
    {
        auto cuda_exec = std::dynamic_pointer_cast<const CudaExecutor>(exec);
        auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec);
        auto dpcpp_exec = std::dynamic_pointer_cast<const DpcppExecutor>(exec);
        std::shared_ptr<strategy_type> new_strategy;
        if (cuda_exec) {
            new_strategy = std::make_shared<automatical>(cuda_exec);
        } else if (hip_exec) {
            new_strategy = std::make_shared<automatical>(hip_exec);
        } else if (dpcpp_exec) {
            new_strategy = std::make_shared<automatical>(dpcpp_exec);
        } else {
            new_strategy = std::make_shared<classical>();
        }
        return new_strategy;
    }
 
    template <typename CsrType>
    void convert_strategy_helper(CsrType* result) const
    {
        auto strat = this->get_strategy().get();
        std::shared_ptr<typename CsrType::strategy_type> new_strat;
        if (dynamic_cast<classical*>(strat)) {
            new_strat = std::make_shared<typename CsrType::classical>();
        } else if (dynamic_cast<merge_path*>(strat)) {
            new_strat = std::make_shared<typename CsrType::merge_path>();
        } else if (dynamic_cast<cusparse*>(strat)) {
            new_strat = std::make_shared<typename CsrType::cusparse>();
        } else if (dynamic_cast<sparselib*>(strat)) {
            new_strat = std::make_shared<typename CsrType::sparselib>();
        } else {
            // load_balance and automatical need device parameters, so they
            // are rebuilt from the result's executor if possible
            auto rexec = result->get_executor();
            auto cuda_exec =
                std::dynamic_pointer_cast<const CudaExecutor>(rexec);
            auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(rexec);
            auto dpcpp_exec =
                std::dynamic_pointer_cast<const DpcppExecutor>(rexec);
            auto lb = dynamic_cast<load_balance*>(strat);
            if (cuda_exec) {
                if (lb) {
                    new_strat =
                        std::make_shared<typename CsrType::load_balance>(
                            cuda_exec);
                } else {
                    new_strat = std::make_shared<typename CsrType::automatical>(
                        cuda_exec);
                }
            } else if (hip_exec) {
                if (lb) {
                    new_strat =
                        std::make_shared<typename CsrType::load_balance>(
                            hip_exec);
                } else {
                    new_strat = std::make_shared<typename CsrType::automatical>(
                        hip_exec);
                }
            } else if (dpcpp_exec) {
                if (lb) {
                    new_strat =
                        std::make_shared<typename CsrType::load_balance>(
                            dpcpp_exec);
                } else {
                    new_strat = std::make_shared<typename CsrType::automatical>(
                        dpcpp_exec);
                }
            } else {
                // fall back to this matrix's own executor
                auto this_cuda_exec =
                    std::dynamic_pointer_cast<const CudaExecutor>(
                        this->get_executor());
                auto this_hip_exec =
                    std::dynamic_pointer_cast<const HipExecutor>(
                        this->get_executor());
                auto this_dpcpp_exec =
                    std::dynamic_pointer_cast<const DpcppExecutor>(
                        this->get_executor());
                if (this_cuda_exec) {
                    if (lb) {
                        new_strat =
                            std::make_shared<typename CsrType::load_balance>(
                                this_cuda_exec);
                    } else {
                        new_strat =
                            std::make_shared<typename CsrType::automatical>(
                                this_cuda_exec);
                    }
                } else if (this_hip_exec) {
                    if (lb) {
                        new_strat =
                            std::make_shared<typename CsrType::load_balance>(
                                this_hip_exec);
                    } else {
                        new_strat =
                            std::make_shared<typename CsrType::automatical>(
                                this_hip_exec);
                    }
                } else if (this_dpcpp_exec) {
                    if (lb) {
                        new_strat =
                            std::make_shared<typename CsrType::load_balance>(
                                this_dpcpp_exec);
                    } else {
                        new_strat =
                            std::make_shared<typename CsrType::automatical>(
                                this_dpcpp_exec);
                    }
                } else {
                    // neither CUDA, HIP nor DPC++: the device parameters are
                    // unavailable, so fall back to classical
                    new_strat = std::make_shared<typename CsrType::classical>();
                }
            }
        }
        result->set_strategy(new_strat);
    }
 
    // computes srow; must be rerun after any change to row_ptrs_
    void make_srow()
    {
        srow_.resize_and_reset(strategy_->clac_size(values_.get_size()));
        strategy_->process(row_ptrs_, &srow_);
    }
 
    virtual void scale_impl(const LinOp* alpha);

    virtual void inv_scale_impl(const LinOp* alpha);
 
    std::shared_ptr<strategy_type> strategy_;
    array<value_type> values_;
    array<index_type> col_idxs_;
    array<index_type> row_ptrs_;
    array<index_type> srow_;
 
    void add_scaled_identity_impl(const LinOp* a, const LinOp* b) override;
};
 
/**
 * When the strategy is load_balance or automatical, rebuild the strategy
 * according to the executor's properties.
 *
 * @param result  the csr matrix.
 */
template <typename ValueType, typename IndexType>
void strategy_rebuild_helper(Csr<ValueType, IndexType>* result)
{
    using load_balance = typename Csr<ValueType, IndexType>::load_balance;
    using automatical = typename Csr<ValueType, IndexType>::automatical;
    auto strategy = result->get_strategy();
    auto executor = result->get_executor();
    if (std::dynamic_pointer_cast<load_balance>(strategy)) {
        if (auto exec =
                std::dynamic_pointer_cast<const HipExecutor>(executor)) {
            result->set_strategy(std::make_shared<load_balance>(exec));
        } else if (auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
                       executor)) {
            result->set_strategy(std::make_shared<load_balance>(exec));
        }
    } else if (std::dynamic_pointer_cast<automatical>(strategy)) {
        if (auto exec =
                std::dynamic_pointer_cast<const HipExecutor>(executor)) {
            result->set_strategy(std::make_shared<automatical>(exec));
        } else if (auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
                       executor)) {
            result->set_strategy(std::make_shared<automatical>(exec));
        }
    }
}
#endif  // GKO_PUBLIC_CORE_MATRIX_CSR_HPP_