5 #ifndef GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
6 #define GKO_PUBLIC_CORE_MATRIX_CSR_HPP_
#include <utility>

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/index_set.hpp>
#include <ginkgo/core/base/lin_op.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/permutation.hpp>
#include <ginkgo/core/matrix/scaled_permutation.hpp>
21 template <
typename ValueType>
24 template <
typename ValueType>
27 template <
typename ValueType,
typename IndexType>
30 template <
typename ValueType,
typename IndexType>
33 template <
typename ValueType,
typename IndexType>
36 template <
typename ValueType,
typename IndexType>
39 template <
typename ValueType,
typename IndexType>
42 template <
typename ValueType,
typename IndexType>
45 template <
typename ValueType,
typename IndexType>
48 template <
typename ValueType,
typename IndexType>
51 template <
typename IndexType>
58 template <
typename ValueType = default_precision,
typename IndexType =
int32>
103 template <
typename ValueType = default_precision,
typename IndexType =
int32>
105 public ConvertibleTo<Csr<next_precision<ValueType>, IndexType>>,
106 #if GINKGO_ENABLE_HALF || GINKGO_ENABLE_BFLOAT16
107 public ConvertibleTo<Csr<next_precision<ValueType, 2>, IndexType>>,
109 #if GINKGO_ENABLE_HALF && GINKGO_ENABLE_BFLOAT16
110 public ConvertibleTo<Csr<next_precision<ValueType, 3>, IndexType>>,
125 remove_complex<Csr<ValueType, IndexType>>>,
128 friend class Coo<ValueType, IndexType>;
129 friend class Dense<ValueType>;
131 friend class Ell<ValueType, IndexType>;
132 friend class Hybrid<ValueType, IndexType>;
133 friend class Sellp<ValueType, IndexType>;
135 friend class Fbcsr<ValueType, IndexType>;
136 friend class CsrBuilder<ValueType, IndexType>;
160 using value_type = ValueType;
161 using index_type = IndexType;
/**
 * Computes the number of auxiliary (srow) elements this strategy needs
 * for a matrix with the given number of stored nonzeros.
 *
 * NOTE(review): "clac_size" looks like a typo of "calc_size", but it is
 * part of the public virtual interface, so renaming would break every
 * derived strategy and caller.
 *
 * @param nnz  number of stored nonzeros in the matrix
 * @return the required auxiliary storage size
 */
211 virtual int64_t
clac_size(
const int64_t nnz) = 0;
/**
 * Creates a polymorphic copy of this strategy object.
 *
 * @return a newly allocated copy of the concrete strategy
 */
217 virtual std::shared_ptr<strategy_type>
copy() = 0;
220 void set_name(std::string name) { name_ = name; }
242 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
244 const bool is_mtx_on_host{host_mtx_exec ==
246 const index_type* row_ptrs{};
247 if (is_mtx_on_host) {
250 row_ptrs_host = mtx_row_ptrs;
253 auto num_rows = mtx_row_ptrs.
get_size() - 1;
254 max_length_per_row_ = 0;
255 for (
size_type i = 0; i < num_rows; i++) {
256 max_length_per_row_ = std::max(max_length_per_row_,
257 row_ptrs[i + 1] - row_ptrs[i]);
// The classical strategy assigns (sub)warps directly per row and needs
// no auxiliary srow storage, so the required size is always zero.
261 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
263 index_type get_max_length_per_row() const noexcept
265 return max_length_per_row_;
268 std::shared_ptr<strategy_type>
copy()
override
270 return std::make_shared<classical>();
274 index_type max_length_per_row_;
// merge_path partitions work at kernel launch time and needs no
// auxiliary srow storage, so the required size is always zero.
293 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
295 std::shared_ptr<strategy_type>
copy()
override
297 return std::make_shared<merge_path>();
// The vendor (cuSPARSE) library manages its own workspace, so no
// auxiliary srow storage is requested here.
318 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
320 std::shared_ptr<strategy_type>
copy()
override
322 return std::make_shared<cusparse>();
// The vendor sparse library manages its own workspace, so no auxiliary
// srow storage is requested here.
342 int64_t
clac_size(
const int64_t nnz)
override {
return 0; }
344 std::shared_ptr<strategy_type>
copy()
override
346 return std::make_shared<sparselib>();
372 :
load_balance(exec->get_num_warps(), exec->get_warp_size())
381 :
load_balance(exec->get_num_warps(), exec->get_warp_size(), false)
392 :
load_balance(exec->get_num_subgroups(), 32, false,
"intel")
407 bool cuda_strategy =
true,
408 std::string strategy_name =
"none")
411 warp_size_(warp_size),
412 cuda_strategy_(cuda_strategy),
413 strategy_name_(strategy_name)
422 auto host_srow_exec = mtx_srow->
get_executor()->get_master();
423 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
424 const bool is_srow_on_host{host_srow_exec ==
426 const bool is_mtx_on_host{host_mtx_exec ==
430 const index_type* row_ptrs{};
432 if (is_srow_on_host) {
435 srow_host = *mtx_srow;
438 if (is_mtx_on_host) {
441 row_ptrs_host = mtx_row_ptrs;
447 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
448 const auto num_elems = row_ptrs[num_rows];
449 const auto bucket_divider =
450 num_elems > 0 ?
ceildiv(num_elems, warp_size_) : 1;
451 for (
size_type i = 0; i < num_rows; i++) {
455 if (bucket < nwarps) {
461 srow[i] += srow[i - 1];
463 if (!is_srow_on_host) {
464 *mtx_srow = srow_host;
471 if (warp_size_ > 0) {
473 if (nnz >= static_cast<int64_t>(2e8)) {
475 }
else if (nnz >= static_cast<int64_t>(2e7)) {
477 }
else if (nnz >= static_cast<int64_t>(2e6)) {
479 }
else if (nnz >= static_cast<int64_t>(2e5)) {
482 if (strategy_name_ ==
"intel") {
484 if (nnz >= static_cast<int64_t>(2e8)) {
486 }
else if (nnz >= static_cast<int64_t>(2e7)) {
490 #if GINKGO_HIP_PLATFORM_HCC
491 if (!cuda_strategy_) {
493 if (nnz >= static_cast<int64_t>(1e7)) {
495 }
else if (nnz >= static_cast<int64_t>(1e6)) {
499 #endif // GINKGO_HIP_PLATFORM_HCC
501 auto nwarps = nwarps_ * multiple;
508 std::shared_ptr<strategy_type>
copy()
override
510 return std::make_shared<load_balance>(
511 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
518 std::string strategy_name_;
// Tuning thresholds used by the automatical strategy to pick between
// classical and load_balance, per device vendor. Rows longer than the
// row-length limit, or matrices with more nonzeros than the nnz limit,
// are routed to load_balance.
// Maximum row length before switching strategies on NVIDIA GPUs.
525 const index_type nvidia_row_len_limit = 1024;
// Maximum total nonzero count before switching strategies on NVIDIA GPUs.
528 const index_type nvidia_nnz_limit{static_cast<index_type>(1e6)};
// Maximum row length before switching strategies on AMD GPUs.
531 const index_type amd_row_len_limit = 768;
// Maximum total nonzero count before switching strategies on AMD GPUs.
534 const index_type amd_nnz_limit{static_cast<index_type>(1e8)};
// Maximum row length before switching strategies on Intel devices.
537 const index_type intel_row_len_limit = 25600;
// Maximum total nonzero count before switching strategies on Intel devices.
540 const index_type intel_nnz_limit{static_cast<index_type>(3e8)};
560 :
automatical(exec->get_num_warps(), exec->get_warp_size())
569 :
automatical(exec->get_num_warps(), exec->get_warp_size(), false)
580 :
automatical(exec->get_num_subgroups(), 32, false,
"intel")
595 bool cuda_strategy =
true,
596 std::string strategy_name =
"none")
599 warp_size_(warp_size),
600 cuda_strategy_(cuda_strategy),
601 strategy_name_(strategy_name),
602 max_length_per_row_(0)
611 index_type nnz_limit = nvidia_nnz_limit;
612 index_type row_len_limit = nvidia_row_len_limit;
613 if (strategy_name_ ==
"intel") {
614 nnz_limit = intel_nnz_limit;
615 row_len_limit = intel_row_len_limit;
617 #if GINKGO_HIP_PLATFORM_HCC
618 if (!cuda_strategy_) {
619 nnz_limit = amd_nnz_limit;
620 row_len_limit = amd_row_len_limit;
622 #endif // GINKGO_HIP_PLATFORM_HCC
623 auto host_mtx_exec = mtx_row_ptrs.
get_executor()->get_master();
624 const bool is_mtx_on_host{host_mtx_exec ==
627 const index_type* row_ptrs{};
628 if (is_mtx_on_host) {
631 row_ptrs_host = mtx_row_ptrs;
634 const auto num_rows = mtx_row_ptrs.
get_size() - 1;
635 if (row_ptrs[num_rows] > nnz_limit) {
637 cuda_strategy_, strategy_name_);
638 if (is_mtx_on_host) {
639 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
641 actual_strategy.
process(row_ptrs_host, mtx_srow);
643 this->set_name(actual_strategy.
get_name());
645 index_type maxnum = 0;
646 for (
size_type i = 0; i < num_rows; i++) {
647 maxnum = std::max(maxnum, row_ptrs[i + 1] - row_ptrs[i]);
649 if (maxnum > row_len_limit) {
651 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
652 if (is_mtx_on_host) {
653 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
655 actual_strategy.
process(row_ptrs_host, mtx_srow);
657 this->set_name(actual_strategy.
get_name());
660 if (is_mtx_on_host) {
661 actual_strategy.
process(mtx_row_ptrs, mtx_srow);
662 max_length_per_row_ =
663 actual_strategy.get_max_length_per_row();
665 actual_strategy.
process(row_ptrs_host, mtx_srow);
666 max_length_per_row_ =
667 actual_strategy.get_max_length_per_row();
669 this->set_name(actual_strategy.
get_name());
676 return std::make_shared<load_balance>(
677 nwarps_, warp_size_, cuda_strategy_, strategy_name_)
681 index_type get_max_length_per_row() const noexcept
683 return max_length_per_row_;
686 std::shared_ptr<strategy_type>
copy()
override
688 return std::make_shared<automatical>(
689 nwarps_, warp_size_, cuda_strategy_, strategy_name_);
696 std::string strategy_name_;
697 index_type max_length_per_row_;
707 #if GINKGO_ENABLE_HALF || GINKGO_ENABLE_BFLOAT16
719 #if GINKGO_ENABLE_HALF && GINKGO_ENABLE_BFLOAT16
759 void read(
const mat_data& data)
override;
761 void read(
const device_mat_data& data)
override;
763 void read(device_mat_data&& data)
override;
765 void write(mat_data& data)
const override;
767 std::unique_ptr<LinOp>
transpose()
const override;
793 std::unique_ptr<Permutation<IndexType>> value_permutation;
844 bool invert =
false)
const;
891 bool invert =
false)
const;
923 bool invert =
false)
const;
925 std::unique_ptr<LinOp>
permute(
960 bool is_sorted_by_column_index()
const;
1086 strategy_ = std::move(strategy->copy());
1099 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
1112 GKO_ASSERT_EQUAL_DIMENSIONS(alpha,
dim<2>(1, 1));
1124 static std::unique_ptr<Csr>
create(std::shared_ptr<const Executor> exec,
1125 std::shared_ptr<strategy_type> strategy);
1138 static std::unique_ptr<Csr>
create(
1139 std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1141 std::shared_ptr<strategy_type> strategy =
nullptr);
1162 static std::unique_ptr<Csr>
create(
1163 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1164 array<value_type> values, array<index_type> col_idxs,
1165 array<index_type> row_ptrs,
1166 std::shared_ptr<strategy_type> strategy =
nullptr);
1172 template <
typename InputValueType,
typename InputColumnIndexType,
1173 typename InputRowPtrType>
1175 "explicitly construct the gko::array argument instead of passing "
1176 "initializer lists")
1178 std::shared_ptr<const
Executor> exec, const
dim<2>& size,
1179 std::initializer_list<InputValueType> values,
1180 std::initializer_list<InputColumnIndexType> col_idxs,
1181 std::initializer_list<InputRowPtrType> row_ptrs)
1204 std::shared_ptr<const Executor> exec,
const dim<2>& size,
1205 gko::detail::const_array_view<ValueType>&& values,
1206 gko::detail::const_array_view<IndexType>&& col_idxs,
1207 gko::detail::const_array_view<IndexType>&& row_ptrs,
1208 std::shared_ptr<strategy_type> strategy =
nullptr);
1238 const span& row_span,
const span& column_span)
const;
1265 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size = {},
1267 std::shared_ptr<strategy_type> strategy =
nullptr);
1269 Csr(std::shared_ptr<const Executor> exec,
const dim<2>& size,
1270 array<value_type> values, array<index_type> col_idxs,
1271 array<index_type> row_ptrs,
1272 std::shared_ptr<strategy_type> strategy =
nullptr);
1274 void apply_impl(
const LinOp* b,
LinOp* x)
const override;
1276 void apply_impl(
const LinOp* alpha,
const LinOp* b,
const LinOp* beta,
1277 LinOp* x)
const override;
1280 static std::shared_ptr<strategy_type> make_default_strategy(
1281 std::shared_ptr<const Executor> exec)
1283 auto cuda_exec = std::dynamic_pointer_cast<const CudaExecutor>(exec);
1284 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(exec);
1285 auto dpcpp_exec = std::dynamic_pointer_cast<const DpcppExecutor>(exec);
1286 std::shared_ptr<strategy_type> new_strategy;
1288 new_strategy = std::make_shared<automatical>(cuda_exec);
1289 }
else if (hip_exec) {
1290 new_strategy = std::make_shared<automatical>(hip_exec);
1291 }
else if (dpcpp_exec) {
1292 new_strategy = std::make_shared<automatical>(dpcpp_exec);
1294 new_strategy = std::make_shared<classical>();
1296 return new_strategy;
1300 template <
typename CsrType>
1301 void convert_strategy_helper(CsrType* result)
const
1304 std::shared_ptr<typename CsrType::strategy_type> new_strat;
1305 if (dynamic_cast<classical*>(strat)) {
1306 new_strat = std::make_shared<typename CsrType::classical>();
1307 }
else if (dynamic_cast<merge_path*>(strat)) {
1308 new_strat = std::make_shared<typename CsrType::merge_path>();
1309 }
else if (dynamic_cast<cusparse*>(strat)) {
1310 new_strat = std::make_shared<typename CsrType::cusparse>();
1311 }
else if (dynamic_cast<sparselib*>(strat)) {
1312 new_strat = std::make_shared<typename CsrType::sparselib>();
1314 auto rexec = result->get_executor();
1316 std::dynamic_pointer_cast<const CudaExecutor>(rexec);
1317 auto hip_exec = std::dynamic_pointer_cast<const HipExecutor>(rexec);
1319 std::dynamic_pointer_cast<const DpcppExecutor>(rexec);
1320 auto lb = dynamic_cast<load_balance*>(strat);
1324 std::make_shared<typename CsrType::load_balance>(
1327 new_strat = std::make_shared<typename CsrType::automatical>(
1330 }
else if (hip_exec) {
1333 std::make_shared<typename CsrType::load_balance>(
1336 new_strat = std::make_shared<typename CsrType::automatical>(
1339 }
else if (dpcpp_exec) {
1342 std::make_shared<typename CsrType::load_balance>(
1345 new_strat = std::make_shared<typename CsrType::automatical>(
1350 auto this_cuda_exec =
1351 std::dynamic_pointer_cast<const CudaExecutor>(
1353 auto this_hip_exec =
1354 std::dynamic_pointer_cast<const HipExecutor>(
1356 auto this_dpcpp_exec =
1357 std::dynamic_pointer_cast<const DpcppExecutor>(
1359 if (this_cuda_exec) {
1362 std::make_shared<typename CsrType::load_balance>(
1366 std::make_shared<typename CsrType::automatical>(
1369 }
else if (this_hip_exec) {
1372 std::make_shared<typename CsrType::load_balance>(
1376 std::make_shared<typename CsrType::automatical>(
1379 }
else if (this_dpcpp_exec) {
1382 std::make_shared<typename CsrType::load_balance>(
1386 std::make_shared<typename CsrType::automatical>(
1394 new_strat = std::make_shared<typename CsrType::classical>();
1398 result->set_strategy(new_strat);
1407 strategy_->process(row_ptrs_, &srow_);
1416 virtual void scale_impl(
const LinOp* alpha);
1424 virtual void inv_scale_impl(
const LinOp* alpha);
// Strategy object that builds/uses the srow_ array (processed via
// strategy_->process(row_ptrs_, &srow_) elsewhere in this class).
1427 std::shared_ptr<strategy_type> strategy_;
// Nonzero values of the CSR matrix, one entry per stored element.
1428 array<value_type> values_;
// Column index of each stored element, parallel to values_.
1429 array<index_type> col_idxs_;
// Row pointer array; row_ptrs_[i+1] - row_ptrs_[i] is the length of row
// i (see the loops over row_ptrs in the strategy classes above).
1430 array<index_type> row_ptrs_;
// Auxiliary array filled by the strategy — presumably per-warp starting
// rows for the load-balanced SpMV kernels; confirm against the kernels.
1431 array<index_type> srow_;
1433 void add_scaled_identity_impl(
const LinOp* a,
const LinOp* b)
override;
1446 template <
typename ValueType,
typename IndexType>
1447 void strategy_rebuild_helper(Csr<ValueType, IndexType>* result)
1449 using load_balance =
typename Csr<ValueType, IndexType>::load_balance;
1450 using automatical =
typename Csr<ValueType, IndexType>::automatical;
1451 auto strategy = result->get_strategy();
1452 auto executor = result->get_executor();
1453 if (std::dynamic_pointer_cast<load_balance>(strategy)) {
1455 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1456 result->set_strategy(std::make_shared<load_balance>(exec));
1457 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1459 result->set_strategy(std::make_shared<load_balance>(exec));
1461 }
else if (std::dynamic_pointer_cast<automatical>(strategy)) {
1463 std::dynamic_pointer_cast<const HipExecutor>(executor)) {
1464 result->set_strategy(std::make_shared<automatical>(exec));
1465 }
else if (
auto exec = std::dynamic_pointer_cast<const CudaExecutor>(
1467 result->set_strategy(std::make_shared<automatical>(exec));
1478 #endif // GKO_PUBLIC_CORE_MATRIX_CSR_HPP_