5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
17 #include <type_traits>
20 #include <ginkgo/core/base/device.hpp>
21 #include <ginkgo/core/base/fwd_decls.hpp>
22 #include <ginkgo/core/base/machine_topology.hpp>
23 #include <ginkgo/core/base/memory.hpp>
24 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
25 #include <ginkgo/core/base/types.hpp>
26 #include <ginkgo/core/log/logger.hpp>
27 #include <ginkgo/core/synthesizer/containers.hpp>
68 constexpr
allocation_mode default_cuda_alloc_mode = allocation_mode::device;
70 constexpr
allocation_mode default_hip_alloc_mode = allocation_mode::device;
76 allocation_mode::unified_global;
78 #if (GINKGO_HIP_PLATFORM_HCC == 1)
81 constexpr
allocation_mode default_hip_alloc_mode = allocation_mode::device;
87 allocation_mode::unified_global;
101 enum class dpcpp_queue_property {
// Bitwise OR for dpcpp_queue_property: combines queue-property flags through
// their underlying int representation, so callers can request several
// properties at once (the enum acts as a bit mask).
// NOTE(review): the function's braces are elided in this view; code below is
// reproduced unchanged.
113 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114 dpcpp_queue_property b)
116 return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117 static_cast<int>(b));
// Forward-declares one executor class per expansion; GKO_ENABLE_FOR_ALL_EXECUTORS
// applies it to every executor type in the list, then the helper macro is
// undefined to keep it out of the public macro namespace.
124 #define GKO_FORWARD_DECLARE(_type, ...) class _type
126 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
128 #undef GKO_FORWARD_DECLARE
131 class ReferenceExecutor;
// Declares one virtual run overload per executor type in the
// GKO_ENABLE_FOR_ALL_EXECUTORS list, enabling double dispatch of an
// operation onto the concrete executor backend.
260 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261 virtual void run(std::shared_ptr<const _type>) const
263 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
265 #undef GKO_DECLARE_RUN_OVERLOAD
// ReferenceExecutor is declared separately — it is not part of the
// GKO_ENABLE_FOR_ALL_EXECUTORS list expanded above.
268 virtual void run(std::shared_ptr<const ReferenceExecutor> executor)
const;
// Human-readable name of this operation (used by loggers elsewhere in this
// file); noexcept so it is safe to call on logging/error paths.
275 virtual const char*
get_name()
const noexcept;
291 template <
typename Closure>
292 class RegisteredOperation :
public Operation {
300 RegisteredOperation(
const char* name, Closure op)
301 : name_(name), op_(std::move(op))
304 const char* get_name() const noexcept
override {
return name_; }
306 void run(std::shared_ptr<const ReferenceExecutor> exec)
const override
311 void run(std::shared_ptr<const OmpExecutor> exec)
const override
316 void run(std::shared_ptr<const CudaExecutor> exec)
const override
321 void run(std::shared_ptr<const HipExecutor> exec)
const override
326 void run(std::shared_ptr<const DpcppExecutor> exec)
const override
337 template <
typename Closure>
338 RegisteredOperation<Closure> make_register_operation(
const char* name,
341 return RegisteredOperation<Closure>{name, std::move(op)};
/*
 * GKO_REGISTER_OPERATION(_name, _kernel) generates a make_<_name>(...) factory
 * that returns a RegisteredOperation (via ::gko::detail::make_register_operation)
 * named after #_kernel. The generated closure captures the arguments by
 * reference and, using `if constexpr` on the deduced executor type, dispatches
 * to the matching backend kernel namespace
 * (::gko::kernels::reference / omp / cuda / hip / dpcpp::_kernel), forwarding
 * the arguments; an executor type matching none of the branches reaches
 * GKO_NOT_IMPLEMENTED. The trailing static_assert(true, ...) exists only to
 * consume the semicolon written at the macro's usage site.
 * NOTE(review): several continuation lines of this macro are elided in this
 * view; the code below is reproduced unchanged.
 */
419 #define GKO_REGISTER_OPERATION(_name, _kernel) \
420 template <typename... Args> \
421 auto make_##_name(Args&&... args) \
423 return ::gko::detail::make_register_operation( \
424 #_kernel, [&args...](auto exec) { \
425 using exec_type = decltype(exec); \
426 if constexpr (std::is_same< \
429 const ::gko::ReferenceExecutor>>:: \
431 ::gko::kernels::reference::_kernel( \
432 std::dynamic_pointer_cast< \
433 const ::gko::ReferenceExecutor>(exec), \
434 std::forward<Args>(args)...); \
435 } else if constexpr ( \
438 std::shared_ptr<const ::gko::OmpExecutor>>::value) { \
439 ::gko::kernels::omp::_kernel( \
440 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
442 std::forward<Args>(args)...); \
443 } else if constexpr ( \
446 std::shared_ptr<const ::gko::CudaExecutor>>::value) { \
447 ::gko::kernels::cuda::_kernel( \
448 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
450 std::forward<Args>(args)...); \
451 } else if constexpr ( \
454 std::shared_ptr<const ::gko::HipExecutor>>::value) { \
455 ::gko::kernels::hip::_kernel( \
456 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
458 std::forward<Args>(args)...); \
459 } else if constexpr ( \
462 std::shared_ptr<const ::gko::DpcppExecutor>>::value) { \
463 ::gko::kernels::dpcpp::_kernel( \
464 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
466 std::forward<Args>(args)...); \
468 GKO_NOT_IMPLEMENTED; \
472 static_assert(true, \
473 "This assert is used to counter the false positive extra " \
474 "semi-colon warnings")
/*
 * GKO_REGISTER_HOST_OPERATION(_name, _kernel) generates a make_<_name>(...)
 * factory whose closure calls _kernel(args...) on the host regardless of which
 * executor it is run on (the executor parameter of the closure is ignored).
 * The trailing static_assert(true, ...) consumes the semicolon at the usage
 * site, as in GKO_REGISTER_OPERATION.
 * NOTE(review): some continuation lines of this macro are elided in this
 * view; the code below is reproduced unchanged.
 */
514 #define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
515 template <typename... Args> \
516 auto make_##_name(Args&&... args) \
518 return ::gko::detail::make_register_operation( \
520 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
522 static_assert(true, \
523 "This assert is used to counter the false positive extra " \
524 "semi-colon warnings")
// Grants one concrete executor class friend access per expansion; expanded
// below through GKO_ENABLE_FOR_ALL_EXECUTORS so every executor type can reach
// the enclosing class's non-public members.
527 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
617 template <
typename T>
618 friend class detail::ExecutorBase;
620 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
653 template <
typename ClosureOmp,
typename ClosureCuda,
typename ClosureHip,
654 typename ClosureDpcpp>
656 "Please use the overload with std::string as first parameter.")
657 void
run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
658 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
const
660 LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
662 op(op_omp, op_cuda, op_hip, op_dpcpp);
682 template <
typename ClosureReference,
typename ClosureOmp,
683 typename ClosureCuda,
typename ClosureHip,
typename ClosureDpcpp>
684 void run(std::string name,
const ClosureReference& op_ref,
685 const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
686 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
const
688 LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
690 op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
705 template <
typename T>
708 this->
template log<log::Logger::allocation_started>(
709 this, num_elems *
sizeof(T));
710 T* allocated = static_cast<T*>(this->raw_alloc(num_elems *
sizeof(T)));
711 this->
template log<log::Logger::allocation_completed>(
712 this, num_elems *
sizeof(T), reinterpret_cast<uintptr>(allocated));
723 void free(
void* ptr)
const noexcept
725 this->
template log<log::Logger::free_started>(
726 this, reinterpret_cast<uintptr>(ptr));
728 this->
template log<log::Logger::free_completed>(
729 this, reinterpret_cast<uintptr>(ptr));
744 template <
typename T>
746 const T* src_ptr, T* dest_ptr)
const
748 const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
749 const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
750 this->
template log<log::Logger::copy_started>(
751 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
752 if (
this != src_exec.
get()) {
753 src_exec->template log<log::Logger::copy_started>(
754 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
757 this->raw_copy_from(src_exec.
get(), num_elems *
sizeof(T), src_ptr,
760 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
763 std::cerr <<
"Not direct copy. Try to copy data from the masters."
766 auto src_master = src_exec->get_master().
get();
767 if (num_elems > 0 && src_master != src_exec.
get()) {
768 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
769 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
771 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
772 src_master->free(master_ptr);
775 this->
template log<log::Logger::copy_completed>(
776 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
777 if (
this != src_exec.
get()) {
778 src_exec->template log<log::Logger::copy_completed>(
779 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
794 template <
typename T>
797 this->
copy_from(
this, num_elems, src_ptr, dest_ptr);
809 template <
typename T>
813 this->
get_master()->copy_from(
this, 1, ptr, &out);
821 virtual std::shared_ptr<Executor>
get_master() noexcept = 0;
839 void add_logger(std::shared_ptr<const log::Logger> logger)
override
841 this->propagating_logger_refcount_.fetch_add(
842 logger->needs_propagation() ? 1 : 0);
843 this->EnableLogging<Executor>::add_logger(logger);
854 this->propagating_logger_refcount_.fetch_sub(
856 this->EnableLogging<Executor>::remove_logger(logger);
859 using EnableLogging<Executor>::remove_logger;
870 log_propagation_mode_ =
mode;
882 return this->propagating_logger_refcount_.load() > 0 &&
895 return this->verify_memory_from(other.get());
917 std::string device_type;
932 int num_computing_units = -1;
945 int num_pu_per_cu = -1;
955 std::vector<int> subgroup_sizes{};
965 int max_subgroup_size = -1;
977 std::vector<int> max_workitem_sizes{};
988 int max_workgroup_size;
1005 std::string pci_bus_id = std::string(13,
'x');
1017 std::vector<int> closest_pu_ids{};
1025 const exec_info& get_exec_info()
const {
return this->exec_info_; }
1036 virtual void* raw_alloc(
size_type size)
const = 0;
1045 virtual void raw_free(
void* ptr)
const noexcept = 0;
1057 virtual void raw_copy_from(
const Executor* src_exec,
size_type n_bytes,
1058 const void* src_ptr,
void* dest_ptr)
const = 0;
// Declares one pure virtual raw_copy_to overload per executor type; together
// with raw_copy_from (declared above) this forms the double-dispatch pair
// used to route raw memory copies to the correct backend implementation.
1069 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1070 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1071 const void* src_ptr, void* dest_ptr) const = 0
1073 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1075 #undef GKO_ENABLE_RAW_COPY_TO
1084 virtual bool verify_memory_from(
const Executor* src_exec)
const = 0;
// Declares one pure virtual verify_memory_to overload per executor type;
// used by verify_memory_from (above) for double dispatch when checking
// whether two executors share accessible memory.
1095 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1096 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1098 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
// ReferenceExecutor is not in the GKO_ENABLE_FOR_ALL_EXECUTORS list, so its
// overload is declared explicitly.
1100 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1102 #undef GKO_ENABLE_VERIFY_MEMORY_TO
1110 virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
1117 exec_info& get_exec_info() {
return this->exec_info_; }
1119 exec_info exec_info_;
1123 std::atomic<int> propagating_logger_refcount_{};
1140 template <
typename ClosureReference,
typename ClosureOmp,
1141 typename ClosureCuda,
typename ClosureHip,
typename ClosureDpcpp>
1142 class LambdaOperation :
public Operation {
1144 LambdaOperation(std::string name,
const ClosureReference& op_ref,
1145 const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1146 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1147 : name_(std::move(name)),
1165 LambdaOperation(
const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1166 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1167 : LambdaOperation(
"unnamed", op_omp, op_omp, op_cuda, op_hip,
1171 void run(std::shared_ptr<const OmpExecutor>)
const override
1176 void run(std::shared_ptr<const ReferenceExecutor>)
const override
1181 void run(std::shared_ptr<const CudaExecutor>)
const override
1186 void run(std::shared_ptr<const HipExecutor>)
const override
1191 void run(std::shared_ptr<const DpcppExecutor>)
const override
1196 const char* get_name() const noexcept
override {
return name_.c_str(); }
1200 ClosureReference op_ref_;
1202 ClosureCuda op_cuda_;
1204 ClosureDpcpp op_dpcpp_;
1217 template <
typename T>
1244 std::shared_ptr<const Executor> exec_;
1248 template <
typename T>
1251 using pointer = T[];
1265 std::shared_ptr<const Executor> exec_;
1272 template <
typename ConcreteExecutor>
1273 class ExecutorBase :
public Executor {
1276 friend class ::gko::OmpExecutor;
1277 friend class ::gko::HipExecutor;
1278 friend class ::gko::DpcppExecutor;
1279 friend class ::gko::CudaExecutor;
1280 friend class ::gko::ReferenceExecutor;
1283 void run(
const Operation& op)
const override
1285 this->
template log<log::Logger::operation_launched>(
this, &op);
1286 auto scope_guard = get_scoped_device_id_guard();
1287 op.run(
self()->shared_from_this());
1288 this->
template log<log::Logger::operation_completed>(
this, &op);
1292 void raw_copy_from(
const Executor* src_exec,
size_type n_bytes,
1293 const void* src_ptr,
void* dest_ptr)
const override
1295 src_exec->raw_copy_to(
self(), n_bytes, src_ptr, dest_ptr);
1298 virtual bool verify_memory_from(
const Executor* src_exec)
const override
1300 return src_exec->verify_memory_to(
self());
1304 ConcreteExecutor*
self() noexcept
1306 return static_cast<ConcreteExecutor*>(
this);
1309 const ConcreteExecutor*
self()
const noexcept
1311 return static_cast<const ConcreteExecutor*>(
this);
1315 #undef GKO_DECLARE_EXECUTOR_FRIEND
1325 class EnableDeviceReset {
1333 "device_reset is no longer supported, call "
1334 "cudaDeviceReset/hipDeviceReset manually")
1335 void set_device_reset(
bool device_reset) {}
1343 "device_reset is no longer supported, call "
1344 "cudaDeviceReset/hipDeviceReset manually")
1345 bool get_device_reset() {
return false; }
1353 EnableDeviceReset() {}
1356 "device_reset is no longer supported, call "
1357 "cudaDeviceReset/hipDeviceReset manually")
1358 EnableDeviceReset(
bool device_reset) {}
// Declares (in a concrete executor) the raw_copy_to override matching the
// pure virtual introduced by GKO_ENABLE_RAW_COPY_TO above — the receiving
// side of the raw-copy double dispatch.
1365 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1366 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1367 const void* src_ptr, void* dest_ptr) const override
// Defines a verify_memory_to override whose answer is the fixed constant
// `bool_`; the static_assert(true, ...) consumes the semicolon at the
// expansion site. NOTE(review): the macro's body lines between the signature
// and the static_assert are elided in this view; code reproduced unchanged.
1370 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1371 virtual bool verify_memory_to(const dest_* other) const override \
1375 static_assert(true, \
1376 "This assert is used to counter the false positive extra " \
1377 "semi-colon warnings")
1388 public std::enable_shared_from_this<OmpExecutor> {
1389 friend class detail::ExecutorBase<OmpExecutor>;
1398 std::shared_ptr<CpuAllocatorBase> alloc =
1399 std::make_shared<CpuAllocator>())
1401 return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(alloc)));
1404 std::shared_ptr<Executor> get_master() noexcept override;
1406 std::shared_ptr<const
Executor> get_master() const noexcept override;
1408 void synchronize() const override;
1410 int get_num_cores()
const
1412 return this->get_exec_info().num_computing_units;
1415 int get_num_threads_per_core()
const
1417 return this->get_exec_info().num_pu_per_cu;
1420 static int get_num_omp_threads();
1422 scoped_device_id_guard get_scoped_device_id_guard()
const override;
1424 std::string get_description()
const override;
1427 OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1428 : alloc_{std::move(alloc)}
1433 void populate_exec_info(
const machine_topology* mach_topo)
override;
1435 void* raw_alloc(
size_type size)
const override;
1437 void raw_free(
void* ptr)
const noexcept
override;
1439 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1441 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
true);
1443 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1445 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1447 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1449 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
1451 std::shared_ptr<CpuAllocatorBase> alloc_;
1457 using DefaultExecutor = OmpExecutor;
1473 static std::shared_ptr<ReferenceExecutor> create(
1474 std::shared_ptr<CpuAllocatorBase> alloc =
1475 std::make_shared<CpuAllocator>())
1477 return std::shared_ptr<ReferenceExecutor>(
1490 this->
template log<log::Logger::operation_launched>(
this, &op);
1491 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1492 this->shared_from_this()));
1493 this->
template log<log::Logger::operation_completed>(
this, &op);
1500 this->ReferenceExecutor::populate_exec_info(
1504 void populate_exec_info(
const machine_topology*)
override
1506 this->get_exec_info().device_id = -1;
1507 this->get_exec_info().num_computing_units = 1;
1508 this->get_exec_info().num_pu_per_cu = 1;
1511 bool verify_memory_from(
const Executor* src_exec)
const override
1513 return src_exec->verify_memory_to(
this);
1516 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
true);
1518 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1520 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1522 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1524 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1529 namespace reference {
1530 using DefaultExecutor = ReferenceExecutor;
1542 public std::enable_shared_from_this<CudaExecutor>,
1543 public detail::EnableDeviceReset {
1544 friend class detail::ExecutorBase<CudaExecutor>;
1561 "calling this CudaExecutor::create method is deprecated, because"
1562 "device_reset no longer has an effect"
1563 "call CudaExecutor::create("
1564 " int device_id, std::shared_ptr<Executor> master,"
1565 " std::shared_ptr<CudaAllocatorBase> alloc,"
1566 " CUstream_st* stream);"
1568 static std::shared_ptr<CudaExecutor> create(
1569 int device_id, std::shared_ptr<Executor> master,
bool device_reset,
1571 CUstream_st* stream =
nullptr);
1582 static std::shared_ptr<CudaExecutor> create(
1583 int device_id, std::shared_ptr<Executor> master,
1584 std::shared_ptr<CudaAllocatorBase> alloc =
1585 std::make_shared<CudaAllocator>(),
1586 CUstream_st* stream =
nullptr);
1588 std::shared_ptr<Executor> get_master() noexcept
override;
1590 std::shared_ptr<const Executor> get_master()
const noexcept
override;
1592 void synchronize()
const override;
1596 std::string get_description()
const override;
1603 return this->get_exec_info().device_id;
1609 static int get_num_devices();
1616 return this->get_exec_info().num_pu_per_cu;
1624 return this->get_exec_info().num_computing_units;
1632 return this->get_exec_info().num_computing_units *
1633 this->get_exec_info().num_pu_per_cu;
1641 return this->get_exec_info().max_subgroup_size;
1649 return this->get_exec_info().major;
1657 return this->get_exec_info().minor;
1665 return this->get_major_version() * 10 + this->get_minor_version();
1673 GKO_DEPRECATED(
"use get_blas_handle() instead")
1674 cublasContext* get_cublas_handle()
const {
return get_blas_handle(); }
1686 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1687 cusparseContext* get_cusparse_handle()
const
1689 return get_sparselib_handle();
1697 return cusparse_handle_.get();
1707 return this->get_exec_info().closest_pu_ids;
1726 void set_gpu_property();
1728 void init_handles();
1730 CudaExecutor(
int device_id, std::shared_ptr<Executor> master,
1731 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1732 : master_(master), alloc_{std::move(alloc)}, stream_{stream}
1734 this->get_exec_info().device_id = device_id;
1735 this->get_exec_info().num_computing_units = 0;
1736 this->get_exec_info().num_pu_per_cu = 0;
1737 this->CudaExecutor::populate_exec_info(
1739 this->set_gpu_property();
1740 this->init_handles();
1743 void* raw_alloc(
size_type size)
const override;
1745 void raw_free(
void* ptr)
const noexcept
override;
1747 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1749 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1751 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1753 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1755 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1757 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1759 void populate_exec_info(
const machine_topology* mach_topo)
override;
1762 std::shared_ptr<Executor> master_;
1764 template <
typename T>
1765 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1766 handle_manager<cublasContext> cublas_handle_;
1767 handle_manager<cusparseContext> cusparse_handle_;
1768 std::shared_ptr<CudaAllocatorBase> alloc_;
1769 CUstream_st* stream_;
1775 using DefaultExecutor = CudaExecutor;
1787 public std::enable_shared_from_this<HipExecutor>,
1788 public detail::EnableDeviceReset {
1806 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1807 "alloc_mode was replaced by the Allocator type "
1809 static std::shared_ptr<HipExecutor>
create(
1810 int device_id, std::shared_ptr<Executor> master,
bool device_reset,
1812 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1814 static std::shared_ptr<HipExecutor>
create(
1815 int device_id, std::shared_ptr<Executor> master,
1816 std::shared_ptr<HipAllocatorBase>
alloc =
1817 std::make_shared<HipAllocator>(),
1818 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1820 std::shared_ptr<Executor>
get_master() noexcept
override;
1822 std::shared_ptr<const Executor>
get_master()
const noexcept
override;
1835 return this->get_exec_info().device_id;
1848 return this->get_exec_info().num_pu_per_cu;
1856 return this->get_exec_info().num_computing_units;
1864 return this->get_exec_info().major;
1872 return this->get_exec_info().minor;
1880 return this->get_exec_info().num_computing_units *
1881 this->get_exec_info().num_pu_per_cu;
1889 return this->get_exec_info().max_subgroup_size;
1897 GKO_DEPRECATED(
"use get_blas_handle() instead")
1910 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1921 return hipsparse_handle_.get();
1938 return this->get_exec_info().closest_pu_ids;
1941 GKO_HIP_STREAM_STRUCT* get_stream()
const {
return stream_; }
1944 void set_gpu_property();
1946 void init_handles();
1948 HipExecutor(
int device_id, std::shared_ptr<Executor> master,
1949 std::shared_ptr<HipAllocatorBase>
alloc,
1950 GKO_HIP_STREAM_STRUCT* stream)
1951 : master_{std::move(master)}, alloc_{std::move(
alloc)}, stream_{stream}
1953 this->get_exec_info().device_id = device_id;
1954 this->get_exec_info().num_computing_units = 0;
1955 this->get_exec_info().num_pu_per_cu = 0;
1957 this->set_gpu_property();
1958 this->init_handles();
1961 void* raw_alloc(
size_type size)
const override;
1963 void raw_free(
void* ptr)
const noexcept
override;
1965 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1967 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1969 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1971 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1973 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1975 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1977 void populate_exec_info(
const machine_topology* mach_topo)
override;
1980 std::shared_ptr<Executor> master_;
1982 template <
typename T>
1983 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1984 handle_manager<hipblasContext> hipblas_handle_;
1985 handle_manager<hipsparseContext> hipsparse_handle_;
1986 std::shared_ptr<HipAllocatorBase> alloc_;
1987 GKO_HIP_STREAM_STRUCT* stream_;
1993 using DefaultExecutor = HipExecutor;
2005 public std::enable_shared_from_this<DpcppExecutor> {
2020 static std::shared_ptr<DpcppExecutor>
create(
2021 int device_id, std::shared_ptr<Executor> master,
2022 std::string device_type =
"all",
2023 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2025 std::shared_ptr<Executor>
get_master() noexcept
override;
2027 std::shared_ptr<const Executor>
get_master()
const noexcept
override;
2042 return this->get_exec_info().device_id;
2045 sycl::queue* get_queue()
const {
return queue_.get(); }
2063 return this->get_exec_info().subgroup_sizes;
2073 return this->get_exec_info().num_computing_units;
2081 return this->get_exec_info().num_computing_units *
2082 this->get_exec_info().num_pu_per_cu;
2092 return this->get_exec_info().max_workitem_sizes;
2102 return this->get_exec_info().max_workgroup_size;
2112 return this->get_exec_info().max_subgroup_size;
2122 return this->get_exec_info().device_type;
2126 void set_device_property(
2127 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2130 int device_id, std::shared_ptr<Executor> master,
2131 std::string device_type =
"all",
2132 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2135 std::for_each(device_type.begin(), device_type.end(),
2136 [](
char& c) { c = std::tolower(c); });
2137 this->get_exec_info().device_type = std::string(device_type);
2138 this->get_exec_info().device_id = device_id;
2139 this->set_device_property(property);
2142 void populate_exec_info(
const machine_topology* mach_topo)
override;
2144 void* raw_alloc(
size_type size)
const override;
2146 void raw_free(
void* ptr)
const noexcept
override;
2148 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2150 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
2152 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
2154 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
2156 bool verify_memory_to(
const OmpExecutor* dest_exec)
const override;
2158 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
2161 std::shared_ptr<Executor> master_;
2163 template <
typename T>
2164 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2165 queue_manager<sycl::queue> queue_;
2171 using DefaultExecutor = DpcppExecutor;
2176 #undef GKO_OVERRIDE_RAW_COPY_TO
2182 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_