5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ 
    6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ 
   17 #include <type_traits> 
   21 #include <ginkgo/core/base/device.hpp> 
   22 #include <ginkgo/core/base/fwd_decls.hpp> 
   23 #include <ginkgo/core/base/machine_topology.hpp> 
   24 #include <ginkgo/core/base/memory.hpp> 
   25 #include <ginkgo/core/base/scoped_device_id_guard.hpp> 
   26 #include <ginkgo/core/base/types.hpp> 
   27 #include <ginkgo/core/log/logger.hpp> 
   28 #include <ginkgo/core/synthesizer/containers.hpp> 
// Default allocation modes used when constructing CUDA / HIP executors
// without an explicit choice: plain device-side memory (not unified/managed).
// NOTE(review): the surrounding platform #if branches are not fully visible
// in this chunk — placement relative to the HCC guard below should be
// confirmed against the full header.
   69 constexpr 
allocation_mode default_cuda_alloc_mode = allocation_mode::device;
 
   71 constexpr 
allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
   77     allocation_mode::unified_global;
 
   79 #if (GINKGO_HIP_PLATFORM_HCC == 1) 
   82 constexpr 
allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
   88     allocation_mode::unified_global;
 
  102 enum class dpcpp_queue_property {
 
// Bitwise-OR for dpcpp_queue_property flags: enum class values do not
// convert implicitly, so the flags are combined through their underlying
// int representation and cast back.
  114 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
 
  115                                                          dpcpp_queue_property b)
 
  117     return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
 
  118                                              static_cast<int>(b));
 
  125 #define GKO_FORWARD_DECLARE(_type, ...) class _type 
  127 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
 
  129 #undef GKO_FORWARD_DECLARE 
  132 class ReferenceExecutor;
 
// Declares one virtual Operation::run overload per executor type; expanded
// once for every executor via GKO_ENABLE_FOR_ALL_EXECUTORS, then undefined
// so the helper macro does not leak out of this header.
  261 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \ 
  262     virtual void run(std::shared_ptr<const _type>) const 
  264     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
 
  266 #undef GKO_DECLARE_RUN_OVERLOAD 
  269     virtual void run(std::shared_ptr<const ReferenceExecutor> executor) 
const;
 
  276     virtual const char* 
get_name() 
const noexcept;
 
// RegisteredOperation: Operation implementation produced by the
// GKO_REGISTER_OPERATION machinery. It stores a display name (returned by
// get_name) and a closure; each run(...) override below is expected to
// forward the matching executor to the stored closure.
// NOTE(review): the run(...) bodies and member declarations are not visible
// in this chunk — forwarding behavior inferred from make_register_operation
// and the macro below; confirm against the full header.
  292 template <
typename Closure>
 
  293 class RegisteredOperation : 
public Operation {
 
  301     RegisteredOperation(
const char* name, Closure op)
 
  302         : name_(name), op_(std::move(op))
 
  305     const char* get_name() const noexcept
 override { 
return name_; }
 
  307     void run(std::shared_ptr<const ReferenceExecutor> exec)
 const override 
  312     void run(std::shared_ptr<const OmpExecutor> exec)
 const override 
  317     void run(std::shared_ptr<const CudaExecutor> exec)
 const override 
  322     void run(std::shared_ptr<const HipExecutor> exec)
 const override 
  327     void run(std::shared_ptr<const DpcppExecutor> exec)
 const override 
// Factory helper: deduces Closure and wraps (name, op) into a
// RegisteredOperation by moving the closure in.
  338 template <
typename Closure>
 
  339 RegisteredOperation<Closure> make_register_operation(
const char* name,
 
  342     return RegisteredOperation<Closure>{name, std::move(op)};
 
// GKO_REGISTER_OPERATION(_name, _kernel): generates a make_<_name>(...)
// factory returning a RegisteredOperation whose closure dispatches on the
// concrete executor type (reference, omp, cuda, hip, dpcpp) and forwards
// the captured call arguments to the matching
// ::gko::kernels::<backend>::_kernel function. The trailing static_assert
// exists only to swallow the semicolon at the macro's use site.
// NOTE(review): several interior macro lines are missing from this chunk;
// the dispatch appears to end in GKO_NOT_IMPLEMENTED when no executor type
// matches — confirm against the full header.
  420 #define GKO_REGISTER_OPERATION(_name, _kernel)                                 \ 
  421     template <typename... Args>                                                \ 
  422     auto make_##_name(Args&&... args)                                          \ 
  424         return ::gko::detail::make_register_operation(                         \ 
  425             #_kernel, [&args...](auto exec) {                                  \ 
  426                 using exec_type = decltype(exec);                              \ 
  429                         std::shared_ptr<const ::gko::ReferenceExecutor>>::     \ 
  431                     ::gko::kernels::reference::_kernel(                        \ 
  432                         std::dynamic_pointer_cast<                             \ 
  433                             const ::gko::ReferenceExecutor>(exec),             \ 
  434                         std::forward<Args>(args)...);                          \ 
  435                 } else if (std::is_same<                                       \ 
  437                                std::shared_ptr<const ::gko::OmpExecutor>>::    \ 
  439                     ::gko::kernels::omp::_kernel(                              \ 
  440                         std::dynamic_pointer_cast<const ::gko::OmpExecutor>(   \ 
  442                         std::forward<Args>(args)...);                          \ 
  443                 } else if (std::is_same<                                       \ 
  445                                std::shared_ptr<const ::gko::CudaExecutor>>::   \ 
  447                     ::gko::kernels::cuda::_kernel(                             \ 
  448                         std::dynamic_pointer_cast<const ::gko::CudaExecutor>(  \ 
  450                         std::forward<Args>(args)...);                          \ 
  451                 } else if (std::is_same<                                       \ 
  453                                std::shared_ptr<const ::gko::HipExecutor>>::    \ 
  455                     ::gko::kernels::hip::_kernel(                              \ 
  456                         std::dynamic_pointer_cast<const ::gko::HipExecutor>(   \ 
  458                         std::forward<Args>(args)...);                          \ 
  459                 } else if (std::is_same<                                       \ 
  461                                std::shared_ptr<const ::gko::DpcppExecutor>>::  \ 
  463                     ::gko::kernels::dpcpp::_kernel(                            \ 
  464                         std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \ 
  466                         std::forward<Args>(args)...);                          \ 
  468                     GKO_NOT_IMPLEMENTED;                                       \ 
  472     static_assert(true,                                                        \ 
  473                   "This assert is used to counter the false positive extra "   \ 
  474                   "semi-colon warnings") 
// GKO_REGISTER_HOST_OPERATION(_name, _kernel): like GKO_REGISTER_OPERATION,
// but for host-only kernels — the generated closure ignores the executor
// argument entirely and invokes _kernel directly with the forwarded
// arguments. The static_assert swallows the use-site semicolon.
  514 #define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \ 
  515     template <typename... Args>                                              \ 
  516     auto make_##_name(Args&&... args)                                        \ 
  518         return ::gko::detail::make_register_operation(                       \ 
  520             [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \ 
  522     static_assert(true,                                                      \ 
  523                   "This assert is used to counter the false positive extra " \ 
  524                   "semi-colon warnings") 
  527 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type 
  617     template <
typename T>
 
  618     friend class detail::ExecutorBase;
 
  620     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
 
  653     template <
typename ClosureOmp, 
typename ClosureCuda, 
typename ClosureHip,
 
  654               typename ClosureDpcpp>
 
  655     void run(
const ClosureOmp& op_omp, 
const ClosureCuda& op_cuda,
 
  656              const ClosureHip& op_hip, 
const ClosureDpcpp& op_dpcpp)
 const 
  658         LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
 
  659             op_omp, op_cuda, op_hip, op_dpcpp);
 
    // alloc<T>(num_elems): logs allocation_started, obtains num_elems *
    // sizeof(T) bytes from the backend via raw_alloc, then logs
    // allocation_completed with the resulting address. Returns the typed
    // pointer (return statement not visible in this chunk).
  674     template <
typename T>
 
  677         this->
template log<log::Logger::allocation_started>(
 
  678             this, num_elems * 
sizeof(T));
 
  679         T* allocated = static_cast<T*>(this->raw_alloc(num_elems * 
sizeof(T)));
 
  680         this->
template log<log::Logger::allocation_completed>(
 
  681             this, num_elems * 
sizeof(T), reinterpret_cast<uintptr>(allocated));
 
    // free(ptr): logs free_started, releases the memory (the raw_free call
    // — original line 696 — is not visible in this chunk; presumably
    // this->raw_free(ptr)), then logs free_completed. noexcept because
    // deallocation must not throw.
  692     void free(
void* ptr) 
const noexcept
 
  694         this->
template log<log::Logger::free_started>(
 
  695             this, reinterpret_cast<uintptr>(ptr));
 
  697         this->
template log<log::Logger::free_completed>(
 
  698             this, reinterpret_cast<uintptr>(ptr));
 
    // copy_from<T>(src_exec, num_elems, src_ptr, dest_ptr): copies num_elems
    // elements of T from src_exec's memory space into this executor's
    // memory. Logs copy_started/copy_completed on this executor and, when
    // src_exec differs, on the source executor as well. First attempts a
    // direct raw_copy_from between the two executors; the fallback path
    // stages the data through the source executor's master (host): allocate
    // a host buffer, copy device->host, host->dest, then free the buffer.
    // NOTE(review): the branch structure between the direct path and the
    // fallback (original lines ~727-734) is not visible here — it appears
    // to be exception-driven; confirm against the full header.
    // NOTE(review): master_ptr is managed manually; an exception thrown
    // between alloc and free would leak it — RAII would be safer, but the
    // fix cannot be made from this fragmentary view.
  713     template <
typename T>
 
  715                    const T* src_ptr, T* dest_ptr)
 const 
  717         const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
 
  718         const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
 
  719         this->
template log<log::Logger::copy_started>(
 
  720             src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  721         if (
this != src_exec.
get()) {
 
  722             src_exec->template log<log::Logger::copy_started>(
 
  723                 src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  726             this->raw_copy_from(src_exec.
get(), num_elems * 
sizeof(T), src_ptr,
 
  729 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG) 
  732             std::clog << 
"Not direct copy. Try to copy data from the masters." 
  735             auto src_master = src_exec->get_master().
get();
 
  736             if (num_elems > 0 && src_master != src_exec.
get()) {
 
  737                 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
 
  738                 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
 
  740                 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
 
  741                 src_master->free(master_ptr);
 
  744         this->
template log<log::Logger::copy_completed>(
 
  745             src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  746         if (
this != src_exec.
get()) {
 
  747             src_exec->template log<log::Logger::copy_completed>(
 
  748                 src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  763     template <
typename T>
 
  766         this->
copy_from(
this, num_elems, src_ptr, dest_ptr);
 
  778     template <
typename T>
 
  782         this->
get_master()->copy_from(
this, 1, ptr, &out);
 
  790     virtual std::shared_ptr<Executor> 
get_master() noexcept = 0;
 
    // add_logger: counts loggers that request propagation in the atomic
    // propagating_logger_refcount_ (incremented only when
    // needs_propagation() is true), then delegates registration to
    // EnableLogging<Executor>::add_logger.
  808     void add_logger(std::shared_ptr<const log::Logger> logger)
 override 
  810         this->propagating_logger_refcount_.fetch_add(
 
  811             logger->needs_propagation() ? 1 : 0);
 
  812         this->EnableLogging<Executor>::add_logger(logger);
 
    // remove_logger (signature not visible in this chunk): mirrors
    // add_logger by decrementing the propagation refcount before delegating
    // to the base-class removal; the using-declaration re-exposes the other
    // remove_logger overloads hidden by the override.
  823         this->propagating_logger_refcount_.fetch_sub(
 
  825         this->EnableLogging<Executor>::remove_logger(logger);
 
  828     using EnableLogging<Executor>::remove_logger;
 
  839         log_propagation_mode_ = 
mode;
 
  851         return this->propagating_logger_refcount_.load() > 0 &&
 
  864         return this->verify_memory_from(other.get());
 
  883         std::string device_type;
 
  898         int num_computing_units = -1;
 
  911         int num_pu_per_cu = -1;
 
  921         std::vector<int> subgroup_sizes{};
 
  931         int max_subgroup_size = -1;
 
  943         std::vector<int> max_workitem_sizes{};
 
  954         int max_workgroup_size;
 
  971         std::string pci_bus_id = std::string(13, 
'x');
 
  983         std::vector<int> closest_pu_ids{};
 
  991     const exec_info& get_exec_info()
 const { 
return this->exec_info_; }
 
 1002     virtual void* raw_alloc(
size_type size) 
const = 0;
 
 1011     virtual void raw_free(
void* ptr) 
const noexcept = 0;
 
 1023     virtual void raw_copy_from(
const Executor* src_exec, 
size_type n_bytes,
 
 1024                                const void* src_ptr, 
void* dest_ptr) 
const = 0;
 
 1035 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...)                              \ 
 1036     virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \ 
 1037                              const void* src_ptr, void* dest_ptr) const = 0 
 1039     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
 
 1041 #undef GKO_ENABLE_RAW_COPY_TO 
 1050     virtual bool verify_memory_from(
const Executor* src_exec) 
const = 0;
 
 1061 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \ 
 1062     virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0 
 1064     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
 
 1066     GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
 
 1068 #undef GKO_ENABLE_VERIFY_MEMORY_TO 
 1076     virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
 
 1083     exec_info& get_exec_info() { 
return this->exec_info_; }
 
 1085     exec_info exec_info_;
 
 1089     std::atomic<int> propagating_logger_refcount_{};
 
 1106     template <
typename ClosureOmp, 
typename ClosureCuda, 
typename ClosureHip,
 
 1107               typename ClosureDpcpp>
 
 1108     class LambdaOperation : 
public Operation {
 
 1120         LambdaOperation(
const ClosureOmp& op_omp, 
const ClosureCuda& op_cuda,
 
 1121                         const ClosureHip& op_hip, 
const ClosureDpcpp& op_dpcpp)
 
 1128         void run(std::shared_ptr<const OmpExecutor>)
 const override 
 1133         void run(std::shared_ptr<const ReferenceExecutor>)
 const override 
 1138         void run(std::shared_ptr<const CudaExecutor>)
 const override 
 1143         void run(std::shared_ptr<const HipExecutor>)
 const override 
 1148         void run(std::shared_ptr<const DpcppExecutor>)
 const override 
 1155         ClosureCuda op_cuda_;
 
 1157         ClosureDpcpp op_dpcpp_;
 
 1170 template <
typename T>
 
 1197     std::shared_ptr<const Executor> exec_;
 
 1201 template <
typename T>
 
 1204     using pointer = T[];
 
 1218     std::shared_ptr<const Executor> exec_;
 
 1225 template <
typename ConcreteExecutor>
 
 1226 class ExecutorBase : 
public Executor {
 
 1227     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
 
    // run(op): logs operation_launched, activates this executor's device
    // context via a scoped device-id guard, double-dispatches the operation
    // to the overload matching the concrete executor type (CRTP self()),
    // then logs operation_completed.
 1233     void run(
const Operation& op)
 const override 
 1235         this->
template log<log::Logger::operation_launched>(
this, &op);
 
 1236         auto scope_guard = get_scoped_device_id_guard();
 
 1237         op.run(
self()->shared_from_this());
 
 1238         this->
template log<log::Logger::operation_completed>(
this, &op);
 
    // raw_copy_from: second leg of the copy double-dispatch — asks the
    // source executor to copy *to* the concrete destination type.
 1242     void raw_copy_from(
const Executor* src_exec, 
size_type n_bytes,
 
 1243                        const void* src_ptr, 
void* dest_ptr)
 const override 
 1245         src_exec->raw_copy_to(
self(), n_bytes, src_ptr, dest_ptr);
 
    // verify_memory_from: same double-dispatch pattern for memory
    // compatibility checks.
 1248     virtual bool verify_memory_from(
const Executor* src_exec)
 const override 
 1250         return src_exec->verify_memory_to(
self());
 
    // self() (mutable + const): CRTP downcast to the concrete executor
    // type; safe because ConcreteExecutor derives from this base.
 1254     ConcreteExecutor* 
self() noexcept
 
 1256         return static_cast<ConcreteExecutor*>(
this);
 
 1259     const ConcreteExecutor* 
self() 
const noexcept
 
 1261         return static_cast<const ConcreteExecutor*>(
this);
 
 1265 #undef GKO_DECLARE_EXECUTOR_FRIEND 
 1275 class EnableDeviceReset {
 
 1283         "device_reset is no longer supported, call " 
 1284         "cudaDeviceReset/hipDeviceReset manually")
 
 1285     void set_device_reset(
bool device_reset) {}
 
 1293         "device_reset is no longer supported, call " 
 1294         "cudaDeviceReset/hipDeviceReset manually")
 
 1295     bool get_device_reset() { 
return false; }
 
 1303     EnableDeviceReset() {}
 
 1306         "device_reset is no longer supported, call " 
 1307         "cudaDeviceReset/hipDeviceReset manually")
 
 1308     EnableDeviceReset(
bool device_reset) {}
 
 1315 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...)                    \ 
 1316     void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \ 
 1317                      const void* src_ptr, void* dest_ptr) const override 
 1320 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                     \ 
 1321     virtual bool verify_memory_to(const dest_* other) const override         \ 
 1325     static_assert(true,                                                      \ 
 1326                   "This assert is used to counter the false positive extra " \ 
 1327                   "semi-colon warnings") 
 1338                     public std::enable_shared_from_this<OmpExecutor> {
 
 1339     friend class detail::ExecutorBase<OmpExecutor>;
 
 1346         std::shared_ptr<CpuAllocatorBase> alloc =
 
 1347             std::make_shared<CpuAllocator>())
 
 1349         return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(alloc)));
 
 1352     std::shared_ptr<Executor> get_master() noexcept override;
 
 1354     std::shared_ptr<const 
Executor> get_master() const noexcept override;
 
 1356     void synchronize() const override;
 
 1358     int get_num_cores()
 const 
 1360         return this->get_exec_info().num_computing_units;
 
 1363     int get_num_threads_per_core()
 const 
 1365         return this->get_exec_info().num_pu_per_cu;
 
 1368     static int get_num_omp_threads();
 
 1370     scoped_device_id_guard get_scoped_device_id_guard() 
const override;
 
 1373     OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
 
 1374         : alloc_{std::move(alloc)}
 
 1379     void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 1381     void* raw_alloc(
size_type size) 
const override;
 
 1383     void raw_free(
void* ptr) 
const noexcept 
override;
 
 1385     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1387     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
true);
 
 1389     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1391     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 1393     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 1395     bool verify_memory_to(
const DpcppExecutor* dest_exec) 
const override;
 
 1397     std::shared_ptr<CpuAllocatorBase> alloc_;
 
 1403 using DefaultExecutor = OmpExecutor;
 
 1417     static std::shared_ptr<ReferenceExecutor> create(
 
 1418         std::shared_ptr<CpuAllocatorBase> alloc =
 
 1419             std::make_shared<CpuAllocator>())
 
 1421         return std::shared_ptr<ReferenceExecutor>(
 
        // Body of ReferenceExecutor's run override (signature not visible
        // in this chunk): logs operation_launched, dispatches the operation
        // with a static_pointer_cast of shared_from_this() — safe because
        // *this is known to be a ReferenceExecutor — then logs
        // operation_completed.
 1432         this->
template log<log::Logger::operation_launched>(
this, &op);
 
 1433         op.run(std::static_pointer_cast<const ReferenceExecutor>(
 
 1434             this->shared_from_this()));
 
 1435         this->
template log<log::Logger::operation_completed>(
this, &op);
 
 1442         this->ReferenceExecutor::populate_exec_info(
 
    // populate_exec_info: ignores the machine topology and pins fixed
    // values — device_id -1 (no device), one computing unit, one PU per
    // unit — reflecting the single-threaded host reference backend.
 1446     void populate_exec_info(
const machine_topology*)
 override 
 1448         this->get_exec_info().device_id = -1;
 
 1449         this->get_exec_info().num_computing_units = 1;
 
 1450         this->get_exec_info().num_pu_per_cu = 1;
 
    // verify_memory_from: double-dispatch — asks the source executor
    // whether it shares a memory space with this ReferenceExecutor.
 1453     bool verify_memory_from(
const Executor* src_exec)
 const override 
 1455         return src_exec->verify_memory_to(
this);
 
 1458     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
true);
 
 1460     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1462     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1464     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 1466     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 1471 namespace reference {
 
 1472 using DefaultExecutor = ReferenceExecutor;
 
 1484                      public std::enable_shared_from_this<CudaExecutor>,
 
 1485                      public detail::EnableDeviceReset {
 
 1486     friend class detail::ExecutorBase<CudaExecutor>;
 
 1501         "calling this CudaExecutor::create method is deprecated, because" 
 1502         "device_reset no longer has an effect" 
 1503         "call CudaExecutor::create(" 
 1504         "  int device_id, std::shared_ptr<Executor> master," 
 1505         "  std::shared_ptr<CudaAllocatorBase> alloc," 
 1506         "  CUstream_st* stream);" 
 1508     static std::shared_ptr<CudaExecutor> create(
 
 1509         int device_id, std::shared_ptr<Executor> master, 
bool device_reset,
 
 1511         CUstream_st* stream = 
nullptr);
 
 1522     static std::shared_ptr<CudaExecutor> create(
 
 1523         int device_id, std::shared_ptr<Executor> master,
 
 1524         std::shared_ptr<CudaAllocatorBase> alloc =
 
 1525             std::make_shared<CudaAllocator>(),
 
 1526         CUstream_st* stream = 
nullptr);
 
 1528     std::shared_ptr<Executor> get_master() noexcept 
override;
 
 1530     std::shared_ptr<const Executor> get_master() 
const noexcept 
override;
 
 1532     void synchronize() 
const override;
 
 1541         return this->get_exec_info().device_id;
 
 1547     static int get_num_devices();
 
 1554         return this->get_exec_info().num_pu_per_cu;
 
 1562         return this->get_exec_info().num_computing_units;
 
 1570         return this->get_exec_info().num_computing_units *
 
 1571                this->get_exec_info().num_pu_per_cu;
 
 1579         return this->get_exec_info().max_subgroup_size;
 
 1587         return this->get_exec_info().major;
 
 1595         return this->get_exec_info().minor;
 
 1612         return cusparse_handle_.get();
 
 1622         return this->get_exec_info().closest_pu_ids;
 
 1641     void set_gpu_property();
 
 1643     void init_handles();
 
 1645     CudaExecutor(
int device_id, std::shared_ptr<Executor> master,
 
 1646                  std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
 
 1647         : alloc_{std::move(alloc)}, master_(master), stream_{stream}
 
 1649         this->get_exec_info().device_id = device_id;
 
 1650         this->get_exec_info().num_computing_units = 0;
 
 1651         this->get_exec_info().num_pu_per_cu = 0;
 
 1652         this->CudaExecutor::populate_exec_info(
 
 1654         this->set_gpu_property();
 
 1655         this->init_handles();
 
 1658     void* raw_alloc(
size_type size) 
const override;
 
 1660     void raw_free(
void* ptr) 
const noexcept 
override;
 
 1662     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1664     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1666     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1668     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1670     bool verify_memory_to(
const HipExecutor* dest_exec) 
const override;
 
 1672     bool verify_memory_to(
const CudaExecutor* dest_exec) 
const override;
 
 1674     void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 1677     std::shared_ptr<Executor> master_;
 
 1679     template <
typename T>
 
 1680     using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 1681     handle_manager<cublasContext> cublas_handle_;
 
 1682     handle_manager<cusparseContext> cusparse_handle_;
 
 1683     std::shared_ptr<CudaAllocatorBase> alloc_;
 
 1684     CUstream_st* stream_;
 
 1690 using DefaultExecutor = CudaExecutor;
 
 1702                     public std::enable_shared_from_this<HipExecutor>,
 
 1703                     public detail::EnableDeviceReset {
 
 1719         "device_reset is deprecated entirely, call hipDeviceReset directly. " 
 1720         "alloc_mode was replaced by the Allocator type " 
 1722     static std::shared_ptr<HipExecutor> 
create(
 
 1723         int device_id, std::shared_ptr<Executor> master, 
bool device_reset,
 
 1725         GKO_HIP_STREAM_STRUCT* stream = 
nullptr);
 
 1727     static std::shared_ptr<HipExecutor> 
create(
 
 1728         int device_id, std::shared_ptr<Executor> master,
 
 1729         std::shared_ptr<HipAllocatorBase> 
alloc =
 
 1730             std::make_shared<HipAllocator>(),
 
 1731         GKO_HIP_STREAM_STRUCT* stream = 
nullptr);
 
 1733     std::shared_ptr<Executor> 
get_master() noexcept 
override;
 
 1735     std::shared_ptr<const Executor> 
get_master() 
const noexcept 
override;
 
 1746         return this->get_exec_info().device_id;
 
 1759         return this->get_exec_info().num_pu_per_cu;
 
 1767         return this->get_exec_info().num_computing_units;
 
 1775         return this->get_exec_info().major;
 
 1783         return this->get_exec_info().minor;
 
 1791         return this->get_exec_info().num_computing_units *
 
 1792                this->get_exec_info().num_pu_per_cu;
 
 1800         return this->get_exec_info().max_subgroup_size;
 
 1817         return hipsparse_handle_.get();
 
 1834         return this->get_exec_info().closest_pu_ids;
 
 1837     GKO_HIP_STREAM_STRUCT* get_stream()
 const { 
return stream_; }
 
 1840     void set_gpu_property();
 
 1842     void init_handles();
 
 1844     HipExecutor(
int device_id, std::shared_ptr<Executor> master,
 
 1845                 std::shared_ptr<HipAllocatorBase> 
alloc,
 
 1846                 GKO_HIP_STREAM_STRUCT* stream)
 
 1847         : master_{std::move(master)}, alloc_{std::move(
alloc)}, stream_{stream}
 
 1849         this->get_exec_info().device_id = device_id;
 
 1850         this->get_exec_info().num_computing_units = 0;
 
 1851         this->get_exec_info().num_pu_per_cu = 0;
 
 1853         this->set_gpu_property();
 
 1854         this->init_handles();
 
 1857     void* raw_alloc(
size_type size) 
const override;
 
 1859     void raw_free(
void* ptr) 
const noexcept 
override;
 
 1861     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1863     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1865     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1867     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1869     bool verify_memory_to(
const CudaExecutor* dest_exec) 
const override;
 
 1871     bool verify_memory_to(
const HipExecutor* dest_exec) 
const override;
 
 1873     void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 1876     std::shared_ptr<Executor> master_;
 
 1878     template <
typename T>
 
 1879     using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 1880     handle_manager<hipblasContext> hipblas_handle_;
 
 1881     handle_manager<hipsparseContext> hipsparse_handle_;
 
 1882     std::shared_ptr<HipAllocatorBase> alloc_;
 
 1883     GKO_HIP_STREAM_STRUCT* stream_;
 
 1889 using DefaultExecutor = HipExecutor;
 
 1901                       public std::enable_shared_from_this<DpcppExecutor> {
 
 1914     static std::shared_ptr<DpcppExecutor> 
create(
 
 1915         int device_id, std::shared_ptr<Executor> master,
 
 1916         std::string device_type = 
"all",
 
 1917         dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
 1919     std::shared_ptr<Executor> 
get_master() noexcept 
override;
 
 1921     std::shared_ptr<const Executor> 
get_master() 
const noexcept 
override;
 
 1934         return this->get_exec_info().device_id;
 
 1937     sycl::queue* get_queue()
 const { 
return queue_.get(); }
 
 1955         return this->get_exec_info().subgroup_sizes;
 
 1965         return this->get_exec_info().num_computing_units;
 
 1973         return this->get_exec_info().num_computing_units *
 
 1974                this->get_exec_info().num_pu_per_cu;
 
 1984         return this->get_exec_info().max_workitem_sizes;
 
 1994         return this->get_exec_info().max_workgroup_size;
 
 2004         return this->get_exec_info().max_subgroup_size;
 
 2014         return this->get_exec_info().device_type;
 
 2018     void set_device_property(
 
 2019         dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
 2022         int device_id, std::shared_ptr<Executor> master,
 
 2023         std::string device_type = 
"all",
 
 2024         dpcpp_queue_property property = dpcpp_queue_property::in_order)
 
 2027         std::for_each(device_type.begin(), device_type.end(),
 
 2028                       [](
char& c) { c = std::tolower(c); });
 
 2029         this->get_exec_info().device_type = std::string(device_type);
 
 2030         this->get_exec_info().device_id = device_id;
 
 2031         this->set_device_property(property);
 
 2034     void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 2036     void* raw_alloc(
size_type size) 
const override;
 
 2038     void raw_free(
void* ptr) 
const noexcept 
override;
 
 2040     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 2042     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 2044     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 2046     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 2048     bool verify_memory_to(
const OmpExecutor* dest_exec) 
const override;
 
 2050     bool verify_memory_to(
const DpcppExecutor* dest_exec) 
const override;
 
 2053     std::shared_ptr<Executor> master_;
 
 2055     template <
typename T>
 
 2056     using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 2057     queue_manager<sycl::queue> queue_;
 
 2063 using DefaultExecutor = DpcppExecutor;
 
 2068 #undef GKO_OVERRIDE_RAW_COPY_TO 
 2074 #endif  // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_