5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
17 #include <type_traits>
20 #include <ginkgo/core/base/device.hpp>
21 #include <ginkgo/core/base/fwd_decls.hpp>
22 #include <ginkgo/core/base/machine_topology.hpp>
23 #include <ginkgo/core/base/memory.hpp>
24 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
25 #include <ginkgo/core/base/types.hpp>
26 #include <ginkgo/core/log/logger.hpp>
27 #include <ginkgo/core/synthesizer/containers.hpp>
// Compile-time defaults for the GPU memory allocation strategy: plain
// device memory for both the CUDA and HIP backends.
// NOTE(review): the surrounding #if/#else branches are only partially
// visible in this fragment (see the unified_global lines just below).
68 constexpr
allocation_mode default_cuda_alloc_mode = allocation_mode::device;
70 constexpr
allocation_mode default_hip_alloc_mode = allocation_mode::device;
76 allocation_mode::unified_global;
78 #if (GINKGO_HIP_PLATFORM_HCC == 1)
81 constexpr
allocation_mode default_hip_alloc_mode = allocation_mode::device;
87 allocation_mode::unified_global;
101 enum class dpcpp_queue_property {
// Bitwise-or for the dpcpp_queue_property flag enum: combines two
// properties by or-ing their underlying int representations and casting
// back to the enum type.
// NOTE(review): the function's enclosing braces are not visible in this
// fragment.
113 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114 dpcpp_queue_property b)
116 return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117 static_cast<int>(b));
// Forward-declare one class per executor backend through the
// GKO_ENABLE_FOR_ALL_EXECUTORS X-macro; the helper macro is #undef'd
// immediately after use so it does not leak out of this header.
124 #define GKO_FORWARD_DECLARE(_type, ...) class _type
126 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
128 #undef GKO_FORWARD_DECLARE
131 class ReferenceExecutor;
// Generate one virtual run() overload taking std::shared_ptr<const X> for
// every executor type X, via the GKO_ENABLE_FOR_ALL_EXECUTORS X-macro;
// the helper macro is #undef'd right after expansion.
260 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261 virtual void run(std::shared_ptr<const _type>) const
263 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
265 #undef GKO_DECLARE_RUN_OVERLOAD
268 virtual void run(std::shared_ptr<const ReferenceExecutor> executor)
const;
275 virtual const char*
get_name()
const noexcept;
291 template <
typename Closure>
292 class RegisteredOperation :
public Operation {
300 RegisteredOperation(
const char* name, Closure op)
301 : name_(name), op_(std::move(op))
304 const char* get_name() const noexcept
override {
return name_; }
306 void run(std::shared_ptr<const ReferenceExecutor> exec)
const override
311 void run(std::shared_ptr<const OmpExecutor> exec)
const override
316 void run(std::shared_ptr<const CudaExecutor> exec)
const override
321 void run(std::shared_ptr<const HipExecutor> exec)
const override
326 void run(std::shared_ptr<const DpcppExecutor> exec)
const override
337 template <
typename Closure>
338 RegisteredOperation<Closure> make_register_operation(
const char* name,
341 return RegisteredOperation<Closure>{name, std::move(op)};
// Registers a device kernel `_kernel` under a generated factory function
// make_<_name>(args...). The factory wraps a lambda that dispatches on
// the concrete executor type (the visible std::is_same checks against
// shared_ptr<const ::gko::XxxExecutor>) and forwards the arguments to the
// matching backend kernel ::gko::kernels::<backend>::_kernel, falling
// back to GKO_NOT_IMPLEMENTED for unknown executor types. The trailing
// static_assert(true, ...) swallows the semicolon callers write after the
// macro so no "extra ;" warning is produced.
// NOTE(review): several continuation lines of this macro (branch headers,
// closing braces) are missing from this fragment — do not edit the macro
// body from this view alone.
419 #define GKO_REGISTER_OPERATION(_name, _kernel) \
420 template <typename... Args> \
421 auto make_##_name(Args&&... args) \
423 return ::gko::detail::make_register_operation( \
424 #_kernel, [&args...](auto exec) { \
425 using exec_type = decltype(exec); \
428 std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
430 ::gko::kernels::reference::_kernel( \
431 std::dynamic_pointer_cast< \
432 const ::gko::ReferenceExecutor>(exec), \
433 std::forward<Args>(args)...); \
434 } else if (std::is_same< \
436 std::shared_ptr<const ::gko::OmpExecutor>>:: \
438 ::gko::kernels::omp::_kernel( \
439 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
441 std::forward<Args>(args)...); \
442 } else if (std::is_same< \
444 std::shared_ptr<const ::gko::CudaExecutor>>:: \
446 ::gko::kernels::cuda::_kernel( \
447 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
449 std::forward<Args>(args)...); \
450 } else if (std::is_same< \
452 std::shared_ptr<const ::gko::HipExecutor>>:: \
454 ::gko::kernels::hip::_kernel( \
455 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
457 std::forward<Args>(args)...); \
458 } else if (std::is_same< \
460 std::shared_ptr<const ::gko::DpcppExecutor>>:: \
462 ::gko::kernels::dpcpp::_kernel( \
463 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
465 std::forward<Args>(args)...); \
467 GKO_NOT_IMPLEMENTED; \
471 static_assert(true, \
472 "This assert is used to counter the false positive extra " \
473 "semi-colon warnings")
// Registers a host-side callable `_kernel` under a generated factory
// make_<_name>(args...). Unlike GKO_REGISTER_OPERATION, the wrapped
// lambda ignores the executor argument and simply forwards the captured
// arguments to _kernel. The trailing static_assert swallows the caller's
// semicolon to avoid "extra ;" warnings.
// NOTE(review): a few continuation lines of this macro are missing from
// this fragment.
513 #define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
514 template <typename... Args> \
515 auto make_##_name(Args&&... args) \
517 return ::gko::detail::make_register_operation( \
519 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
521 static_assert(true, \
522 "This assert is used to counter the false positive extra " \
523 "semi-colon warnings")
526 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
616 template <
typename T>
617 friend class detail::ExecutorBase;
619 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
652 template <
typename ClosureOmp,
typename ClosureCuda,
typename ClosureHip,
653 typename ClosureDpcpp>
654 void run(
const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
655 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
const
657 LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
658 op_omp, op_cuda, op_hip, op_dpcpp);
673 template <
typename T>
676 this->
template log<log::Logger::allocation_started>(
677 this, num_elems *
sizeof(T));
678 T* allocated = static_cast<T*>(this->raw_alloc(num_elems *
sizeof(T)));
679 this->
template log<log::Logger::allocation_completed>(
680 this, num_elems *
sizeof(T), reinterpret_cast<uintptr>(allocated));
691 void free(
void* ptr)
const noexcept
693 this->
template log<log::Logger::free_started>(
694 this, reinterpret_cast<uintptr>(ptr));
696 this->
template log<log::Logger::free_completed>(
697 this, reinterpret_cast<uintptr>(ptr));
712 template <
typename T>
714 const T* src_ptr, T* dest_ptr)
const
716 const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
717 const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
718 this->
template log<log::Logger::copy_started>(
719 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
720 if (
this != src_exec.
get()) {
721 src_exec->template log<log::Logger::copy_started>(
722 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
725 this->raw_copy_from(src_exec.
get(), num_elems *
sizeof(T), src_ptr,
728 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
731 std::clog <<
"Not direct copy. Try to copy data from the masters."
734 auto src_master = src_exec->get_master().
get();
735 if (num_elems > 0 && src_master != src_exec.
get()) {
736 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
737 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
739 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
740 src_master->free(master_ptr);
743 this->
template log<log::Logger::copy_completed>(
744 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
745 if (
this != src_exec.
get()) {
746 src_exec->template log<log::Logger::copy_completed>(
747 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
762 template <
typename T>
765 this->
copy_from(
this, num_elems, src_ptr, dest_ptr);
777 template <
typename T>
781 this->
get_master()->copy_from(
this, 1, ptr, &out);
789 virtual std::shared_ptr<Executor>
get_master() noexcept = 0;
807 void add_logger(std::shared_ptr<const log::Logger> logger)
override
809 this->propagating_logger_refcount_.fetch_add(
810 logger->needs_propagation() ? 1 : 0);
811 this->EnableLogging<Executor>::add_logger(logger);
822 this->propagating_logger_refcount_.fetch_sub(
824 this->EnableLogging<Executor>::remove_logger(logger);
827 using EnableLogging<Executor>::remove_logger;
838 log_propagation_mode_ =
mode;
850 return this->propagating_logger_refcount_.load() > 0 &&
863 return this->verify_memory_from(other.get());
885 std::string device_type;
900 int num_computing_units = -1;
913 int num_pu_per_cu = -1;
923 std::vector<int> subgroup_sizes{};
933 int max_subgroup_size = -1;
945 std::vector<int> max_workitem_sizes{};
956 int max_workgroup_size;
973 std::string pci_bus_id = std::string(13,
'x');
985 std::vector<int> closest_pu_ids{};
993 const exec_info& get_exec_info()
const {
return this->exec_info_; }
1004 virtual void* raw_alloc(
size_type size)
const = 0;
1013 virtual void raw_free(
void* ptr)
const noexcept = 0;
1025 virtual void raw_copy_from(
const Executor* src_exec,
size_type n_bytes,
1026 const void* src_ptr,
void* dest_ptr)
const = 0;
// Declare one pure-virtual raw_copy_to() overload per executor type so a
// copy dispatches on both the source and the destination executor
// (double dispatch); expanded via GKO_ENABLE_FOR_ALL_EXECUTORS and then
// #undef'd.
1037 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1038 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1039 const void* src_ptr, void* dest_ptr) const = 0
1041 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1043 #undef GKO_ENABLE_RAW_COPY_TO
1052 virtual bool verify_memory_from(
const Executor* src_exec)
const = 0;
// Declare one pure-virtual verify_memory_to() overload per executor type
// (memory-compatibility check, double-dispatched like raw_copy_to).
// ReferenceExecutor gets an extra explicit declaration because it is not
// part of the GKO_ENABLE_FOR_ALL_EXECUTORS list.
1063 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1064 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1066 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1068 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1070 #undef GKO_ENABLE_VERIFY_MEMORY_TO
1078 virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
1085 exec_info& get_exec_info() {
return this->exec_info_; }
1087 exec_info exec_info_;
1091 std::atomic<int> propagating_logger_refcount_{};
1108 template <
typename ClosureOmp,
typename ClosureCuda,
typename ClosureHip,
1109 typename ClosureDpcpp>
1110 class LambdaOperation :
public Operation {
1122 LambdaOperation(
const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1123 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1130 void run(std::shared_ptr<const OmpExecutor>)
const override
1135 void run(std::shared_ptr<const ReferenceExecutor>)
const override
1140 void run(std::shared_ptr<const CudaExecutor>)
const override
1145 void run(std::shared_ptr<const HipExecutor>)
const override
1150 void run(std::shared_ptr<const DpcppExecutor>)
const override
1157 ClosureCuda op_cuda_;
1159 ClosureDpcpp op_dpcpp_;
1172 template <
typename T>
1199 std::shared_ptr<const Executor> exec_;
1203 template <
typename T>
1206 using pointer = T[];
1220 std::shared_ptr<const Executor> exec_;
1227 template <
typename ConcreteExecutor>
1228 class ExecutorBase :
public Executor {
1229 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
1235 void run(
const Operation& op)
const override
1237 this->
template log<log::Logger::operation_launched>(
this, &op);
1238 auto scope_guard = get_scoped_device_id_guard();
1239 op.run(
self()->shared_from_this());
1240 this->
template log<log::Logger::operation_completed>(
this, &op);
1244 void raw_copy_from(
const Executor* src_exec,
size_type n_bytes,
1245 const void* src_ptr,
void* dest_ptr)
const override
1247 src_exec->raw_copy_to(
self(), n_bytes, src_ptr, dest_ptr);
1250 virtual bool verify_memory_from(
const Executor* src_exec)
const override
1252 return src_exec->verify_memory_to(
self());
1256 ConcreteExecutor*
self() noexcept
1258 return static_cast<ConcreteExecutor*>(
this);
1261 const ConcreteExecutor*
self()
const noexcept
1263 return static_cast<const ConcreteExecutor*>(
this);
1267 #undef GKO_DECLARE_EXECUTOR_FRIEND
1277 class EnableDeviceReset {
1285 "device_reset is no longer supported, call "
1286 "cudaDeviceReset/hipDeviceReset manually")
1287 void set_device_reset(
bool device_reset) {}
1295 "device_reset is no longer supported, call "
1296 "cudaDeviceReset/hipDeviceReset manually")
1297 bool get_device_reset() {
return false; }
1305 EnableDeviceReset() {}
1308 "device_reset is no longer supported, call "
1309 "cudaDeviceReset/hipDeviceReset manually")
1310 EnableDeviceReset(
bool device_reset) {}
// Used by the concrete executor classes below (through
// GKO_ENABLE_FOR_ALL_EXECUTORS) to declare overrides for the
// raw_copy_to() overload set introduced by GKO_ENABLE_RAW_COPY_TO; only
// the declaration is generated here, no body.
1317 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1318 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1319 const void* src_ptr, void* dest_ptr) const override
// Generates a verify_memory_to() override for destination type `dest_`
// that presumably returns the fixed answer `bool_` (the macro's body
// lines are missing from this fragment — only the signature is visible;
// verify against the full header before editing). The trailing
// static_assert swallows the caller's semicolon to avoid "extra ;"
// warnings.
1322 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1323 virtual bool verify_memory_to(const dest_* other) const override \
1327 static_assert(true, \
1328 "This assert is used to counter the false positive extra " \
1329 "semi-colon warnings")
1340 public std::enable_shared_from_this<OmpExecutor> {
1341 friend class detail::ExecutorBase<OmpExecutor>;
1348 std::shared_ptr<CpuAllocatorBase> alloc =
1349 std::make_shared<CpuAllocator>())
1351 return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(alloc)));
1354 std::shared_ptr<Executor> get_master() noexcept override;
1356 std::shared_ptr<const
Executor> get_master() const noexcept override;
1358 void synchronize() const override;
1360 int get_num_cores()
const
1362 return this->get_exec_info().num_computing_units;
1365 int get_num_threads_per_core()
const
1367 return this->get_exec_info().num_pu_per_cu;
1370 static int get_num_omp_threads();
1372 scoped_device_id_guard get_scoped_device_id_guard()
const override;
1374 std::string get_description()
const override;
1377 OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1378 : alloc_{std::move(alloc)}
1383 void populate_exec_info(
const machine_topology* mach_topo)
override;
1385 void* raw_alloc(
size_type size)
const override;
1387 void raw_free(
void* ptr)
const noexcept
override;
1389 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1391 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
true);
1393 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1395 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1397 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1399 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
1401 std::shared_ptr<CpuAllocatorBase> alloc_;
1407 using DefaultExecutor = OmpExecutor;
1421 static std::shared_ptr<ReferenceExecutor> create(
1422 std::shared_ptr<CpuAllocatorBase> alloc =
1423 std::make_shared<CpuAllocator>())
1425 return std::shared_ptr<ReferenceExecutor>(
1438 this->
template log<log::Logger::operation_launched>(
this, &op);
1439 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1440 this->shared_from_this()));
1441 this->
template log<log::Logger::operation_completed>(
this, &op);
1448 this->ReferenceExecutor::populate_exec_info(
1452 void populate_exec_info(
const machine_topology*)
override
1454 this->get_exec_info().device_id = -1;
1455 this->get_exec_info().num_computing_units = 1;
1456 this->get_exec_info().num_pu_per_cu = 1;
1459 bool verify_memory_from(
const Executor* src_exec)
const override
1461 return src_exec->verify_memory_to(
this);
1464 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
true);
1466 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1468 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1470 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1472 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1477 namespace reference {
1478 using DefaultExecutor = ReferenceExecutor;
1490 public std::enable_shared_from_this<CudaExecutor>,
1491 public detail::EnableDeviceReset {
1492 friend class detail::ExecutorBase<CudaExecutor>;
1507 "calling this CudaExecutor::create method is deprecated, because"
1508 "device_reset no longer has an effect"
1509 "call CudaExecutor::create("
1510 " int device_id, std::shared_ptr<Executor> master,"
1511 " std::shared_ptr<CudaAllocatorBase> alloc,"
1512 " CUstream_st* stream);"
1514 static std::shared_ptr<CudaExecutor> create(
1515 int device_id, std::shared_ptr<Executor> master,
bool device_reset,
1517 CUstream_st* stream =
nullptr);
1528 static std::shared_ptr<CudaExecutor> create(
1529 int device_id, std::shared_ptr<Executor> master,
1530 std::shared_ptr<CudaAllocatorBase> alloc =
1531 std::make_shared<CudaAllocator>(),
1532 CUstream_st* stream =
nullptr);
1534 std::shared_ptr<Executor> get_master() noexcept
override;
1536 std::shared_ptr<const Executor> get_master()
const noexcept
override;
1538 void synchronize()
const override;
1542 std::string get_description()
const override;
1549 return this->get_exec_info().device_id;
1555 static int get_num_devices();
1562 return this->get_exec_info().num_pu_per_cu;
1570 return this->get_exec_info().num_computing_units;
1578 return this->get_exec_info().num_computing_units *
1579 this->get_exec_info().num_pu_per_cu;
1587 return this->get_exec_info().max_subgroup_size;
1595 return this->get_exec_info().major;
1603 return this->get_exec_info().minor;
1611 GKO_DEPRECATED(
"use get_blas_handle() instead")
1612 cublasContext* get_cublas_handle()
const {
return get_blas_handle(); }
1624 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1625 cusparseContext* get_cusparse_handle()
const
1627 return get_sparselib_handle();
1635 return cusparse_handle_.get();
1645 return this->get_exec_info().closest_pu_ids;
1664 void set_gpu_property();
1666 void init_handles();
1668 CudaExecutor(
int device_id, std::shared_ptr<Executor> master,
1669 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1670 : alloc_{std::move(alloc)}, master_(master), stream_{stream}
1672 this->get_exec_info().device_id = device_id;
1673 this->get_exec_info().num_computing_units = 0;
1674 this->get_exec_info().num_pu_per_cu = 0;
1675 this->CudaExecutor::populate_exec_info(
1677 this->set_gpu_property();
1678 this->init_handles();
1681 void* raw_alloc(
size_type size)
const override;
1683 void raw_free(
void* ptr)
const noexcept
override;
1685 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1687 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1689 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1691 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1693 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1695 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1697 void populate_exec_info(
const machine_topology* mach_topo)
override;
1700 std::shared_ptr<Executor> master_;
1702 template <
typename T>
1703 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1704 handle_manager<cublasContext> cublas_handle_;
1705 handle_manager<cusparseContext> cusparse_handle_;
1706 std::shared_ptr<CudaAllocatorBase> alloc_;
1707 CUstream_st* stream_;
1713 using DefaultExecutor = CudaExecutor;
1725 public std::enable_shared_from_this<HipExecutor>,
1726 public detail::EnableDeviceReset {
1742 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1743 "alloc_mode was replaced by the Allocator type "
1745 static std::shared_ptr<HipExecutor>
create(
1746 int device_id, std::shared_ptr<Executor> master,
bool device_reset,
1748 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1750 static std::shared_ptr<HipExecutor>
create(
1751 int device_id, std::shared_ptr<Executor> master,
1752 std::shared_ptr<HipAllocatorBase>
alloc =
1753 std::make_shared<HipAllocator>(),
1754 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1756 std::shared_ptr<Executor>
get_master() noexcept
override;
1758 std::shared_ptr<const Executor>
get_master()
const noexcept
override;
1771 return this->get_exec_info().device_id;
1784 return this->get_exec_info().num_pu_per_cu;
1792 return this->get_exec_info().num_computing_units;
1800 return this->get_exec_info().major;
1808 return this->get_exec_info().minor;
1816 return this->get_exec_info().num_computing_units *
1817 this->get_exec_info().num_pu_per_cu;
1825 return this->get_exec_info().max_subgroup_size;
1833 GKO_DEPRECATED(
"use get_blas_handle() instead")
1846 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1857 return hipsparse_handle_.get();
1874 return this->get_exec_info().closest_pu_ids;
1877 GKO_HIP_STREAM_STRUCT* get_stream()
const {
return stream_; }
1880 void set_gpu_property();
1882 void init_handles();
1884 HipExecutor(
int device_id, std::shared_ptr<Executor> master,
1885 std::shared_ptr<HipAllocatorBase>
alloc,
1886 GKO_HIP_STREAM_STRUCT* stream)
1887 : master_{std::move(master)}, alloc_{std::move(
alloc)}, stream_{stream}
1889 this->get_exec_info().device_id = device_id;
1890 this->get_exec_info().num_computing_units = 0;
1891 this->get_exec_info().num_pu_per_cu = 0;
1893 this->set_gpu_property();
1894 this->init_handles();
1897 void* raw_alloc(
size_type size)
const override;
1899 void raw_free(
void* ptr)
const noexcept
override;
1901 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1903 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1905 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1907 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1909 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1911 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1913 void populate_exec_info(
const machine_topology* mach_topo)
override;
1916 std::shared_ptr<Executor> master_;
1918 template <
typename T>
1919 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1920 handle_manager<hipblasContext> hipblas_handle_;
1921 handle_manager<hipsparseContext> hipsparse_handle_;
1922 std::shared_ptr<HipAllocatorBase> alloc_;
1923 GKO_HIP_STREAM_STRUCT* stream_;
1929 using DefaultExecutor = HipExecutor;
1941 public std::enable_shared_from_this<DpcppExecutor> {
1954 static std::shared_ptr<DpcppExecutor>
create(
1955 int device_id, std::shared_ptr<Executor> master,
1956 std::string device_type =
"all",
1957 dpcpp_queue_property property = dpcpp_queue_property::in_order);
1959 std::shared_ptr<Executor>
get_master() noexcept
override;
1961 std::shared_ptr<const Executor>
get_master()
const noexcept
override;
1976 return this->get_exec_info().device_id;
1979 sycl::queue* get_queue()
const {
return queue_.get(); }
1997 return this->get_exec_info().subgroup_sizes;
2007 return this->get_exec_info().num_computing_units;
2015 return this->get_exec_info().num_computing_units *
2016 this->get_exec_info().num_pu_per_cu;
2026 return this->get_exec_info().max_workitem_sizes;
2036 return this->get_exec_info().max_workgroup_size;
2046 return this->get_exec_info().max_subgroup_size;
2056 return this->get_exec_info().device_type;
2060 void set_device_property(
2061 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2064 int device_id, std::shared_ptr<Executor> master,
2065 std::string device_type =
"all",
2066 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2069 std::for_each(device_type.begin(), device_type.end(),
2070 [](
char& c) { c = std::tolower(c); });
2071 this->get_exec_info().device_type = std::string(device_type);
2072 this->get_exec_info().device_id = device_id;
2073 this->set_device_property(property);
2076 void populate_exec_info(
const machine_topology* mach_topo)
override;
2078 void* raw_alloc(
size_type size)
const override;
2080 void raw_free(
void* ptr)
const noexcept
override;
2082 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2084 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
2086 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
2088 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
2090 bool verify_memory_to(
const OmpExecutor* dest_exec)
const override;
2092 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
2095 std::shared_ptr<Executor> master_;
2097 template <
typename T>
2098 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2099 queue_manager<sycl::queue> queue_;
2105 using DefaultExecutor = DpcppExecutor;
2110 #undef GKO_OVERRIDE_RAW_COPY_TO
2116 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_