#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_


#include <algorithm>
#include <atomic>
#include <cctype>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>


#include <ginkgo/core/base/device.hpp>
#include <ginkgo/core/base/fwd_decls.hpp>
#include <ginkgo/core/base/machine_topology.hpp>
#include <ginkgo/core/base/memory.hpp>
#include <ginkgo/core/base/scoped_device_id_guard.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/log/logger.hpp>
#include <ginkgo/core/synthesizer/containers.hpp>


namespace gko {
/**
 * Specifies the mode of allocation for CUDA/HIP GPUs.
 */
enum class allocation_mode { device, unified_global, unified_host };


#ifdef NDEBUG

// In release builds, prefer device allocations.
constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;

constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// In debug builds, always use unified-memory allocations.
constexpr allocation_mode default_cuda_alloc_mode =
    allocation_mode::unified_global;

#if (GINKGO_HIP_PLATFORM_HCC == 1)

// HIP on AMD GPUs does not support unified memory, so use device allocations.
constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// HIP on NVIDIA GPUs supports unified memory, so use unified allocations.
constexpr allocation_mode default_hip_alloc_mode =
    allocation_mode::unified_global;

#endif

#endif
/**
 * Controls whether events get propagated from an object's executor to the
 * propagating loggers attached to its parent executors.
 */
enum class log_propagation_mode {
    /** Events only get reported at loggers attached to the triggering object. */
    never,

    /** Events also get reported at propagating loggers of parent executors. */
    automatic
};


/**
 * Properties for creating a SYCL queue.
 */
enum class dpcpp_queue_property {
    /** The queue executes in order. */
    in_order = 1,

    /** The queue enables profiling. */
    enable_profiling = 2
};


GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
                                                         dpcpp_queue_property b)
{
    return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
                                             static_cast<int>(b));
}
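
// A minimal usage sketch for combining queue properties (both enumerators are
// defined above); the combined value can be passed to DpcppExecutor::create:
//
//     auto prop = gko::dpcpp_queue_property::in_order |
//                 gko::dpcpp_queue_property::enable_profiling;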
#define GKO_FORWARD_DECLARE(_type, ...) class _type

GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);

#undef GKO_FORWARD_DECLARE


class ReferenceExecutor;


namespace detail {

template <typename>
class ExecutorBase;

}  // namespace detail
/**
 * Operations can be used to define functionalities whose implementations
 * differ among devices. An Operation is run on an Executor, which dispatches
 * to the overload matching its own type.
 */
class Operation {
public:
#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
    virtual void run(std::shared_ptr<const _type>) const

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);

#undef GKO_DECLARE_RUN_OVERLOAD

    // ReferenceExecutor overload.
    virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;

    /**
     * Returns the operation's name.
     */
    virtual const char* get_name() const noexcept;
};
namespace detail {


/**
 * An Operation that wraps a single closure and forwards it the executor it is
 * run on.
 */
template <typename Closure>
class RegisteredOperation : public Operation {
public:
    RegisteredOperation(const char* name, Closure op)
        : name_(name), op_(std::move(op))
    {}

    const char* get_name() const noexcept override { return name_; }

    void run(std::shared_ptr<const ReferenceExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const OmpExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const CudaExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const HipExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const DpcppExecutor> exec) const override
    {
        op_(exec);
    }

private:
    const char* name_;
    Closure op_;
};


template <typename Closure>
RegisteredOperation<Closure> make_register_operation(const char* name,
                                                     Closure op)
{
    return RegisteredOperation<Closure>{name, std::move(op)};
}


}  // namespace detail
/**
 * Binds a set of device-specific kernels to an Operation and defines a
 * make_<_name>() helper which creates that operation.
 */
#define GKO_REGISTER_OPERATION(_name, _kernel)                                 \
    template <typename... Args>                                                \
    auto make_##_name(Args&&... args)                                          \
    {                                                                          \
        return ::gko::detail::make_register_operation(                         \
            #_kernel, [&args...](auto exec) {                                  \
                using exec_type = decltype(exec);                              \
                if (std::is_same<                                              \
                        exec_type,                                             \
                        std::shared_ptr<const ::gko::ReferenceExecutor>>::     \
                        value) {                                               \
                    ::gko::kernels::reference::_kernel(                        \
                        std::dynamic_pointer_cast<                             \
                            const ::gko::ReferenceExecutor>(exec),             \
                        std::forward<Args>(args)...);                          \
                } else if (std::is_same<                                       \
                               exec_type,                                      \
                               std::shared_ptr<const ::gko::OmpExecutor>>::    \
                               value) {                                        \
                    ::gko::kernels::omp::_kernel(                              \
                        std::dynamic_pointer_cast<const ::gko::OmpExecutor>(   \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else if (std::is_same<                                       \
                               exec_type,                                      \
                               std::shared_ptr<const ::gko::CudaExecutor>>::   \
                               value) {                                        \
                    ::gko::kernels::cuda::_kernel(                             \
                        std::dynamic_pointer_cast<const ::gko::CudaExecutor>(  \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else if (std::is_same<                                       \
                               exec_type,                                      \
                               std::shared_ptr<const ::gko::HipExecutor>>::    \
                               value) {                                        \
                    ::gko::kernels::hip::_kernel(                              \
                        std::dynamic_pointer_cast<const ::gko::HipExecutor>(   \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else if (std::is_same<                                       \
                               exec_type,                                      \
                               std::shared_ptr<const ::gko::DpcppExecutor>>::  \
                               value) {                                        \
                    ::gko::kernels::dpcpp::_kernel(                            \
                        std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else {                                                       \
                    GKO_NOT_IMPLEMENTED;                                       \
                }                                                              \
            });                                                                \
    }                                                                          \
    static_assert(true,                                                        \
                  "This assert is used to counter the false positive extra "   \
                  "semi-colon warnings")
/**
 * Binds a host-side kernel (independent of the executor type) to an Operation
 * and defines a make_<_name>() helper which creates that operation.
 */
#define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \
    template <typename... Args>                                              \
    auto make_##_name(Args&&... args)                                        \
    {                                                                        \
        return ::gko::detail::make_register_operation(                       \
            #_kernel,                                                        \
            [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")
#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type

/**
 * The first step in using the Ginkgo library consists of creating an
 * executor. Executors specify the location of the data of linear algebra
 * objects and determine where operations are executed.
 */
class Executor : public log::EnableLogging<Executor> {
    template <typename T>
    friend class detail::ExecutorBase;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);

public:
    virtual ~Executor() = default;

    Executor() = default;
    Executor(Executor&) = delete;
    Executor(Executor&&) = delete;
    Executor& operator=(Executor&) = delete;
    Executor& operator=(Executor&&) = delete;

    /**
     * Runs the specified Operation using this Executor.
     */
    virtual void run(const Operation& op) const = 0;
    /**
     * Runs one of the passed-in functors, depending on the Executor type.
     */
    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
              typename ClosureDpcpp>
    GKO_DEPRECATED(
        "Please use the overload with std::string as first parameter.")
    void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
             const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
    {
        LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
                        ClosureDpcpp>
            op(op_omp, op_cuda, op_hip, op_dpcpp);
        this->run(op);
    }
    /**
     * Runs one of the passed-in functors, depending on the Executor type.
     * The name is reported to the attached loggers.
     */
    template <typename ClosureReference, typename ClosureOmp,
              typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
    void run(std::string name, const ClosureReference& op_ref,
             const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
             const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
    {
        LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
                        ClosureDpcpp>
            op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
        this->run(op);
    }
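
    // Usage sketch: one no-argument closure per backend, typically lambdas
    // capturing the data they work on (names here are illustrative only):
    //
    //     exec->run(
    //         "scale", [&] { /* reference */ }, [&] { /* OpenMP */ },
    //         [&] { /* CUDA */ }, [&] { /* HIP */ }, [&] { /* DPC++ */ });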
    /**
     * Allocates memory in this Executor.
     *
     * @tparam T  datatype to allocate
     */
    template <typename T>
    T* alloc(size_type num_elems) const
    {
        this->template log<log::Logger::allocation_started>(
            this, num_elems * sizeof(T));
        T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
        this->template log<log::Logger::allocation_completed>(
            this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
        return allocated;
    }
    /**
     * Frees memory previously allocated with Executor::alloc().
     */
    void free(void* ptr) const noexcept
    {
        this->template log<log::Logger::free_started>(
            this, reinterpret_cast<uintptr>(ptr));
        this->raw_free(ptr);
        this->template log<log::Logger::free_completed>(
            this, reinterpret_cast<uintptr>(ptr));
    }
    /**
     * Copies data from another Executor. If no direct copy between the two
     * memory spaces is supported, the copy is routed through the masters.
     *
     * @tparam T  datatype to copy
     */
    template <typename T>
    void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
                   const T* src_ptr, T* dest_ptr) const
    {
        const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
        const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
        this->template log<log::Logger::copy_started>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_started>(
                src_exec.get(), this, src_loc, dest_loc,
                num_elems * sizeof(T));
        }
        try {
            this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
                                dest_ptr);
        } catch (NotSupported&) {
#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
            // Unoptimized copy; report it when verbose >= 1 in debug builds.
            std::clog << "Not direct copy. Try to copy data from the masters."
                      << std::endl;
#endif
            auto src_master = src_exec->get_master().get();
            if (num_elems > 0 && src_master != src_exec.get()) {
                auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
                src_master->copy_from<T>(src_exec, num_elems, src_ptr,
                                         master_ptr);
                this->copy_from<T>(src_master, num_elems, master_ptr,
                                   dest_ptr);
                src_master->free(master_ptr);
            }
        }
        this->template log<log::Logger::copy_completed>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_completed>(
                src_exec.get(), this, src_loc, dest_loc,
                num_elems * sizeof(T));
        }
    }
    /**
     * Copies data within this Executor.
     *
     * @tparam T  datatype to copy
     */
    template <typename T>
    void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
    {
        this->copy_from(this, num_elems, src_ptr, dest_ptr);
    }
    /**
     * Retrieves a single element at the given location from executor memory.
     *
     * @tparam T  datatype to copy
     */
    template <typename T>
    T copy_val_to_host(const T* ptr) const
    {
        T out{};
        this->get_master()->copy_from(this, 1, ptr, &out);
        return out;
    }
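
    // Usage sketch for the allocation/copy API above (the executor, sizes,
    // and `other_data` are illustrative only):
    //
    //     auto exec = gko::OmpExecutor::create();
    //     auto* data = exec->alloc<double>(100);
    //     exec->copy(100, other_data, data);  // copy within the executor
    //     double first = exec->copy_val_to_host(data);
    //     exec->free(data);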
    /**
     * Returns the master executor of this Executor.
     */
    virtual std::shared_ptr<Executor> get_master() noexcept = 0;

    /**
     * @copydoc get_master
     */
    virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;

    /**
     * Synchronizes the operations launched on the executor with its master.
     */
    virtual void synchronize() const = 0;
    void add_logger(std::shared_ptr<const log::Logger> logger) override
    {
        this->propagating_logger_refcount_.fetch_add(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::add_logger(logger);
    }

    void remove_logger(const log::Logger* logger) override
    {
        this->propagating_logger_refcount_.fetch_sub(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::remove_logger(logger);
    }

    using EnableLogging<Executor>::remove_logger;
    /**
     * Sets the logger event propagation mode for the executor.
     */
    void set_log_propagation_mode(log_propagation_mode mode)
    {
        log_propagation_mode_ = mode;
    }

    /**
     * Returns true iff events occurring at an object created on this executor
     * should be logged at propagating loggers attached to this executor.
     */
    bool should_propagate_log() const
    {
        return this->propagating_logger_refcount_.load() > 0 &&
               log_propagation_mode_ == log_propagation_mode::automatic;
    }

    /**
     * Verifies whether the executors share the same memory.
     */
    bool memory_accessible(const std::shared_ptr<const Executor>& other) const
    {
        return this->verify_memory_from(other.get());
    }

    virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;

    /**
     * Returns a short description of this executor.
     */
    virtual std::string get_description() const = 0;
protected:
    /**
     * A struct that abstracts the executor info for different executors
     * without using templates.
     */
    struct exec_info {
        /** The id of the device. */
        int device_id = -1;

        /** The type of the device, relevant only for the dpcpp executor. */
        std::string device_type;

        /** The number of computing units in the executor. */
        int num_computing_units = -1;

        /** The number of processing units per computing unit. */
        int num_pu_per_cu = -1;

        /** The sizes of the subgroups in the executor. */
        std::vector<int> subgroup_sizes{};

        /** The maximum subgroup size of the executor. */
        int max_subgroup_size = -1;

        /** The maximum workitem sizes of the executor. */
        std::vector<int> max_workitem_sizes{};

        /** The maximum workgroup size of the executor. */
        int max_workgroup_size;

        /** The major version of the CUDA/HIP device architecture. */
        int major = -1;

        /** The minor version of the CUDA/HIP device architecture. */
        int minor = -1;

        /** The PCI bus id of the device. */
        std::string pci_bus_id = std::string(13, 'x');

        /** The ids of the processing units closest to the device. */
        std::vector<int> closest_pu_ids{};
    };

    /**
     * Gets the immutable exec info struct.
     */
    const exec_info& get_exec_info() const { return this->exec_info_; }
    /**
     * Allocates raw memory in this Executor.
     */
    virtual void* raw_alloc(size_type size) const = 0;

    /**
     * Frees memory previously allocated with raw_alloc().
     */
    virtual void raw_free(void* ptr) const noexcept = 0;

    /**
     * Copies raw data from another Executor.
     */
    virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                               const void* src_ptr, void* dest_ptr) const = 0;

/**
 * @internal
 * Declares a raw_copy_to() overload for the specified Executor subclass.
 */
#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...)                              \
    virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
                             const void* src_ptr, void* dest_ptr) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);

#undef GKO_ENABLE_RAW_COPY_TO
    /**
     * Verifies whether the executors share the same memory.
     */
    virtual bool verify_memory_from(const Executor* src_exec) const = 0;

/**
 * @internal
 * Declares a verify_memory_to() overload for the specified Executor subclass.
 */
#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
    virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);

    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);

#undef GKO_ENABLE_VERIFY_MEMORY_TO

    /**
     * Populates the executor-specific info from the machine_topology object.
     */
    virtual void populate_exec_info(const machine_topology* mach_topo) = 0;

    /**
     * Gets the modifiable exec info struct.
     */
    exec_info& get_exec_info() { return this->exec_info_; }

    exec_info exec_info_;

    log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};

private:
    /**
     * Counts the attached loggers that require event propagation.
     */
    std::atomic<int> propagating_logger_refcount_{};
    /**
     * The LambdaOperation class wraps one functor object per backend into an
     * Operation; the functor matching the executor type gets called.
     */
    template <typename ClosureReference, typename ClosureOmp,
              typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
    class LambdaOperation : public Operation {
    public:
        LambdaOperation(std::string name, const ClosureReference& op_ref,
                        const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
            : name_(std::move(name)),
              op_ref_(op_ref),
              op_omp_(op_omp),
              op_cuda_(op_cuda),
              op_hip_(op_hip),
              op_dpcpp_(op_dpcpp)
        {}

        LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
            : LambdaOperation("unnamed", op_omp, op_omp, op_cuda, op_hip,
                              op_dpcpp)
        {}

        void run(std::shared_ptr<const OmpExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const ReferenceExecutor>) const override
        {
            op_ref_();
        }

        void run(std::shared_ptr<const CudaExecutor>) const override
        {
            op_cuda_();
        }

        void run(std::shared_ptr<const HipExecutor>) const override
        {
            op_hip_();
        }

        void run(std::shared_ptr<const DpcppExecutor>) const override
        {
            op_dpcpp_();
        }

        const char* get_name() const noexcept override
        {
            return name_.c_str();
        }

    private:
        std::string name_;
        ClosureReference op_ref_;
        ClosureOmp op_omp_;
        ClosureCuda op_cuda_;
        ClosureHip op_hip_;
        ClosureDpcpp op_dpcpp_;
    };
};
/**
 * A deleter that uses an executor's free() method to deallocate the data.
 *
 * @tparam T  the type of object being deleted
 */
template <typename T>
class executor_deleter {
public:
    using pointer = T*;

    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    void operator()(pointer ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};

// specialization for arrays
template <typename T>
class executor_deleter<T[]> {
public:
    using pointer = T[];

    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    void operator()(T* ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};
namespace detail {


template <typename ConcreteExecutor>
class ExecutorBase : public Executor {
    friend class ::gko::OmpExecutor;
    friend class ::gko::HipExecutor;
    friend class ::gko::DpcppExecutor;
    friend class ::gko::CudaExecutor;
    friend class ::gko::ReferenceExecutor;

public:
    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        auto scope_guard = get_scoped_device_id_guard();
        op.run(self()->shared_from_this());
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                       const void* src_ptr, void* dest_ptr) const override
    {
        src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
    }

    virtual bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(self());
    }

private:
    ConcreteExecutor* self() noexcept
    {
        return static_cast<ConcreteExecutor*>(this);
    }

    const ConcreteExecutor* self() const noexcept
    {
        return static_cast<const ConcreteExecutor*>(this);
    }
};

#undef GKO_DECLARE_EXECUTOR_FRIEND
/**
 * Historic base class that controlled device reset behavior; all of its
 * members are deprecated no-ops.
 */
class EnableDeviceReset {
public:
    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    void set_device_reset(bool device_reset) {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    bool get_device_reset() { return false; }

protected:
    EnableDeviceReset() {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    EnableDeviceReset(bool device_reset) {}
};


}  // namespace detail
#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...)                    \
    void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
                     const void* src_ptr, void* dest_ptr) const override


#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                      \
    virtual bool verify_memory_to(const dest_* other) const override         \
    {                                                                        \
        return bool_;                                                        \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")
/**
 * This is the Executor subclass which represents the OpenMP device
 * (typically the host CPU).
 */
class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
                    public std::enable_shared_from_this<OmpExecutor> {
    friend class detail::ExecutorBase<OmpExecutor>;

public:
    /**
     * Creates a new OmpExecutor.
     */
    static std::shared_ptr<OmpExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
    }

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    int get_num_cores() const
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_threads_per_core() const
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    static int get_num_omp_threads();

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    std::string get_description() const override;

protected:
    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : alloc_{std::move(alloc)}
    {
        this->OmpExecutor::populate_exec_info(
            machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

private:
    std::shared_ptr<CpuAllocatorBase> alloc_;
};
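
// Usage sketch: an OmpExecutor is a host executor and acts as its own master.
//
//     auto omp = gko::OmpExecutor::create();
//     // omp->get_master() == omp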
namespace kernels {
namespace omp {
using DefaultExecutor = OmpExecutor;
}  // namespace omp
}  // namespace kernels
/**
 * This is a specialization of the OmpExecutor which runs the reference
 * implementations of the kernels, used for debugging purposes.
 */
class ReferenceExecutor : public OmpExecutor {
public:
    static std::shared_ptr<ReferenceExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<ReferenceExecutor>(
            new ReferenceExecutor(std::move(alloc)));
    }

    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        op.run(std::static_pointer_cast<const ReferenceExecutor>(
            this->shared_from_this()));
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : OmpExecutor{std::move(alloc)}
    {
        this->ReferenceExecutor::populate_exec_info(
            machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology*) override
    {
        this->get_exec_info().device_id = -1;
        this->get_exec_info().num_computing_units = 1;
        this->get_exec_info().num_pu_per_cu = 1;
    }

    bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(this);
    }

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
};
namespace kernels {
namespace reference {
using DefaultExecutor = ReferenceExecutor;
}  // namespace reference
}  // namespace kernels
/**
 * This is the Executor subclass which represents the CUDA device.
 */
class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
                     public std::enable_shared_from_this<CudaExecutor>,
                     public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<CudaExecutor>;

public:
    GKO_DEPRECATED(
        "calling this CudaExecutor::create method is deprecated, because "
        "device_reset no longer has an effect; "
        "call CudaExecutor::create("
        "    int device_id, std::shared_ptr<Executor> master, "
        "    std::shared_ptr<CudaAllocatorBase> alloc, "
        "    CUstream_st* stream); instead")
    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_cuda_alloc_mode,
        CUstream_st* stream = nullptr);

    /**
     * Creates a new CudaExecutor with a custom allocator and device stream.
     */
    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<CudaAllocatorBase> alloc =
            std::make_shared<CudaAllocator>(),
        CUstream_st* stream = nullptr);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    std::string get_description() const override;

    /**
     * Returns the device id of the device associated with this executor.
     */
    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    /**
     * Returns the number of devices present on the system.
     */
    static int get_num_devices();

    /**
     * Returns the number of warps per SM of this executor.
     */
    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    /**
     * Returns the number of multiprocessors of this executor.
     */
    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    /**
     * Returns the number of warps of this executor.
     */
    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    /**
     * Returns the warp size of this executor.
     */
    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    /**
     * Returns the major version of the device's compute capability.
     */
    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    /**
     * Returns the minor version of the device's compute capability.
     */
    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    GKO_DEPRECATED("use get_blas_handle() instead")
    cublasContext* get_cublas_handle() const { return get_blas_handle(); }

    /**
     * Returns the BLAS handle for this executor.
     */
    cublasContext* get_blas_handle() const { return cublas_handle_.get(); }

    GKO_DEPRECATED("use get_sparselib_handle() instead")
    cusparseContext* get_cusparse_handle() const
    {
        return get_sparselib_handle();
    }

    /**
     * Returns the sparse library handle for this executor.
     */
    cusparseContext* get_sparselib_handle() const
    {
        return cusparse_handle_.get();
    }

    /**
     * Returns the processing units closest to the device.
     */
    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    /**
     * Returns the CUDA stream used by this executor.
     */
    CUstream_st* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    CudaExecutor(int device_id, std::shared_ptr<Executor> master,
                 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
        : master_(master), alloc_{std::move(alloc)}, stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->CudaExecutor::populate_exec_info(
            machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<cublasContext> cublas_handle_;
    handle_manager<cusparseContext> cusparse_handle_;
    std::shared_ptr<CudaAllocatorBase> alloc_;
    CUstream_st* stream_;
};
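
// Usage sketch: device executors require a host executor as their master;
// here device 0 and the default allocator and stream are used.
//
//     auto omp = gko::OmpExecutor::create();
//     auto cuda = gko::CudaExecutor::create(0, omp);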
namespace kernels {
namespace cuda {
using DefaultExecutor = CudaExecutor;
}  // namespace cuda
}  // namespace kernels
/**
 * This is the Executor subclass which represents the HIP device.
 */
class HipExecutor : public detail::ExecutorBase<HipExecutor>,
                    public std::enable_shared_from_this<HipExecutor>,
                    public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<HipExecutor>;

public:
    GKO_DEPRECATED(
        "device_reset is deprecated entirely, call hipDeviceReset directly. "
        "alloc_mode was replaced by the Allocator type "
        "hierarchy.")
    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_hip_alloc_mode,
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<HipAllocatorBase> alloc =
            std::make_shared<HipAllocator>(),
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    std::string get_description() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    static int get_num_devices();

    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    GKO_DEPRECATED("use get_blas_handle() instead")
    hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }

    hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }

    GKO_DEPRECATED("use get_sparselib_handle() instead")
    hipsparseContext* get_hipsparse_handle() const
    {
        return get_sparselib_handle();
    }

    hipsparseContext* get_sparselib_handle() const
    {
        return hipsparse_handle_.get();
    }

    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    HipExecutor(int device_id, std::shared_ptr<Executor> master,
                std::shared_ptr<HipAllocatorBase> alloc,
                GKO_HIP_STREAM_STRUCT* stream)
        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->HipExecutor::populate_exec_info(
            machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<hipblasContext> hipblas_handle_;
    handle_manager<hipsparseContext> hipsparse_handle_;
    std::shared_ptr<HipAllocatorBase> alloc_;
    GKO_HIP_STREAM_STRUCT* stream_;
};
namespace kernels {
namespace hip {
using DefaultExecutor = HipExecutor;
}  // namespace hip
}  // namespace kernels
/**
 * This is the Executor subclass which represents a DPC++ enhanced device.
 */
class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
                      public std::enable_shared_from_this<DpcppExecutor> {
    friend class detail::ExecutorBase<DpcppExecutor>;

public:
    /**
     * Creates a new DpcppExecutor.
     */
    static std::shared_ptr<DpcppExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    std::string get_description() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    sycl::queue* get_queue() const { return queue_.get(); }

    const std::vector<int>& get_subgroup_sizes() const noexcept
    {
        return this->get_exec_info().subgroup_sizes;
    }

    int get_num_computing_units() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_subgroups() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    const std::vector<int>& get_max_workitem_sizes() const noexcept
    {
        return this->get_exec_info().max_workitem_sizes;
    }

    int get_max_workgroup_size() const noexcept
    {
        return this->get_exec_info().max_workgroup_size;
    }

    int get_max_subgroup_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    std::string get_device_type() const noexcept
    {
        return this->get_exec_info().device_type;
    }

protected:
    void set_device_property(
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    DpcppExecutor(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order)
        : master_(master)
    {
        std::for_each(device_type.begin(), device_type.end(),
                      [](char& c) { c = std::tolower(c); });
        this->get_exec_info().device_type = std::string(device_type);
        this->get_exec_info().device_id = device_id;
        this->set_device_property(property);
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    bool verify_memory_to(const OmpExecutor* dest_exec) const override;

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
    queue_manager<sycl::queue> queue_;
};
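
// Usage sketch: the device_type string filters which SYCL devices are
// considered; the supported values ("all", "cpu", "gpu", "accelerator") are
// assumptions here, to be checked against the DpcppExecutor documentation.
//
//     auto omp = gko::OmpExecutor::create();
//     auto dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");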
namespace kernels {
namespace dpcpp {
using DefaultExecutor = DpcppExecutor;
}  // namespace dpcpp
}  // namespace kernels


#undef GKO_OVERRIDE_RAW_COPY_TO


}  // namespace gko


#endif  // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_