5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
17 #include <type_traits>
20 #include <ginkgo/core/base/device.hpp>
21 #include <ginkgo/core/base/fwd_decls.hpp>
22 #include <ginkgo/core/base/machine_topology.hpp>
23 #include <ginkgo/core/base/memory.hpp>
24 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
25 #include <ginkgo/core/base/types.hpp>
26 #include <ginkgo/core/log/logger.hpp>
27 #include <ginkgo/core/synthesizer/containers.hpp>
68 constexpr
allocation_mode default_cuda_alloc_mode = allocation_mode::device;
70 constexpr
allocation_mode default_hip_alloc_mode = allocation_mode::device;
76 allocation_mode::unified_global;
78 #if (GINKGO_HIP_PLATFORM_HCC == 1)
81 constexpr
allocation_mode default_hip_alloc_mode = allocation_mode::device;
87 allocation_mode::unified_global;
101 enum class dpcpp_queue_property {
// Bitwise OR for dpcpp_queue_property: combines queue-property flags through
// their underlying int representation, so callers can request several
// properties at once (the enum acts as a bit mask).
// NOTE(review): the function's braces are elided in this view; code below is
// reproduced unchanged.
113 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114 dpcpp_queue_property b)
116 return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117 static_cast<int>(b));
// Forward-declares one executor class per expansion; GKO_ENABLE_FOR_ALL_EXECUTORS
// applies it to every executor type in the list, then the helper macro is
// undefined to keep it out of the public macro namespace.
124 #define GKO_FORWARD_DECLARE(_type, ...) class _type
126 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
128 #undef GKO_FORWARD_DECLARE
131 class ReferenceExecutor;
// Declares one virtual run overload per executor type in the
// GKO_ENABLE_FOR_ALL_EXECUTORS list, enabling double dispatch of an
// operation onto the concrete executor backend.
260 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261 virtual void run(std::shared_ptr<const _type>) const
263 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
265 #undef GKO_DECLARE_RUN_OVERLOAD
// ReferenceExecutor is declared separately — it is not part of the
// GKO_ENABLE_FOR_ALL_EXECUTORS list expanded above.
268 virtual void run(std::shared_ptr<const ReferenceExecutor> executor)
const;
// Human-readable name of this operation (used by loggers elsewhere in this
// file); noexcept so it is safe to call on logging/error paths.
275 virtual const char*
get_name()
const noexcept;
291 template <
typename Closure>
292 class RegisteredOperation :
public Operation {
300 RegisteredOperation(
const char* name, Closure op)
301 : name_(name), op_(std::move(op))
304 const char* get_name() const noexcept
override {
return name_; }
306 void run(std::shared_ptr<const ReferenceExecutor> exec)
const override
311 void run(std::shared_ptr<const OmpExecutor> exec)
const override
316 void run(std::shared_ptr<const CudaExecutor> exec)
const override
321 void run(std::shared_ptr<const HipExecutor> exec)
const override
326 void run(std::shared_ptr<const DpcppExecutor> exec)
const override
337 template <
typename Closure>
338 RegisteredOperation<Closure> make_register_operation(
const char* name,
341 return RegisteredOperation<Closure>{name, std::move(op)};
/*
 * GKO_REGISTER_OPERATION(_name, _kernel) generates a make_<_name>(...) factory
 * that returns a RegisteredOperation (via ::gko::detail::make_register_operation)
 * named after #_kernel. The generated closure captures the arguments by
 * reference and, using `if constexpr` on the deduced executor type, dispatches
 * to the matching backend kernel namespace
 * (::gko::kernels::reference / omp / cuda / hip / dpcpp::_kernel), forwarding
 * the arguments; an executor type matching none of the branches reaches
 * GKO_NOT_IMPLEMENTED. The trailing static_assert(true, ...) exists only to
 * consume the semicolon written at the macro's usage site.
 * NOTE(review): several continuation lines of this macro are elided in this
 * view; the code below is reproduced unchanged.
 */
419 #define GKO_REGISTER_OPERATION(_name, _kernel) \
420 template <typename... Args> \
421 auto make_##_name(Args&&... args) \
423 return ::gko::detail::make_register_operation( \
424 #_kernel, [&args...](auto exec) { \
425 using exec_type = decltype(exec); \
426 if constexpr (std::is_same< \
429 const ::gko::ReferenceExecutor>>:: \
431 ::gko::kernels::reference::_kernel( \
432 std::dynamic_pointer_cast< \
433 const ::gko::ReferenceExecutor>(exec), \
434 std::forward<Args>(args)...); \
435 } else if constexpr ( \
438 std::shared_ptr<const ::gko::OmpExecutor>>::value) { \
439 ::gko::kernels::omp::_kernel( \
440 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
442 std::forward<Args>(args)...); \
443 } else if constexpr ( \
446 std::shared_ptr<const ::gko::CudaExecutor>>::value) { \
447 ::gko::kernels::cuda::_kernel( \
448 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
450 std::forward<Args>(args)...); \
451 } else if constexpr ( \
454 std::shared_ptr<const ::gko::HipExecutor>>::value) { \
455 ::gko::kernels::hip::_kernel( \
456 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
458 std::forward<Args>(args)...); \
459 } else if constexpr ( \
462 std::shared_ptr<const ::gko::DpcppExecutor>>::value) { \
463 ::gko::kernels::dpcpp::_kernel( \
464 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
466 std::forward<Args>(args)...); \
468 GKO_NOT_IMPLEMENTED; \
472 static_assert(true, \
473 "This assert is used to counter the false positive extra " \
474 "semi-colon warnings")
/*
 * GKO_REGISTER_HOST_OPERATION(_name, _kernel) generates a make_<_name>(...)
 * factory whose closure calls _kernel(args...) on the host regardless of which
 * executor it is run on (the executor parameter of the closure is ignored).
 * The trailing static_assert(true, ...) consumes the semicolon at the usage
 * site, as in GKO_REGISTER_OPERATION.
 * NOTE(review): some continuation lines of this macro are elided in this
 * view; the code below is reproduced unchanged.
 */
514 #define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
515 template <typename... Args> \
516 auto make_##_name(Args&&... args) \
518 return ::gko::detail::make_register_operation( \
520 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
522 static_assert(true, \
523 "This assert is used to counter the false positive extra " \
524 "semi-colon warnings")
// Grants one concrete executor class friend access per expansion; expanded
// below through GKO_ENABLE_FOR_ALL_EXECUTORS so every executor type can reach
// the enclosing class's non-public members.
527 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
617 template <
typename T>
618 friend class detail::ExecutorBase;
620 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
653 template <
typename ClosureOmp,
typename ClosureCuda,
typename ClosureHip,
654 typename ClosureDpcpp>
656 "Please use the overload with std::string as first parameter.")
657 void
run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
658 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
const
660 LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
662 op(op_omp, op_cuda, op_hip, op_dpcpp);
682 template <
typename ClosureReference,
typename ClosureOmp,
683 typename ClosureCuda,
typename ClosureHip,
typename ClosureDpcpp>
684 void run(std::string name,
const ClosureReference& op_ref,
685 const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
686 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
const
688 LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
690 op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
705 template <
typename T>
708 this->
template log<log::Logger::allocation_started>(
709 this, num_elems *
sizeof(T));
710 T* allocated = static_cast<T*>(this->raw_alloc(num_elems *
sizeof(T)));
711 this->
template log<log::Logger::allocation_completed>(
712 this, num_elems *
sizeof(T), reinterpret_cast<uintptr>(allocated));
723 void free(
void* ptr)
const noexcept
725 this->
template log<log::Logger::free_started>(
726 this, reinterpret_cast<uintptr>(ptr));
728 this->
template log<log::Logger::free_completed>(
729 this, reinterpret_cast<uintptr>(ptr));
744 template <
typename T>
746 const T* src_ptr, T* dest_ptr)
const
748 const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
749 const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
750 this->
template log<log::Logger::copy_started>(
751 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
752 if (
this != src_exec.
get()) {
753 src_exec->template log<log::Logger::copy_started>(
754 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
757 this->raw_copy_from(src_exec.
get(), num_elems *
sizeof(T), src_ptr,
760 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
763 std::cerr <<
"Not direct copy. Try to copy data from the masters."
766 auto src_master = src_exec->get_master().
get();
767 if (num_elems > 0 && src_master != src_exec.
get()) {
768 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
769 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
771 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
772 src_master->free(master_ptr);
775 this->
template log<log::Logger::copy_completed>(
776 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
777 if (
this != src_exec.
get()) {
778 src_exec->template log<log::Logger::copy_completed>(
779 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
794 template <
typename T>
797 this->
copy_from(
this, num_elems, src_ptr, dest_ptr);
809 template <
typename T>
813 this->
get_master()->copy_from(
this, 1, ptr, &out);
821 virtual std::shared_ptr<Executor>
get_master() noexcept = 0;
839 void add_logger(std::shared_ptr<const log::Logger> logger)
override
841 this->propagating_logger_refcount_.fetch_add(
842 logger->needs_propagation() ? 1 : 0);
843 this->EnableLogging<Executor>::add_logger(logger);
854 this->propagating_logger_refcount_.fetch_sub(
856 this->EnableLogging<Executor>::remove_logger(logger);
859 using EnableLogging<Executor>::remove_logger;
870 log_propagation_mode_ =
mode;
882 return this->propagating_logger_refcount_.load() > 0 &&
895 return this->verify_memory_from(other.get());
917 std::string device_type;
932 int num_computing_units = -1;
945 int num_pu_per_cu = -1;
955 std::vector<int> subgroup_sizes{};
965 int max_subgroup_size = -1;
977 std::vector<int> max_workitem_sizes{};
988 int max_workgroup_size;
1005 std::string pci_bus_id = std::string(13,
'x');
1017 std::vector<int> closest_pu_ids{};
1025 const exec_info& get_exec_info()
const {
return this->exec_info_; }
1036 virtual void* raw_alloc(
size_type size)
const = 0;
1045 virtual void raw_free(
void* ptr)
const noexcept = 0;
1057 virtual void raw_copy_from(
const Executor* src_exec,
size_type n_bytes,
1058 const void* src_ptr,
void* dest_ptr)
const = 0;
// Declares one pure virtual raw_copy_to overload per executor type; together
// with raw_copy_from (declared above) this forms the double-dispatch pair
// used to route raw memory copies to the correct backend implementation.
1069 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1070 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1071 const void* src_ptr, void* dest_ptr) const = 0
1073 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1075 #undef GKO_ENABLE_RAW_COPY_TO
1084 virtual bool verify_memory_from(
const Executor* src_exec)
const = 0;
// Declares one pure virtual verify_memory_to overload per executor type;
// used by verify_memory_from (above) for double dispatch when checking
// whether two executors share accessible memory.
1095 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1096 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1098 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
// ReferenceExecutor is not in the GKO_ENABLE_FOR_ALL_EXECUTORS list, so its
// overload is declared explicitly.
1100 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1102 #undef GKO_ENABLE_VERIFY_MEMORY_TO
1110 virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
1117 exec_info& get_exec_info() {
return this->exec_info_; }
1119 exec_info exec_info_;
1123 std::atomic<int> propagating_logger_refcount_{};
1140 template <
typename ClosureReference,
typename ClosureOmp,
1141 typename ClosureCuda,
typename ClosureHip,
typename ClosureDpcpp>
1142 class LambdaOperation :
public Operation {
1144 LambdaOperation(std::string name,
const ClosureReference& op_ref,
1145 const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1146 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1147 : name_(std::move(name)),
1165 LambdaOperation(
const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1166 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1167 : LambdaOperation(
"unnamed", op_omp, op_omp, op_cuda, op_hip,
1171 void run(std::shared_ptr<const OmpExecutor>)
const override
1176 void run(std::shared_ptr<const ReferenceExecutor>)
const override
1181 void run(std::shared_ptr<const CudaExecutor>)
const override
1186 void run(std::shared_ptr<const HipExecutor>)
const override
1191 void run(std::shared_ptr<const DpcppExecutor>)
const override
1196 const char* get_name() const noexcept
override {
return name_.c_str(); }
1200 ClosureReference op_ref_;
1202 ClosureCuda op_cuda_;
1204 ClosureDpcpp op_dpcpp_;
1217 template <
typename T>
1244 std::shared_ptr<const Executor> exec_;
1248 template <
typename T>
1251 using pointer = T[];
1265 std::shared_ptr<const Executor> exec_;
1272 template <
typename ConcreteExecutor>
1273 class ExecutorBase :
public Executor {
1276 friend class ::gko::OmpExecutor;
1277 friend class ::gko::HipExecutor;
1278 friend class ::gko::DpcppExecutor;
1279 friend class ::gko::CudaExecutor;
1280 friend class ::gko::ReferenceExecutor;
1283 void run(
const Operation& op)
const override
1285 this->
template log<log::Logger::operation_launched>(
this, &op);
1286 auto scope_guard = get_scoped_device_id_guard();
1287 op.run(
self()->shared_from_this());
1288 this->
template log<log::Logger::operation_completed>(
this, &op);
1292 void raw_copy_from(
const Executor* src_exec,
size_type n_bytes,
1293 const void* src_ptr,
void* dest_ptr)
const override
1295 src_exec->raw_copy_to(
self(), n_bytes, src_ptr, dest_ptr);
1298 virtual bool verify_memory_from(
const Executor* src_exec)
const override
1300 return src_exec->verify_memory_to(
self());
1304 ConcreteExecutor*
self() noexcept
1306 return static_cast<ConcreteExecutor*>(
this);
1309 const ConcreteExecutor*
self()
const noexcept
1311 return static_cast<const ConcreteExecutor*>(
this);
1315 #undef GKO_DECLARE_EXECUTOR_FRIEND
1325 class EnableDeviceReset {
1333 "device_reset is no longer supported, call "
1334 "cudaDeviceReset/hipDeviceReset manually")
1335 void set_device_reset(
bool device_reset) {}
1343 "device_reset is no longer supported, call "
1344 "cudaDeviceReset/hipDeviceReset manually")
1345 bool get_device_reset() {
return false; }
1353 EnableDeviceReset() {}
1356 "device_reset is no longer supported, call "
1357 "cudaDeviceReset/hipDeviceReset manually")
1358 EnableDeviceReset(
bool device_reset) {}
// Declares (in a concrete executor) the raw_copy_to override matching the
// pure virtual introduced by GKO_ENABLE_RAW_COPY_TO above — the receiving
// side of the raw-copy double dispatch.
1365 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1366 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1367 const void* src_ptr, void* dest_ptr) const override
// Defines a verify_memory_to override whose answer is the fixed constant
// `bool_`; the static_assert(true, ...) consumes the semicolon at the
// expansion site. NOTE(review): the macro's body lines between the signature
// and the static_assert are elided in this view; code reproduced unchanged.
1370 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1371 virtual bool verify_memory_to(const dest_* other) const override \
1375 static_assert(true, \
1376 "This assert is used to counter the false positive extra " \
1377 "semi-colon warnings")
1388 public std::enable_shared_from_this<OmpExecutor> {
1389 friend class detail::ExecutorBase<OmpExecutor>;
1398 std::shared_ptr<CpuAllocatorBase> alloc =
1399 std::make_shared<CpuAllocator>())
1401 return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(alloc)));
1404 std::shared_ptr<Executor> get_master() noexcept override;
1406 std::shared_ptr<const
Executor> get_master() const noexcept override;
1408 void synchronize() const override;
1410 int get_num_cores()
const
1412 return this->get_exec_info().num_computing_units;
1415 int get_num_threads_per_core()
const
1417 return this->get_exec_info().num_pu_per_cu;
1420 static int get_num_omp_threads();
1422 scoped_device_id_guard get_scoped_device_id_guard()
const override;
1424 std::string get_description()
const override;
1427 OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1428 : alloc_{std::move(alloc)}
1433 void populate_exec_info(
const machine_topology* mach_topo)
override;
1435 void* raw_alloc(
size_type size)
const override;
1437 void raw_free(
void* ptr)
const noexcept
override;
1439 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1441 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
true);
1443 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1445 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1447 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1449 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
1451 std::shared_ptr<CpuAllocatorBase> alloc_;
1457 using DefaultExecutor = OmpExecutor;
1473 static std::shared_ptr<ReferenceExecutor> create(
1474 std::shared_ptr<CpuAllocatorBase> alloc =
1475 std::make_shared<CpuAllocator>())
1477 return std::shared_ptr<ReferenceExecutor>(
1490 this->
template log<log::Logger::operation_launched>(
this, &op);
1491 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1492 this->shared_from_this()));
1493 this->
template log<log::Logger::operation_completed>(
this, &op);
1500 this->ReferenceExecutor::populate_exec_info(
1504 void populate_exec_info(
const machine_topology*)
override
1506 this->get_exec_info().device_id = -1;
1507 this->get_exec_info().num_computing_units = 1;
1508 this->get_exec_info().num_pu_per_cu = 1;
1511 bool verify_memory_from(
const Executor* src_exec)
const override
1513 return src_exec->verify_memory_to(
this);
1516 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
true);
1518 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1520 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1522 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1524 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1529 namespace reference {
1530 using DefaultExecutor = ReferenceExecutor;
1542 public std::enable_shared_from_this<CudaExecutor>,
1543 public detail::EnableDeviceReset {
1544 friend class detail::ExecutorBase<CudaExecutor>;
1561 "calling this CudaExecutor::create method is deprecated, because"
1562 "device_reset no longer has an effect"
1563 "call CudaExecutor::create("
1564 " int device_id, std::shared_ptr<Executor> master,"
1565 " std::shared_ptr<CudaAllocatorBase> alloc,"
1566 " CUstream_st* stream);"
1568 static std::shared_ptr<CudaExecutor> create(
1569 int device_id, std::shared_ptr<Executor> master,
bool device_reset,
1571 CUstream_st* stream =
nullptr);
1582 static std::shared_ptr<CudaExecutor> create(
1583 int device_id, std::shared_ptr<Executor> master,
1584 std::shared_ptr<CudaAllocatorBase> alloc =
1585 std::make_shared<CudaAllocator>(),
1586 CUstream_st* stream =
nullptr);
1588 std::shared_ptr<Executor> get_master() noexcept
override;
1590 std::shared_ptr<const Executor> get_master()
const noexcept
override;
1592 void synchronize()
const override;
1596 std::string get_description()
const override;
1603 return this->get_exec_info().device_id;
1609 static int get_num_devices();
1616 return this->get_exec_info().num_pu_per_cu;
1624 return this->get_exec_info().num_computing_units;
1632 return this->get_exec_info().num_computing_units *
1633 this->get_exec_info().num_pu_per_cu;
1641 return this->get_exec_info().max_subgroup_size;
1649 return this->get_exec_info().major;
1657 return this->get_exec_info().minor;
1665 return this->get_major_version() * 10 + this->get_minor_version();
1673 GKO_DEPRECATED(
"use get_blas_handle() instead")
1674 cublasContext* get_cublas_handle()
const {
return get_blas_handle(); }
1686 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1687 cusparseContext* get_cusparse_handle()
const
1689 return get_sparselib_handle();
1697 return cusparse_handle_.get();
1707 return this->get_exec_info().closest_pu_ids;
1726 void set_gpu_property();
1728 void init_handles();
1730 CudaExecutor(
int device_id, std::shared_ptr<Executor> master,
1731 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1732 : master_(master), alloc_{std::move(alloc)}, stream_{stream}
1734 this->get_exec_info().device_id = device_id;
1735 this->get_exec_info().num_computing_units = 0;
1736 this->get_exec_info().num_pu_per_cu = 0;
1737 this->CudaExecutor::populate_exec_info(
1739 this->set_gpu_property();
1740 this->init_handles();
1743 void* raw_alloc(
size_type size)
const override;
1745 void raw_free(
void* ptr)
const noexcept
override;
1747 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1749 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1751 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1753 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1755 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1757 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1759 void populate_exec_info(
const machine_topology* mach_topo)
override;
1762 std::shared_ptr<Executor> master_;
1764 template <
typename T>
1765 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1766 handle_manager<cublasContext> cublas_handle_;
1767 handle_manager<cusparseContext> cusparse_handle_;
1768 std::shared_ptr<CudaAllocatorBase> alloc_;
1769 CUstream_st* stream_;
1775 using DefaultExecutor = CudaExecutor;
1787 public std::enable_shared_from_this<HipExecutor>,
1788 public detail::EnableDeviceReset {
1806 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1807 "alloc_mode was replaced by the Allocator type "
1809 static std::shared_ptr<HipExecutor>
create(
1810 int device_id, std::shared_ptr<Executor> master,
bool device_reset,
1812 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1814 static std::shared_ptr<HipExecutor>
create(
1815 int device_id, std::shared_ptr<Executor> master,
1816 std::shared_ptr<HipAllocatorBase>
alloc =
1817 std::make_shared<HipAllocator>(),
1818 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1820 std::shared_ptr<Executor>
get_master() noexcept
override;
1822 std::shared_ptr<const Executor>
get_master()
const noexcept
override;
1835 return this->get_exec_info().device_id;
1848 return this->get_exec_info().num_pu_per_cu;
1856 return this->get_exec_info().num_computing_units;
1864 return this->get_exec_info().major;
1872 return this->get_exec_info().minor;
1880 return this->get_exec_info().num_computing_units *
1881 this->get_exec_info().num_pu_per_cu;
1889 return this->get_exec_info().max_subgroup_size;
1897 GKO_DEPRECATED(
"use get_blas_handle() instead")
1910 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1921 return hipsparse_handle_.get();
1938 return this->get_exec_info().closest_pu_ids;
1941 GKO_HIP_STREAM_STRUCT* get_stream()
const {
return stream_; }
1944 void set_gpu_property();
1946 void init_handles();
1948 HipExecutor(
int device_id, std::shared_ptr<Executor> master,
1949 std::shared_ptr<HipAllocatorBase>
alloc,
1950 GKO_HIP_STREAM_STRUCT* stream)
1951 : master_{std::move(master)}, alloc_{std::move(
alloc)}, stream_{stream}
1953 this->get_exec_info().device_id = device_id;
1954 this->get_exec_info().num_computing_units = 0;
1955 this->get_exec_info().num_pu_per_cu = 0;
1957 this->set_gpu_property();
1958 this->init_handles();
1961 void* raw_alloc(
size_type size)
const override;
1963 void raw_free(
void* ptr)
const noexcept
override;
1965 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1967 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1969 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1971 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1973 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1975 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1977 void populate_exec_info(
const machine_topology* mach_topo)
override;
1980 std::shared_ptr<Executor> master_;
1982 template <
typename T>
1983 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1984 handle_manager<hipblasContext> hipblas_handle_;
1985 handle_manager<hipsparseContext> hipsparse_handle_;
1986 std::shared_ptr<HipAllocatorBase> alloc_;
1987 GKO_HIP_STREAM_STRUCT* stream_;
1993 using DefaultExecutor = HipExecutor;
2005 public std::enable_shared_from_this<DpcppExecutor> {
2020 static std::shared_ptr<DpcppExecutor>
create(
2021 int device_id, std::shared_ptr<Executor> master,
2022 std::string device_type =
"all",
2023 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2025 std::shared_ptr<Executor>
get_master() noexcept
override;
2027 std::shared_ptr<const Executor>
get_master()
const noexcept
override;
2042 return this->get_exec_info().device_id;
2045 sycl::queue* get_queue()
const {
return queue_.get(); }
2063 return this->get_exec_info().subgroup_sizes;
2073 return this->get_exec_info().num_computing_units;
2081 return this->get_exec_info().num_computing_units *
2082 this->get_exec_info().num_pu_per_cu;
2092 return this->get_exec_info().max_workitem_sizes;
2102 return this->get_exec_info().max_workgroup_size;
2112 return this->get_exec_info().max_subgroup_size;
2122 return this->get_exec_info().device_type;
2126 void set_device_property(
2127 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2130 int device_id, std::shared_ptr<Executor> master,
2131 std::string device_type =
"all",
2132 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2135 std::for_each(device_type.begin(), device_type.end(),
2136 [](
char& c) { c = std::tolower(c); });
2137 this->get_exec_info().device_type = std::string(device_type);
2138 this->get_exec_info().device_id = device_id;
2139 this->set_device_property(property);
2142 void populate_exec_info(
const machine_topology* mach_topo)
override;
2144 void* raw_alloc(
size_type size)
const override;
2146 void raw_free(
void* ptr)
const noexcept
override;
2148 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2150 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
2152 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
2154 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
2156 bool verify_memory_to(
const OmpExecutor* dest_exec)
const override;
2158 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
2161 std::shared_ptr<Executor> master_;
2163 template <
typename T>
2164 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2165 queue_manager<sycl::queue> queue_;
2171 using DefaultExecutor = DpcppExecutor;
2176 #undef GKO_OVERRIDE_RAW_COPY_TO
2182 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_