#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_


#include <algorithm>
#include <atomic>
#include <cctype>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>

#include <ginkgo/core/base/device.hpp>
#include <ginkgo/core/base/fwd_decls.hpp>
#include <ginkgo/core/base/machine_topology.hpp>
#include <ginkgo/core/base/memory.hpp>
#include <ginkgo/core/base/scoped_device_id_guard.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/base/utils_helper.hpp>
#include <ginkgo/core/log/logger.hpp>
#include <ginkgo/core/synthesizer/containers.hpp>


namespace gko {
#ifdef NDEBUG

// In release builds, prefer plain device allocations.
constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;

constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// In debug builds, use unified memory so device buffers can be inspected
// from the host.
constexpr allocation_mode default_cuda_alloc_mode =
    allocation_mode::unified_global;

#if (GINKGO_HIP_PLATFORM_HCC == 1)

// HIP on AMD GPUs does not support unified memory, so fall back to device
// allocations.
constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// HIP on NVIDIA GPUs supports unified memory.
constexpr allocation_mode default_hip_alloc_mode =
    allocation_mode::unified_global;

#endif

#endif  // NDEBUG
enum class dpcpp_queue_property {
    // the queue executes tasks in submission order
    in_order = 1,
    // the queue collects profiling information for its tasks
    enable_profiling = 2
};


GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
                                                         dpcpp_queue_property b)
{
    return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
                                             static_cast<int>(b));
}
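// The properties form a bitmask, so they can be combined with the operator
// above. A usage sketch (illustrative, not part of this header):
//
//     auto props = dpcpp_queue_property::in_order |
//                  dpcpp_queue_property::enable_profiling;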
#define GKO_FORWARD_DECLARE(_type, ...) class _type

GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);

#undef GKO_FORWARD_DECLARE


class ReferenceExecutor;
class Operation {
public:
#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
    virtual void run(std::shared_ptr<const _type>) const

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);

#undef GKO_DECLARE_RUN_OVERLOAD

    virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;

    virtual const char* get_name() const noexcept;
};
namespace detail {


template <typename Closure>
class RegisteredOperation : public Operation {
public:
    RegisteredOperation(const char* name, Closure op)
        : name_(name), op_(std::move(op))
    {}

    const char* get_name() const noexcept override { return name_; }

    void run(std::shared_ptr<const ReferenceExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const OmpExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const CudaExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const HipExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const DpcppExecutor> exec) const override
    {
        op_(exec);
    }

private:
    const char* name_;
    Closure op_;
};
template <typename Closure>
RegisteredOperation<Closure> make_register_operation(const char* name,
                                                     Closure op)
{
    return RegisteredOperation<Closure>{name, std::move(op)};
}


}  // namespace detail
#define GKO_REGISTER_OPERATION(_name, _kernel)                                \
    template <typename... Args>                                               \
    auto make_##_name(Args&&... args)                                         \
    {                                                                         \
        return ::gko::detail::make_register_operation(                        \
            #_kernel, [&args...](auto exec) {                                 \
                using exec_type = decltype(exec);                             \
                if (std::is_same<                                             \
                        exec_type,                                            \
                        std::shared_ptr<const ::gko::ReferenceExecutor>>::    \
                        value) {                                              \
                    ::gko::kernels::reference::_kernel(                       \
                        std::dynamic_pointer_cast<                            \
                            const ::gko::ReferenceExecutor>(exec),            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::OmpExecutor>>::   \
                               value) {                                       \
                    ::gko::kernels::omp::_kernel(                             \
                        std::dynamic_pointer_cast<const ::gko::OmpExecutor>(  \
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::CudaExecutor>>::  \
                               value) {                                       \
                    ::gko::kernels::cuda::_kernel(                            \
                        std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::HipExecutor>>::   \
                               value) {                                       \
                    ::gko::kernels::hip::_kernel(                             \
                        std::dynamic_pointer_cast<const ::gko::HipExecutor>(  \
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else if (std::is_same<                                      \
                               exec_type,                                     \
                               std::shared_ptr<const ::gko::DpcppExecutor>>:: \
                               value) {                                       \
                    ::gko::kernels::dpcpp::_kernel(                           \
                        std::dynamic_pointer_cast<const ::gko::DpcppExecutor>(\
                            exec),                                            \
                        std::forward<Args>(args)...);                         \
                } else {                                                      \
                    GKO_NOT_IMPLEMENTED;                                      \
                }                                                             \
            });                                                               \
    }                                                                         \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")
#define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \
    template <typename... Args>                                              \
    auto make_##_name(Args&&... args)                                        \
    {                                                                        \
        return ::gko::detail::make_register_operation(                       \
            #_kernel,                                                        \
            [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")
#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type


class Executor : public log::EnableLogging<Executor> {
    template <typename T>
    friend class detail::ExecutorBase;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);

public:
    virtual ~Executor() = default;

    virtual void run(const Operation& op) const = 0;

    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
              typename ClosureDpcpp>
    void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
             const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
    {
        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
            op_omp, op_cuda, op_hip, op_dpcpp);
        this->run(op);
    }
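    // Usage sketch for the closure overload (illustrative only): each closure
    // takes no arguments, and the one matching this executor's backend runs.
    //
    //     exec->run(
    //         [&]() { /* OpenMP and Reference version */ },
    //         [&]() { /* CUDA version */ },
    //         [&]() { /* HIP version */ },
    //         [&]() { /* DPC++ version */ });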
    template <typename T>
    T* alloc(size_type num_elems) const
    {
        this->template log<log::Logger::allocation_started>(
            this, num_elems * sizeof(T));
        T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
        this->template log<log::Logger::allocation_completed>(
            this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
        return allocated;
    }
    void free(void* ptr) const noexcept
    {
        this->template log<log::Logger::free_started>(
            this, reinterpret_cast<uintptr>(ptr));
        this->raw_free(ptr);
        this->template log<log::Logger::free_completed>(
            this, reinterpret_cast<uintptr>(ptr));
    }
    template <typename T>
    void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
                   const T* src_ptr, T* dest_ptr) const
    {
        const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
        const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
        this->template log<log::Logger::copy_started>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_started>(
                src_exec.get(), this, src_loc, dest_loc,
                num_elems * sizeof(T));
        }
        try {
            this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
                                dest_ptr);
        } catch (NotSupported&) {
#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
            // direct copy is not supported; report it in verbose debug builds
            std::clog << "Not direct copy. Try to copy data from the masters."
                      << std::endl;
#endif
            auto src_master = src_exec->get_master().get();
            if (num_elems > 0 && src_master != src_exec.get()) {
                auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
                src_master->copy_from<T>(src_exec, num_elems, src_ptr,
                                         master_ptr);
                this->copy_from<T>(src_master, num_elems, master_ptr,
                                   dest_ptr);
                src_master->free(master_ptr);
            }
        }
        this->template log<log::Logger::copy_completed>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_completed>(
                src_exec.get(), this, src_loc, dest_loc,
                num_elems * sizeof(T));
        }
    }
    template <typename T>
    void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
    {
        this->copy_from(this, num_elems, src_ptr, dest_ptr);
    }
    template <typename T>
    T copy_val_to_host(const T* ptr) const
    {
        T out{};
        this->get_master()->copy_from(this, 1, ptr, &out);
        return out;
    }
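    // A usage sketch tying the allocation and copy API together (assumes
    // `exec` is a std::shared_ptr to a device executor and `host_data` points
    // to n valid elements on its master):
    //
    //     auto* dev = exec->alloc<double>(n);
    //     exec->copy_from(exec->get_master(), n, host_data, dev);
    //     double first = exec->copy_val_to_host(dev);
    //     exec->free(dev);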
    virtual std::shared_ptr<Executor> get_master() noexcept = 0;

    virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;

    virtual void synchronize() const = 0;
    void add_logger(std::shared_ptr<const log::Logger> logger) override
    {
        this->propagating_logger_refcount_.fetch_add(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::add_logger(logger);
    }

    void remove_logger(const log::Logger* logger) override
    {
        this->propagating_logger_refcount_.fetch_sub(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::remove_logger(logger);
    }

    using EnableLogging<Executor>::remove_logger;
    void set_log_propagation_mode(log_propagation_mode mode)
    {
        log_propagation_mode_ = mode;
    }

    bool should_propagate_log() const
    {
        return this->propagating_logger_refcount_.load() > 0 &&
               log_propagation_mode_ == log_propagation_mode::automatic;
    }
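    // Logger propagation in a nutshell (a sketch; `my_logger` is
    // hypothetical): loggers whose needs_propagation() returns true receive
    // events from objects associated with this executor, unless propagation
    // is switched off.
    //
    //     exec->add_logger(my_logger);
    //     exec->set_log_propagation_mode(log_propagation_mode::never);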
    bool memory_accessible(const std::shared_ptr<const Executor>& other) const
    {
        return this->verify_memory_from(other.get());
    }

    virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
protected:
    /**
     * Stores the device information of an executor.
     */
    struct exec_info {
        // the id of the device
        int device_id = -1;

        // the type of the device
        std::string device_type;

        // the major and minor version of the device architecture
        int major = -1;
        int minor = -1;

        // the number of computing units (e.g. cores, SMs)
        int num_computing_units = -1;

        // the number of processing units per computing unit
        int num_pu_per_cu = -1;

        // the subgroup sizes supported by the device
        std::vector<int> subgroup_sizes{};

        // the maximum subgroup size
        int max_subgroup_size = -1;

        // the maximum work item sizes per dimension
        std::vector<int> max_workitem_sizes{};

        // the maximum work group size
        int max_workgroup_size;

        // the PCI bus id of the device
        std::string pci_bus_id = std::string(13, 'x');

        // the ids of the processing units closest to the device
        std::vector<int> closest_pu_ids{};
    };
    const exec_info& get_exec_info() const { return this->exec_info_; }
    virtual void* raw_alloc(size_type size) const = 0;

    virtual void raw_free(void* ptr) const noexcept = 0;

    virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                               const void* src_ptr, void* dest_ptr) const = 0;
#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...)                              \
    virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
                             const void* src_ptr, void* dest_ptr) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);

#undef GKO_ENABLE_RAW_COPY_TO
    virtual bool verify_memory_from(const Executor* src_exec) const = 0;

#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
    virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);

    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);

#undef GKO_ENABLE_VERIFY_MEMORY_TO
    virtual void populate_exec_info(const machine_topology* mach_topo) = 0;

    exec_info& get_exec_info() { return this->exec_info_; }

    exec_info exec_info_;

private:
    log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};

    std::atomic<int> propagating_logger_refcount_{};
    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
              typename ClosureDpcpp>
    class LambdaOperation : public Operation {
    public:
        LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
            : op_omp_(op_omp),
              op_cuda_(op_cuda),
              op_hip_(op_hip),
              op_dpcpp_(op_dpcpp)
        {}

        void run(std::shared_ptr<const OmpExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const ReferenceExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const CudaExecutor>) const override
        {
            op_cuda_();
        }

        void run(std::shared_ptr<const HipExecutor>) const override
        {
            op_hip_();
        }

        void run(std::shared_ptr<const DpcppExecutor>) const override
        {
            op_dpcpp_();
        }

    private:
        ClosureOmp op_omp_;
        ClosureCuda op_cuda_;
        ClosureHip op_hip_;
        ClosureDpcpp op_dpcpp_;
    };
};
namespace detail {


template <typename T>
class executor_deleter {
public:
    using pointer = T*;

    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    // returns the memory to the executor that allocated it
    void operator()(pointer ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};

// specialization for arrays
template <typename T>
class executor_deleter<T[]> {
public:
    using pointer = T[];

    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    void operator()(T* ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};
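// Usage sketch for executor_deleter (illustrative): it lets a smart pointer
// return memory to the executor that allocated it.
//
//     std::unique_ptr<int[], executor_deleter<int[]>> data{
//         exec->alloc<int>(n), executor_deleter<int[]>{exec}};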
template <typename ConcreteExecutor>
class ExecutorBase : public Executor {
    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);

public:
    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        auto scope_guard = get_scoped_device_id_guard();
        op.run(self()->shared_from_this());
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                       const void* src_ptr, void* dest_ptr) const override
    {
        src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
    }

    virtual bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(self());
    }

private:
    // casts this executor to its concrete (CRTP) type
    ConcreteExecutor* self() noexcept
    {
        return static_cast<ConcreteExecutor*>(this);
    }

    const ConcreteExecutor* self() const noexcept
    {
        return static_cast<const ConcreteExecutor*>(this);
    }
};

#undef GKO_DECLARE_EXECUTOR_FRIEND
class EnableDeviceReset {
public:
    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    void set_device_reset(bool device_reset) {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    bool get_device_reset() { return false; }

protected:
    EnableDeviceReset() {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    EnableDeviceReset(bool device_reset) {}
};


}  // namespace detail
#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...)                    \
    void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
                     const void* src_ptr, void* dest_ptr) const override


#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                      \
    virtual bool verify_memory_to(const dest_* other) const override          \
    {                                                                         \
        return bool_;                                                         \
    }                                                                         \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")
class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
                    public std::enable_shared_from_this<OmpExecutor> {
    friend class detail::ExecutorBase<OmpExecutor>;

public:
    static std::shared_ptr<OmpExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
    }

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    int get_num_cores() const
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_threads_per_core() const
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    static int get_num_omp_threads();

    scoped_device_id_guard get_scoped_device_id_guard() const override;

protected:
    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : alloc_{std::move(alloc)}
    {
        this->OmpExecutor::populate_exec_info(
            machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

private:
    std::shared_ptr<CpuAllocatorBase> alloc_;
};
namespace kernels {
namespace omp {
using DefaultExecutor = OmpExecutor;
}  // namespace omp
}  // namespace kernels
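// Creating and using an OmpExecutor (a sketch): OmpExecutor is its own
// master, so get_master() returns the executor itself.
//
//     auto omp = gko::OmpExecutor::create();
//     auto* x = omp->alloc<double>(16);
//     omp->free(x);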
class ReferenceExecutor : public OmpExecutor {
public:
    static std::shared_ptr<ReferenceExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<ReferenceExecutor>(
            new ReferenceExecutor(std::move(alloc)));
    }

    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        op.run(std::static_pointer_cast<const ReferenceExecutor>(
            this->shared_from_this()));
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : OmpExecutor{std::move(alloc)}
    {
        this->ReferenceExecutor::populate_exec_info(
            machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology*) override
    {
        this->get_exec_info().device_id = -1;
        this->get_exec_info().num_computing_units = 1;
        this->get_exec_info().num_pu_per_cu = 1;
    }

    bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(this);
    }

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
};


namespace kernels {
namespace reference {
using DefaultExecutor = ReferenceExecutor;
}  // namespace reference
}  // namespace kernels
class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
                     public std::enable_shared_from_this<CudaExecutor>,
                     public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<CudaExecutor>;

public:
    GKO_DEPRECATED(
        "calling this CudaExecutor::create method is deprecated, because "
        "device_reset no longer has an effect; "
        "call CudaExecutor::create("
        "    int device_id, std::shared_ptr<Executor> master, "
        "    std::shared_ptr<CudaAllocatorBase> alloc, "
        "    CUstream_st* stream) instead")
    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_cuda_alloc_mode,
        CUstream_st* stream = nullptr);

    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<CudaAllocatorBase> alloc =
            std::make_shared<CudaAllocator>(),
        CUstream_st* stream = nullptr);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    static int get_num_devices();

    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    cusparseContext* get_cusparse_handle() const
    {
        return cusparse_handle_.get();
    }

    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

protected:
    void set_gpu_property();

    void init_handles();

    CudaExecutor(int device_id, std::shared_ptr<Executor> master,
                 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
        : alloc_{std::move(alloc)}, master_(master), stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->CudaExecutor::populate_exec_info(
            machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<cublasContext> cublas_handle_;
    handle_manager<cusparseContext> cusparse_handle_;
    std::shared_ptr<CudaAllocatorBase> alloc_;
    CUstream_st* stream_;
};


namespace kernels {
namespace cuda {
using DefaultExecutor = CudaExecutor;
}  // namespace cuda
}  // namespace kernels
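// Creating a CudaExecutor (a sketch): device executors take a host master
// executor for staging copies; device 0 and the default stream are used here.
//
//     auto cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
//     cuda->synchronize();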
class HipExecutor : public detail::ExecutorBase<HipExecutor>,
                    public std::enable_shared_from_this<HipExecutor>,
                    public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<HipExecutor>;

public:
    GKO_DEPRECATED(
        "device_reset is deprecated entirely, call hipDeviceReset directly. "
        "alloc_mode was replaced by the Allocator type "
        "hierarchy.")
    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_hip_alloc_mode,
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<HipAllocatorBase> alloc =
            std::make_shared<HipAllocator>(),
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    hipsparseContext* get_hipsparse_handle() const
    {
        return hipsparse_handle_.get();
    }

    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    HipExecutor(int device_id, std::shared_ptr<Executor> master,
                std::shared_ptr<HipAllocatorBase> alloc,
                GKO_HIP_STREAM_STRUCT* stream)
        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->HipExecutor::populate_exec_info(
            machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<hipblasContext> hipblas_handle_;
    handle_manager<hipsparseContext> hipsparse_handle_;
    std::shared_ptr<HipAllocatorBase> alloc_;
    GKO_HIP_STREAM_STRUCT* stream_;
};


namespace kernels {
namespace hip {
using DefaultExecutor = HipExecutor;
}  // namespace hip
}  // namespace kernels
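// HipExecutor is created analogously to CudaExecutor (a sketch):
//
//     auto hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());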
class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
                      public std::enable_shared_from_this<DpcppExecutor> {
    friend class detail::ExecutorBase<DpcppExecutor>;

public:
    static std::shared_ptr<DpcppExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    sycl::queue* get_queue() const { return queue_.get(); }

    const std::vector<int>& get_subgroup_sizes() const noexcept
    {
        return this->get_exec_info().subgroup_sizes;
    }

    int get_num_computing_units() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_subgroups() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    const std::vector<int>& get_max_workitem_sizes() const noexcept
    {
        return this->get_exec_info().max_workitem_sizes;
    }

    int get_max_workgroup_size() const noexcept
    {
        return this->get_exec_info().max_workgroup_size;
    }

    int get_max_subgroup_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    std::string get_device_type() const noexcept
    {
        return this->get_exec_info().device_type;
    }

protected:
    void set_device_property(
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    DpcppExecutor(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order)
        : master_(master)
    {
        std::for_each(device_type.begin(), device_type.end(),
                      [](char& c) { c = std::tolower(c); });
        this->get_exec_info().device_type = std::string(device_type);
        this->get_exec_info().device_id = device_id;
        this->set_device_property(property);
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    bool verify_memory_to(const OmpExecutor* dest_exec) const override;

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
    queue_manager<sycl::queue> queue_;
};


namespace kernels {
namespace dpcpp {
using DefaultExecutor = DpcppExecutor;
}  // namespace dpcpp
}  // namespace kernels
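// Creating a DpcppExecutor (a sketch): device_type filters the SYCL devices
// (e.g. "all", "cpu", "gpu"), and the queue defaults to in-order execution.
//
//     auto dpcpp = gko::DpcppExecutor::create(0, gko::OmpExecutor::create());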
#undef GKO_OVERRIDE_RAW_COPY_TO


}  // namespace gko


#endif  // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_