Ginkgo 1.8.0
A numerical linear algebra library targeting many-core architectures

executor.hpp
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_

#include <algorithm>   // std::for_each (used in the DpcppExecutor constructor)
#include <array>
#include <atomic>
#include <cctype>      // std::tolower (used in the DpcppExecutor constructor)
#include <functional>  // std::function (used by handle_manager/queue_manager)
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>


#include <ginkgo/core/base/device.hpp>
#include <ginkgo/core/base/fwd_decls.hpp>
#include <ginkgo/core/base/machine_topology.hpp>
#include <ginkgo/core/base/memory.hpp>
#include <ginkgo/core/base/scoped_device_id_guard.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/log/logger.hpp>
#include <ginkgo/core/synthesizer/containers.hpp>


namespace gko {


/**
 * How Logger events are propagated to their Executor.
 */
enum class log_propagation_mode {
    /**
     * Events only get reported at loggers attached to the triggering object.
     */
    never,

    /**
     * Events get reported to loggers attached to the triggering object and
     * to propagating loggers (Logger::needs_propagation() returns true)
     * attached to its Executor.
     */
    automatic
};


/**
 * Specify the mode of allocation for CUDA/HIP GPUs.
 */
enum class allocation_mode { device, unified_global, unified_host };


#ifdef NDEBUG

// In release builds, prefer device allocations.
constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;

constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// In debug builds, always prefer unified-memory (UM) allocations.
constexpr allocation_mode default_cuda_alloc_mode =
    allocation_mode::unified_global;

#if (GINKGO_HIP_PLATFORM_HCC == 1)

// HIP on AMD GPUs does not support UM, so always prefer device allocations.
constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
constexpr allocation_mode default_hip_alloc_mode =
    allocation_mode::unified_global;

#endif

#endif


}  // namespace gko


/**
 * Properties of the DPC++/SYCL queue; several properties can be combined
 * with the binary or (|) operator defined below.
 */
enum class dpcpp_queue_property {
    in_order = 1,

    enable_profiling = 2
};

GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
                                                         dpcpp_queue_property b)
{
    return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
                                             static_cast<int>(b));
}
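
// A minimal usage sketch (not part of the header): the properties are bit
// flags, so several of them can be combined with the operator| defined above.
//
//     auto prop = dpcpp_queue_property::in_order |
//                 dpcpp_queue_property::enable_profiling;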


namespace gko {


#define GKO_FORWARD_DECLARE(_type, ...) class _type

GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);

#undef GKO_FORWARD_DECLARE


class ReferenceExecutor;


namespace detail {


template <typename>
class ExecutorBase;


}  // namespace detail


/**
 * Operations can be used to define functionalities whose implementations
 * differ among devices.
 */
class Operation {
public:
#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
    virtual void run(std::shared_ptr<const _type>) const

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);

#undef GKO_DECLARE_RUN_OVERLOAD

    // ReferenceExecutor overload can be defaulted to OmpExecutor's
    virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;

    /**
     * Returns the operation's name.
     */
    virtual const char* get_name() const noexcept;
};
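
// A sketch of a user-defined Operation (illustrative only; `my_op` and the
// kernel functions are hypothetical). Each overload dispatches to a
// device-specific implementation; overloads that are not overridden keep the
// library's default behavior.
//
//     class my_op : public gko::Operation {
//     public:
//         void run(std::shared_ptr<const gko::OmpExecutor>) const override
//         {
//             my_omp_kernel();  // hypothetical OpenMP implementation
//         }
//
//         void run(std::shared_ptr<const gko::CudaExecutor>) const override
//         {
//             my_cuda_kernel();  // hypothetical CUDA implementation
//         }
//     };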


namespace detail {


template <typename Closure>
class RegisteredOperation : public Operation {
public:
    RegisteredOperation(const char* name, Closure op)
        : name_(name), op_(std::move(op))
    {}

    const char* get_name() const noexcept override { return name_; }

    void run(std::shared_ptr<const ReferenceExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const OmpExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const CudaExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const HipExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const DpcppExecutor> exec) const override
    {
        op_(exec);
    }

private:
    const char* name_;
    Closure op_;
};


template <typename Closure>
RegisteredOperation<Closure> make_register_operation(const char* name,
                                                     Closure op)
{
    return RegisteredOperation<Closure>{name, std::move(op)};
}


}  // namespace detail


#define GKO_REGISTER_OPERATION(_name, _kernel)                               \
    template <typename... Args>                                              \
    auto make_##_name(Args&&... args)                                        \
    {                                                                        \
        return ::gko::detail::make_register_operation(                       \
            #_kernel, [&args...](auto exec) {                                \
                using exec_type = decltype(exec);                            \
                if (std::is_same<                                            \
                        exec_type,                                           \
                        std::shared_ptr<const ::gko::ReferenceExecutor>>::   \
                        value) {                                             \
                    ::gko::kernels::reference::_kernel(                      \
                        std::dynamic_pointer_cast<                           \
                            const ::gko::ReferenceExecutor>(exec),           \
                        std::forward<Args>(args)...);                        \
                } else if (std::is_same<                                     \
                               exec_type,                                    \
                               std::shared_ptr<const ::gko::OmpExecutor>>::  \
                               value) {                                      \
                    ::gko::kernels::omp::_kernel(                            \
                        std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
                            exec),                                           \
                        std::forward<Args>(args)...);                        \
                } else if (std::is_same<                                     \
                               exec_type,                                    \
                               std::shared_ptr<const ::gko::CudaExecutor>>:: \
                               value) {                                      \
                    ::gko::kernels::cuda::_kernel(                           \
                        std::dynamic_pointer_cast<                           \
                            const ::gko::CudaExecutor>(exec),                \
                        std::forward<Args>(args)...);                        \
                } else if (std::is_same<                                     \
                               exec_type,                                    \
                               std::shared_ptr<const ::gko::HipExecutor>>::  \
                               value) {                                      \
                    ::gko::kernels::hip::_kernel(                            \
                        std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
                            exec),                                           \
                        std::forward<Args>(args)...);                        \
                } else if (std::is_same<                                     \
                               exec_type,                                    \
                               std::shared_ptr<                              \
                                   const ::gko::DpcppExecutor>>::value) {    \
                    ::gko::kernels::dpcpp::_kernel(                          \
                        std::dynamic_pointer_cast<                           \
                            const ::gko::DpcppExecutor>(exec),               \
                        std::forward<Args>(args)...);                        \
                } else {                                                     \
                    GKO_NOT_IMPLEMENTED;                                     \
                }                                                            \
            });                                                              \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")

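// A usage sketch with hypothetical names. The macro assumes `my_kernel` is
// implemented in each of the kernels::reference, kernels::omp, kernels::cuda,
// kernels::hip and kernels::dpcpp namespaces; the generated make_my_op(...)
// factory captures its arguments by reference and dispatches to the
// implementation matching the executor's dynamic type.
//
//     GKO_REGISTER_OPERATION(my_op, my_kernel);
//
//     void call_site(std::shared_ptr<const gko::Executor> exec, int x)
//     {
//         exec->run(make_my_op(x));
//     }
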

#define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \
    template <typename... Args>                                              \
    auto make_##_name(Args&&... args)                                        \
    {                                                                        \
        return ::gko::detail::make_register_operation(                       \
            #_kernel,                                                        \
            [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")

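// A usage sketch with hypothetical names: in contrast to
// GKO_REGISTER_OPERATION, the generated closure ignores the executor type and
// always invokes the single host-side function.
//
//     void compute_permutation(int n);  // hypothetical host-only kernel
//
//     GKO_REGISTER_HOST_OPERATION(permutation, compute_permutation);
//
//     // at the call site: exec->run(make_permutation(n));
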
#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type

/**
 * The first step in using the Ginkgo library consists of creating an
 * executor: it specifies where the data of linear algebra objects is stored
 * and where the operations on that data are executed.
 */
class Executor : public log::EnableLogging<Executor> {
    template <typename T>
    friend class detail::ExecutorBase;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
    friend class ReferenceExecutor;

public:
    virtual ~Executor() = default;

    Executor() = default;
    Executor(Executor&) = delete;
    Executor(Executor&&) = delete;
    Executor& operator=(Executor&) = delete;
    Executor& operator=(Executor&&) = delete;

    /**
     * Runs the specified Operation using this Executor.
     */
    virtual void run(const Operation& op) const = 0;

    /**
     * Runs one of the passed in functors, depending on the Executor type.
     */
    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
              typename ClosureDpcpp>
    void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
             const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
    {
        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
            op_omp, op_cuda, op_hip, op_dpcpp);
        this->run(op);
    }

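    // A usage sketch with hypothetical lambdas: exactly one of the four
    // closures is executed, chosen by the dynamic type of this executor (the
    // OpenMP closure is also used by ReferenceExecutor).
    //
    //     exec->run([&] { /* OpenMP/reference path */ },
    //               [&] { /* CUDA path */ },
    //               [&] { /* HIP path */ },
    //               [&] { /* DPC++ path */ });
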
    /**
     * Allocates memory in this Executor.
     */
    template <typename T>
    T* alloc(size_type num_elems) const
    {
        this->template log<log::Logger::allocation_started>(
            this, num_elems * sizeof(T));
        T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
        this->template log<log::Logger::allocation_completed>(
            this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
        return allocated;
    }

    /**
     * Frees memory previously allocated with Executor::alloc().
     */
    void free(void* ptr) const noexcept
    {
        this->template log<log::Logger::free_started>(
            this, reinterpret_cast<uintptr>(ptr));
        this->raw_free(ptr);
        this->template log<log::Logger::free_completed>(
            this, reinterpret_cast<uintptr>(ptr));
    }

    /**
     * Copies data from another Executor.
     */
    template <typename T>
    void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
                   const T* src_ptr, T* dest_ptr) const
    {
        const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
        const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
        this->template log<log::Logger::copy_started>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_started>(
                src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        }
        try {
            this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
                                dest_ptr);
        } catch (NotSupported&) {
#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
            // Unoptimized copy: route the data through the master executors.
            // This is logged when GKO_VERBOSE_LEVEL >= 1 in debug builds.
            std::clog << "No direct copy available; copying the data via the "
                         "master executors."
                      << std::endl;
#endif
            auto src_master = src_exec->get_master().get();
            if (num_elems > 0 && src_master != src_exec.get()) {
                auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
                src_master->copy_from<T>(src_exec, num_elems, src_ptr,
                                         master_ptr);
                this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
                src_master->free(master_ptr);
            }
        }
        this->template log<log::Logger::copy_completed>(
            src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        if (this != src_exec.get()) {
            src_exec->template log<log::Logger::copy_completed>(
                src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
        }
    }

    /**
     * Copies data within this Executor.
     */
    template <typename T>
    void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
    {
        this->copy_from(this, num_elems, src_ptr, dest_ptr);
    }

    /**
     * Retrieves a single element at the given location from executor memory.
     */
    template <typename T>
    T copy_val_to_host(const T* ptr) const
    {
        T out{};
        this->get_master()->copy_from(this, 1, ptr, &out);
        return out;
    }

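    // A usage sketch: round-tripping a single value through executor memory.
    //
    //     auto host = exec->get_master();
    //     int* d = exec->alloc<int>(1);
    //     int v = 42;
    //     exec->copy_from(host, 1, &v, d);       // host -> device
    //     int back = exec->copy_val_to_host(d);  // back == 42
    //     exec->free(d);
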
    /**
     * Returns the master OmpExecutor of this Executor.
     */
    virtual std::shared_ptr<Executor> get_master() noexcept = 0;

    /**
     * @copydoc get_master
     */
    virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;

    /**
     * Synchronize the operations launched on the executor with its master.
     */
    virtual void synchronize() const = 0;

    void add_logger(std::shared_ptr<const log::Logger> logger) override
    {
        this->propagating_logger_refcount_.fetch_add(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::add_logger(logger);
    }

    void remove_logger(const log::Logger* logger) override
    {
        this->propagating_logger_refcount_.fetch_sub(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::remove_logger(logger);
    }

    using EnableLogging<Executor>::remove_logger;

    /**
     * Sets the logger event propagation mode for the executor.
     */
    void set_log_propagation_mode(log_propagation_mode mode)
    {
        log_propagation_mode_ = mode;
    }

    /**
     * Returns true iff events occurring at an object created on this executor
     * should be logged at propagating loggers attached to this executor.
     */
    bool should_propagate_log() const
    {
        return this->propagating_logger_refcount_.load() > 0 &&
               log_propagation_mode_ == log_propagation_mode::automatic;
    }

    /**
     * Verifies whether the executors share the same memory.
     */
    bool memory_accessible(const std::shared_ptr<const Executor>& other) const
    {
        return this->verify_memory_from(other.get());
    }

    virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;

protected:
    struct exec_info {
        int device_id = -1;

        std::string device_type;

        int numa_node = -1;

        int num_computing_units = -1;

        int num_pu_per_cu = -1;

        std::vector<int> subgroup_sizes{};

        int max_subgroup_size = -1;

        std::vector<int> max_workitem_sizes{};

        int max_workgroup_size;

        int major = -1;

        int minor = -1;

        std::string pci_bus_id = std::string(13, 'x');

        std::vector<int> closest_pu_ids{};
    };

    const exec_info& get_exec_info() const { return this->exec_info_; }

    virtual void* raw_alloc(size_type size) const = 0;

    virtual void raw_free(void* ptr) const noexcept = 0;

    virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                               const void* src_ptr, void* dest_ptr) const = 0;

#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...)                              \
    virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
                             const void* src_ptr, void* dest_ptr) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);

#undef GKO_ENABLE_RAW_COPY_TO

    virtual bool verify_memory_from(const Executor* src_exec) const = 0;

#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
    virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);

    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);

#undef GKO_ENABLE_VERIFY_MEMORY_TO

    virtual void populate_exec_info(const machine_topology* mach_topo) = 0;

    exec_info& get_exec_info() { return this->exec_info_; }

    exec_info exec_info_;

    log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};

    std::atomic<int> propagating_logger_refcount_{};

private:
    template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
              typename ClosureDpcpp>
    class LambdaOperation : public Operation {
    public:
        LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
            : op_omp_(op_omp),
              op_cuda_(op_cuda),
              op_hip_(op_hip),
              op_dpcpp_(op_dpcpp)
        {}

        void run(std::shared_ptr<const OmpExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const ReferenceExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const CudaExecutor>) const override
        {
            op_cuda_();
        }

        void run(std::shared_ptr<const HipExecutor>) const override
        {
            op_hip_();
        }

        void run(std::shared_ptr<const DpcppExecutor>) const override
        {
            op_dpcpp_();
        }

    private:
        ClosureOmp op_omp_;
        ClosureCuda op_cuda_;
        ClosureHip op_hip_;
        ClosureDpcpp op_dpcpp_;
    };
};


/**
 * This is a deleter that uses an executor's free method to deallocate the
 * data.
 */
template <typename T>
class executor_deleter {
public:
    using pointer = T*;

    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    void operator()(pointer ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};

// a specialization for arrays
template <typename T>
class executor_deleter<T[]> {
public:
    using pointer = T[];

    explicit executor_deleter(std::shared_ptr<const Executor> exec)
        : exec_{exec}
    {}

    void operator()(pointer ptr) const
    {
        if (exec_) {
            exec_->free(ptr);
        }
    }

private:
    std::shared_ptr<const Executor> exec_;
};

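// A usage sketch: pairing executor-allocated memory with std::unique_ptr so
// that it is released through the owning executor's free().
//
//     auto exec = gko::ReferenceExecutor::create();
//     std::unique_ptr<int, gko::executor_deleter<int>> value{
//         exec->alloc<int>(1), gko::executor_deleter<int>{exec}};
//     // exec->free(value.get()) is called automatically on scope exit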

namespace detail {


template <typename ConcreteExecutor>
class ExecutorBase : public Executor {
    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
    friend class ReferenceExecutor;

public:
    using Executor::run;

    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        auto scope_guard = get_scoped_device_id_guard();
        op.run(self()->shared_from_this());
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    void raw_copy_from(const Executor* src_exec, size_type n_bytes,
                       const void* src_ptr, void* dest_ptr) const override
    {
        src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
    }

    virtual bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(self());
    }

private:
    ConcreteExecutor* self() noexcept
    {
        return static_cast<ConcreteExecutor*>(this);
    }

    const ConcreteExecutor* self() const noexcept
    {
        return static_cast<const ConcreteExecutor*>(this);
    }
};

#undef GKO_DECLARE_EXECUTOR_FRIEND


class EnableDeviceReset {
public:
    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    void set_device_reset(bool device_reset) {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    bool get_device_reset() { return false; }

protected:
    EnableDeviceReset() {}

    GKO_DEPRECATED(
        "device_reset is no longer supported, call "
        "cudaDeviceReset/hipDeviceReset manually")
    EnableDeviceReset(bool device_reset) {}
};


}  // namespace detail


#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...)                    \
    void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
                     const void* src_ptr, void* dest_ptr) const override


#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                     \
    virtual bool verify_memory_to(const dest_* other) const override        \
    {                                                                        \
        return bool_;                                                        \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")


/**
 * This is the Executor subclass which represents the OpenMP device
 * (typically CPU).
 */
class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
                    public std::enable_shared_from_this<OmpExecutor> {
    friend class detail::ExecutorBase<OmpExecutor>;

public:
    /**
     * Creates a new OmpExecutor.
     */
    static std::shared_ptr<OmpExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
    }

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    int get_num_cores() const
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_threads_per_core() const
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    static int get_num_omp_threads();

    scoped_device_id_guard get_scoped_device_id_guard() const override;

protected:
    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : alloc_{std::move(alloc)}
    {
        this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

    std::shared_ptr<CpuAllocatorBase> alloc_;
};

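// A usage sketch: an OmpExecutor is self-contained, so it acts as its own
// master executor.
//
//     auto omp = gko::OmpExecutor::create();
//     assert(omp->get_master().get() == omp.get());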

namespace kernels {
namespace omp {
using DefaultExecutor = OmpExecutor;
}  // namespace omp
}  // namespace kernels


/**
 * This is a specialization of the OmpExecutor, which runs the reference
 * implementations of the kernels used for debugging purposes.
 */
class ReferenceExecutor : public OmpExecutor {
public:
    static std::shared_ptr<ReferenceExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<ReferenceExecutor>(
            new ReferenceExecutor(std::move(alloc)));
    }

    scoped_device_id_guard get_scoped_device_id_guard() const override
    {
        return {this, 0};
    }

    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        op.run(std::static_pointer_cast<const ReferenceExecutor>(
            this->shared_from_this()));
        this->template log<log::Logger::operation_completed>(this, &op);
    }

protected:
    ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : OmpExecutor{std::move(alloc)}
    {
        this->ReferenceExecutor::populate_exec_info(
            machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology*) override
    {
        this->get_exec_info().device_id = -1;
        this->get_exec_info().num_computing_units = 1;
        this->get_exec_info().num_pu_per_cu = 1;
    }

    bool verify_memory_from(const Executor* src_exec) const override
    {
        return src_exec->verify_memory_to(this);
    }

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
};


namespace kernels {
namespace reference {
using DefaultExecutor = ReferenceExecutor;
}  // namespace reference
}  // namespace kernels


/**
 * This is the Executor subclass which represents the CUDA device.
 */
class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
                     public std::enable_shared_from_this<CudaExecutor>,
                     public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<CudaExecutor>;

public:
    GKO_DEPRECATED(
        "calling this CudaExecutor::create method is deprecated, because "
        "device_reset no longer has an effect; call CudaExecutor::create("
        "int device_id, std::shared_ptr<Executor> master, "
        "std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream) "
        "instead")
    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_cuda_alloc_mode,
        CUstream_st* stream = nullptr);

    /**
     * Creates a new CudaExecutor.
     */
    static std::shared_ptr<CudaExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<CudaAllocatorBase> alloc =
            std::make_shared<CudaAllocator>(),
        CUstream_st* stream = nullptr);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    static int get_num_devices();

    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }

    cusparseContext* get_cusparse_handle() const
    {
        return cusparse_handle_.get();
    }

    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    int get_closest_numa() const { return this->get_exec_info().numa_node; }

    CUstream_st* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    CudaExecutor(int device_id, std::shared_ptr<Executor> master,
                 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
        : alloc_{std::move(alloc)}, master_(master), stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->CudaExecutor::populate_exec_info(
            machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<cublasContext> cublas_handle_;
    handle_manager<cusparseContext> cusparse_handle_;
    std::shared_ptr<CudaAllocatorBase> alloc_;
    CUstream_st* stream_;
};

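// A usage sketch: device executors need a host-side master executor; the
// allocator and stream parameters keep their defaults here.
//
//     auto host = gko::OmpExecutor::create();
//     auto cuda = gko::CudaExecutor::create(/* device_id */ 0, host);
//     cuda->synchronize();  // wait for all work launched on the device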

namespace kernels {
namespace cuda {
using DefaultExecutor = CudaExecutor;
}  // namespace cuda
}  // namespace kernels


/**
 * This is the Executor subclass which represents the HIP enhanced device.
 */
class HipExecutor : public detail::ExecutorBase<HipExecutor>,
                    public std::enable_shared_from_this<HipExecutor>,
                    public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<HipExecutor>;

public:
    GKO_DEPRECATED(
        "device_reset is deprecated entirely, call hipDeviceReset directly. "
        "alloc_mode was replaced by the Allocator type hierarchy.")
    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_hip_alloc_mode,
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<HipAllocatorBase> alloc =
            std::make_shared<HipAllocator>(),
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    static int get_num_devices();

    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    hipblasContext* get_hipblas_handle() const
    {
        return hipblas_handle_.get();
    }

    hipsparseContext* get_hipsparse_handle() const
    {
        return hipsparse_handle_.get();
    }

    int get_closest_numa() const { return this->get_exec_info().numa_node; }

    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    HipExecutor(int device_id, std::shared_ptr<Executor> master,
                std::shared_ptr<HipAllocatorBase> alloc,
                GKO_HIP_STREAM_STRUCT* stream)
        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        this->HipExecutor::populate_exec_info(machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<hipblasContext> hipblas_handle_;
    handle_manager<hipsparseContext> hipsparse_handle_;
    std::shared_ptr<HipAllocatorBase> alloc_;
    GKO_HIP_STREAM_STRUCT* stream_;
};

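// A usage sketch, analogous to the CudaExecutor case:
//
//     auto host = gko::OmpExecutor::create();
//     auto hip = gko::HipExecutor::create(/* device_id */ 0, host);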

namespace kernels {
namespace hip {
using DefaultExecutor = HipExecutor;
}  // namespace hip
}  // namespace kernels


/**
 * This is the Executor subclass which represents a DPC++ enhanced device.
 */
class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
                      public std::enable_shared_from_this<DpcppExecutor> {
    friend class detail::ExecutorBase<DpcppExecutor>;

public:
    /**
     * Creates a new DpcppExecutor.
     */
    static std::shared_ptr<DpcppExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    sycl::queue* get_queue() const { return queue_.get(); }

    static int get_num_devices(std::string device_type);

    const std::vector<int>& get_subgroup_sizes() const noexcept
    {
        return this->get_exec_info().subgroup_sizes;
    }

    int get_num_computing_units() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    int get_num_subgroups() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    const std::vector<int>& get_max_workitem_sizes() const noexcept
    {
        return this->get_exec_info().max_workitem_sizes;
    }

    int get_max_workgroup_size() const noexcept
    {
        return this->get_exec_info().max_workgroup_size;
    }

    int get_max_subgroup_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    std::string get_device_type() const noexcept
    {
        return this->get_exec_info().device_type;
    }

protected:
    void set_device_property(
        dpcpp_queue_property property = dpcpp_queue_property::in_order);

    DpcppExecutor(
        int device_id, std::shared_ptr<Executor> master,
        std::string device_type = "all",
        dpcpp_queue_property property = dpcpp_queue_property::in_order)
        : master_(master)
    {
        std::for_each(device_type.begin(), device_type.end(),
                      [](char& c) { c = std::tolower(c); });
        this->get_exec_info().device_type = std::string(device_type);
        this->get_exec_info().device_id = device_id;
        this->set_device_property(property);
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    bool verify_memory_to(const OmpExecutor* dest_exec) const override;

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
    queue_manager<sycl::queue> queue_;
};

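// A usage sketch: device_type filters the SYCL device list (e.g. "all",
// "cpu" or "gpu"), and the queue is created in order by default.
//
//     auto host = gko::OmpExecutor::create();
//     auto dpcpp = gko::DpcppExecutor::create(0, host, "gpu");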

namespace kernels {
namespace dpcpp {
using DefaultExecutor = DpcppExecutor;
}  // namespace dpcpp
}  // namespace kernels


#undef GKO_OVERRIDE_RAW_COPY_TO


}  // namespace gko


#endif  // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_