Ginkgo  Generated from pipelines/2406079695 branch based on develop. Ginkgo version 1.12.0
A numerical linear algebra library targeting many-core architectures
executor.hpp
1 // SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors
2 //
3 // SPDX-License-Identifier: BSD-3-Clause
4 
5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
7 
8 
9 #include <array>
10 #include <atomic>
11 #include <iostream>
12 #include <memory>
13 #include <mutex>
14 #include <sstream>
15 #include <string>
16 #include <tuple>
17 #include <type_traits>
18 #include <vector>
19 
20 #include <ginkgo/core/base/device.hpp>
21 #include <ginkgo/core/base/fwd_decls.hpp>
22 #include <ginkgo/core/base/machine_topology.hpp>
23 #include <ginkgo/core/base/memory.hpp>
24 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
25 #include <ginkgo/core/base/types.hpp>
26 #include <ginkgo/core/log/logger.hpp>
27 #include <ginkgo/core/synthesizer/containers.hpp>
28 
29 
30 namespace gko {
31 
32 
40  never,
46  automatic
47 };
48 
49 
/**
 * Selects how device memory is allocated: directly on the device, as unified
 * memory attached globally, or as unified memory attached to the host. Used
 * by default_cuda_alloc_mode / default_hip_alloc_mode below.
 */
enum class allocation_mode { device, unified_global, unified_host };
63 
64 
#ifdef NDEBUG

// When in release, prefer device allocations
constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;

constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// When in debug, always UM (unified memory) allocations.
constexpr allocation_mode default_cuda_alloc_mode =
    allocation_mode::unified_global;

#if (GINKGO_HIP_PLATFORM_HCC == 1)

// HIP on AMD GPUs does not support UM, so always prefer device allocations.
constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;

#else

// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
constexpr allocation_mode default_hip_alloc_mode =
    allocation_mode::unified_global;

#endif

#endif
92 
93 
94 } // namespace gko
95 
96 
/**
 * Properties that can be requested when creating the DPC++/SYCL queue.
 * The enumerators are distinct powers of two so they can be combined with
 * operator| (defined below).
 */
enum class dpcpp_queue_property {
    /** Request an in-order queue. */
    in_order = 1,

    /** Request a queue with profiling enabled. */
    enable_profiling = 2
};
112 
113 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114  dpcpp_queue_property b)
115 {
116  return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117  static_cast<int>(b));
118 }
119 
120 
121 namespace gko {
122 
123 
124 #define GKO_FORWARD_DECLARE(_type, ...) class _type
125 
126 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
127 
128 #undef GKO_FORWARD_DECLARE
129 
130 
131 class ReferenceExecutor;
132 
133 
134 namespace detail {
135 
136 
137 template <typename>
138 class ExecutorBase;
139 
140 
141 } // namespace detail
142 
143 
/**
 * Base class for operations that can be run on any Ginkgo executor. One
 * virtual run overload exists per executor type; subclasses override the
 * overloads for the executors they support.
 */
class Operation {
public:
    // declares `virtual void run(std::shared_ptr<const X>) const` for each
    // executor type X enumerated by GKO_ENABLE_FOR_ALL_EXECUTORS
#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
    virtual void run(std::shared_ptr<const _type>) const

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);

#undef GKO_DECLARE_RUN_OVERLOAD

    // ReferenceExecutor overload can be defaulted to OmpExecutor's
    virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;

    /**
     * Returns the name of the operation (used e.g. for logging).
     */
    virtual const char* get_name() const noexcept;
};
277 
278 
279 namespace detail {
280 
281 
/**
 * An Operation that stores a name and a closure and invokes the closure with
 * the concrete executor in every run overload.
 *
 * @tparam Closure  a callable accepting a shared_ptr to a const executor
 */
template <typename Closure>
class RegisteredOperation : public Operation {
public:
    /**
     * Creates a RegisteredOperation.
     *
     * @param name  the operation name (not owned — assumed to outlive this
     *              object; GKO_REGISTER_OPERATION passes a string literal)
     * @param op  the closure to invoke when the operation is run
     */
    RegisteredOperation(const char* name, Closure op)
        : name_(name), op_(std::move(op))
    {}

    const char* get_name() const noexcept override { return name_; }

    void run(std::shared_ptr<const ReferenceExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const OmpExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const CudaExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const HipExecutor> exec) const override
    {
        op_(exec);
    }

    void run(std::shared_ptr<const DpcppExecutor> exec) const override
    {
        op_(exec);
    }

private:
    const char* name_;
    Closure op_;
};
335 
336 
337 template <typename Closure>
338 RegisteredOperation<Closure> make_register_operation(const char* name,
339  Closure op)
340 {
341  return RegisteredOperation<Closure>{name, std::move(op)};
342 }
343 
344 
345 } // namespace detail
346 
347 
/**
 * Registers a kernel as an Operation factory: defines make_<_name>(args...)
 * returning a RegisteredOperation whose closure dispatches at compile time
 * (via if constexpr on the executor's static type) to the
 * reference/omp/cuda/hip/dpcpp implementation of _kernel, or hits
 * GKO_NOT_IMPLEMENTED for any other executor type.
 *
 * @note the closure captures the arguments by reference, so the returned
 *       operation must be run before the arguments go out of scope.
 */
#define GKO_REGISTER_OPERATION(_name, _kernel)                                 \
    template <typename... Args>                                                \
    auto make_##_name(Args&&... args)                                          \
    {                                                                          \
        return ::gko::detail::make_register_operation(                         \
            #_kernel, [&args...](auto exec) {                                  \
                using exec_type = decltype(exec);                              \
                if constexpr (std::is_same<                                    \
                                  exec_type,                                   \
                                  std::shared_ptr<                             \
                                      const ::gko::ReferenceExecutor>>::       \
                                  value) {                                     \
                    ::gko::kernels::reference::_kernel(                        \
                        std::dynamic_pointer_cast<                             \
                            const ::gko::ReferenceExecutor>(exec),             \
                        std::forward<Args>(args)...);                          \
                } else if constexpr (                                          \
                    std::is_same<                                              \
                        exec_type,                                             \
                        std::shared_ptr<const ::gko::OmpExecutor>>::value) {   \
                    ::gko::kernels::omp::_kernel(                              \
                        std::dynamic_pointer_cast<const ::gko::OmpExecutor>(   \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else if constexpr (                                          \
                    std::is_same<                                              \
                        exec_type,                                             \
                        std::shared_ptr<const ::gko::CudaExecutor>>::value) {  \
                    ::gko::kernels::cuda::_kernel(                             \
                        std::dynamic_pointer_cast<const ::gko::CudaExecutor>(  \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else if constexpr (                                          \
                    std::is_same<                                              \
                        exec_type,                                             \
                        std::shared_ptr<const ::gko::HipExecutor>>::value) {   \
                    ::gko::kernels::hip::_kernel(                              \
                        std::dynamic_pointer_cast<const ::gko::HipExecutor>(   \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else if constexpr (                                          \
                    std::is_same<                                              \
                        exec_type,                                             \
                        std::shared_ptr<const ::gko::DpcppExecutor>>::value) { \
                    ::gko::kernels::dpcpp::_kernel(                            \
                        std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
                            exec),                                             \
                        std::forward<Args>(args)...);                          \
                } else {                                                       \
                    GKO_NOT_IMPLEMENTED;                                       \
                }                                                              \
            });                                                                \
    }                                                                          \
    static_assert(true,                                                        \
                  "This assert is used to counter the false positive extra "   \
                  "semi-colon warnings")
475 
476 
/**
 * Registers a host-side function as an Operation factory:
 * make_<_name>(args...) returns a RegisteredOperation whose closure ignores
 * the executor argument and calls _kernel directly with the captured
 * (by reference) arguments.
 */
#define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \
    template <typename... Args>                                              \
    auto make_##_name(Args&&... args)                                        \
    {                                                                        \
        return ::gko::detail::make_register_operation(                       \
            #_kernel,                                                        \
            [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \
    }                                                                        \
    static_assert(true,                                                      \
                  "This assert is used to counter the false positive extra " \
                  "semi-colon warnings")
525 
526 
527 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
528 
616 class Executor : public log::EnableLogging<Executor> {
617  template <typename T>
618  friend class detail::ExecutorBase;
619 
620  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
621  friend class ReferenceExecutor;
622 
623 public:
624  virtual ~Executor() = default;
625 
626  Executor() = default;
627  Executor(Executor&) = delete;
628  Executor(Executor&&) = delete;
629  Executor& operator=(Executor&) = delete;
630  Executor& operator=(Executor&&) = delete;
631 
637  virtual void run(const Operation& op) const = 0;
638 
653  template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
654  typename ClosureDpcpp>
655  GKO_DEPRECATED(
656  "Please use the overload with std::string as first parameter.")
657  void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
658  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
659  {
660  LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
661  ClosureDpcpp>
662  op(op_omp, op_cuda, op_hip, op_dpcpp);
663  this->run(op);
664  }
665 
682  template <typename ClosureReference, typename ClosureOmp,
683  typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
684  void run(std::string name, const ClosureReference& op_ref,
685  const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
686  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
687  {
688  LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
689  ClosureDpcpp>
690  op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
691  this->run(op);
692  }
693 
    /**
     * Allocates storage for num_elems elements of type T on this executor.
     *
     * Emits allocation_started before and allocation_completed after the
     * underlying raw_alloc call.
     *
     * @tparam T  the element type
     * @param num_elems  the number of elements to allocate
     *
     * @return pointer to the allocated memory
     */
    template <typename T>
    T* alloc(size_type num_elems) const
    {
        this->template log<log::Logger::allocation_started>(
            this, num_elems * sizeof(T));
        T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
        this->template log<log::Logger::allocation_completed>(
            this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
        return allocated;
    }
715 
    /**
     * Frees memory previously allocated by this executor.
     *
     * Emits free_started before and free_completed after the underlying
     * raw_free call.
     *
     * @param ptr  pointer to the memory to free
     */
    void free(void* ptr) const noexcept
    {
        this->template log<log::Logger::free_started>(
            this, reinterpret_cast<uintptr>(ptr));
        this->raw_free(ptr);
        this->template log<log::Logger::free_completed>(
            this, reinterpret_cast<uintptr>(ptr));
    }
731 
744  template <typename T>
746  const T* src_ptr, T* dest_ptr) const
747  {
748  const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
749  const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
750  this->template log<log::Logger::copy_started>(
751  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
752  if (this != src_exec.get()) {
753  src_exec->template log<log::Logger::copy_started>(
754  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
755  }
756  try {
757  this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
758  dest_ptr);
759  } catch (NotSupported&) {
760 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
761  // Unoptimized copy. Try to go through the masters.
762  // output to log when verbose >= 1 and debug build
763  std::cerr << "Not direct copy. Try to copy data from the masters."
764  << std::endl;
765 #endif
766  auto src_master = src_exec->get_master().get();
767  if (num_elems > 0 && src_master != src_exec.get()) {
768  auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
769  src_master->copy_from<T>(src_exec, num_elems, src_ptr,
770  master_ptr);
771  this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
772  src_master->free(master_ptr);
773  }
774  }
775  this->template log<log::Logger::copy_completed>(
776  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
777  if (this != src_exec.get()) {
778  src_exec->template log<log::Logger::copy_completed>(
779  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
780  }
781  }
782 
    /**
     * Copies num_elems elements of type T within this executor's memory,
     * by delegating to copy_from with this executor as the source.
     *
     * @param num_elems  the number of elements to copy
     * @param src_ptr  source pointer on this executor
     * @param dest_ptr  destination pointer on this executor
     */
    template <typename T>
    void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
    {
        this->copy_from(this, num_elems, src_ptr, dest_ptr);
    }
799 
    /**
     * Copies a single value from this executor to the master executor and
     * returns it by value.
     *
     * @param ptr  pointer to the value on this executor
     *
     * @return the copied value
     */
    template <typename T>
    T copy_val_to_host(const T* ptr) const
    {
        T out{};
        this->get_master()->copy_from(this, 1, ptr, &out);
        return out;
    }
816 
821  virtual std::shared_ptr<Executor> get_master() noexcept = 0;
822 
826  virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
827 
831  virtual void synchronize() const = 0;
832 
    /**
     * Registers a logger on this executor, additionally counting it in
     * propagating_logger_refcount_ if it requests event propagation.
     *
     * @param logger  the logger to add
     */
    void add_logger(std::shared_ptr<const log::Logger> logger) override
    {
        this->propagating_logger_refcount_.fetch_add(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::add_logger(logger);
    }
845 
    /**
     * Unregisters a logger from this executor, decrementing
     * propagating_logger_refcount_ if it had requested event propagation.
     *
     * @param logger  the logger to remove
     */
    void remove_logger(const log::Logger* logger) override
    {
        this->propagating_logger_refcount_.fetch_sub(
            logger->needs_propagation() ? 1 : 0);
        this->EnableLogging<Executor>::remove_logger(logger);
    }
858 
859  using EnableLogging<Executor>::remove_logger;
860 
869  {
870  log_propagation_mode_ = mode;
871  }
872 
    /**
     * Returns true iff at least one registered logger requested propagation
     * and the propagation mode is automatic.
     */
    bool should_propagate_log() const
    {
        return this->propagating_logger_refcount_.load() > 0 &&
               log_propagation_mode_ == log_propagation_mode::automatic;
    }
885 
    /**
     * Checks whether memory of the other executor is accessible from this
     * executor, via the verify_memory_from double dispatch.
     *
     * @param other  the executor to check against
     */
    bool memory_accessible(const std::shared_ptr<const Executor>& other) const
    {
        return this->verify_memory_from(other.get());
    }
897 
898  virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
899 
901  virtual std::string get_description() const = 0;
902 
903 protected:
908  struct exec_info {
912  int device_id = -1;
913 
917  std::string device_type;
918 
922  int numa_node = -1;
923 
932  int num_computing_units = -1;
933 
945  int num_pu_per_cu = -1;
946 
955  std::vector<int> subgroup_sizes{};
956 
965  int max_subgroup_size = -1;
966 
977  std::vector<int> max_workitem_sizes{};
978 
988  int max_workgroup_size;
989 
993  int major = -1;
994 
998  int minor = -1;
999 
1005  std::string pci_bus_id = std::string(13, 'x');
1006 
1017  std::vector<int> closest_pu_ids{};
1018  };
1019 
1025  const exec_info& get_exec_info() const { return this->exec_info_; }
1026 
1036  virtual void* raw_alloc(size_type size) const = 0;
1037 
1045  virtual void raw_free(void* ptr) const noexcept = 0;
1046 
1057  virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1058  const void* src_ptr, void* dest_ptr) const = 0;
1059 
1069 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1070  virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1071  const void* src_ptr, void* dest_ptr) const = 0
1072 
1073  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1074 
1075 #undef GKO_ENABLE_RAW_COPY_TO
1076 
1084  virtual bool verify_memory_from(const Executor* src_exec) const = 0;
1085 
1095 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1096  virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1097 
1098  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1099 
1100  GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1101 
1102 #undef GKO_ENABLE_VERIFY_MEMORY_TO
1103 
1110  virtual void populate_exec_info(const machine_topology* mach_topo) = 0;
1111 
1117  exec_info& get_exec_info() { return this->exec_info_; }
1118 
1119  exec_info exec_info_;
1120 
1122 
1123  std::atomic<int> propagating_logger_refcount_{};
1124 
1125 private:
    /**
     * An Operation built from one closure per executor type; running it on a
     * given executor invokes the matching closure.
     *
     * @tparam ClosureReference  closure run on ReferenceExecutor
     * @tparam ClosureOmp  closure run on OmpExecutor
     * @tparam ClosureCuda  closure run on CudaExecutor
     * @tparam ClosureHip  closure run on HipExecutor
     * @tparam ClosureDpcpp  closure run on DpcppExecutor
     */
    template <typename ClosureReference, typename ClosureOmp,
              typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
    class LambdaOperation : public Operation {
    public:
        /**
         * Creates the operation from a name and one closure per executor.
         */
        LambdaOperation(std::string name, const ClosureReference& op_ref,
                        const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
            : name_(std::move(name)),
              op_ref_(op_ref),
              op_omp_(op_omp),
              op_cuda_(op_cuda),
              op_hip_(op_hip),
              op_dpcpp_(op_dpcpp)
        {}

        /**
         * Unnamed variant without a reference closure; the OpenMP closure is
         * reused for ReferenceExecutor and the name is set to "unnamed".
         */
        LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
                        const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
            : LambdaOperation("unnamed", op_omp, op_omp, op_cuda, op_hip,
                              op_dpcpp)
        {}

        void run(std::shared_ptr<const OmpExecutor>) const override
        {
            op_omp_();
        }

        void run(std::shared_ptr<const ReferenceExecutor>) const override
        {
            op_ref_();
        }

        void run(std::shared_ptr<const CudaExecutor>) const override
        {
            op_cuda_();
        }

        void run(std::shared_ptr<const HipExecutor>) const override
        {
            op_hip_();
        }

        void run(std::shared_ptr<const DpcppExecutor>) const override
        {
            op_dpcpp_();
        }

        const char* get_name() const noexcept override { return name_.c_str(); }

    private:
        std::string name_;
        ClosureReference op_ref_;
        ClosureOmp op_omp_;
        ClosureCuda op_cuda_;
        ClosureHip op_hip_;
        ClosureDpcpp op_dpcpp_;
    };
1206 };
1207 
1208 
1217 template <typename T>
1219 public:
1220  using pointer = T*;
1221 
1227  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1228  : exec_{exec}
1229  {}
1230 
1236  void operator()(pointer ptr) const
1237  {
1238  if (exec_) {
1239  exec_->free(ptr);
1240  }
1241  }
1242 
1243 private:
1244  std::shared_ptr<const Executor> exec_;
1245 };
1246 
1247 // a specialization for arrays
1248 template <typename T>
1249 class executor_deleter<T[]> {
1250 public:
1251  using pointer = T[];
1252 
1253  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1254  : exec_{exec}
1255  {}
1256 
1257  void operator()(pointer ptr) const
1258  {
1259  if (exec_) {
1260  exec_->free(ptr);
1261  }
1262  }
1263 
1264 private:
1265  std::shared_ptr<const Executor> exec_;
1266 };
1267 
1268 
1269 namespace detail {
1270 
1271 
1272 template <typename ConcreteExecutor>
1273 class ExecutorBase : public Executor {
1274  // friend class is not in the nearest enclosing namesace, so we write the
1275  // full name
1276  friend class ::gko::OmpExecutor;
1277  friend class ::gko::HipExecutor;
1278  friend class ::gko::DpcppExecutor;
1279  friend class ::gko::CudaExecutor;
1280  friend class ::gko::ReferenceExecutor;
1281 
1282 public:
1283  void run(const Operation& op) const override
1284  {
1285  this->template log<log::Logger::operation_launched>(this, &op);
1286  auto scope_guard = get_scoped_device_id_guard();
1287  op.run(self()->shared_from_this());
1288  this->template log<log::Logger::operation_completed>(this, &op);
1289  }
1290 
1291 protected:
1292  void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1293  const void* src_ptr, void* dest_ptr) const override
1294  {
1295  src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1296  }
1297 
1298  virtual bool verify_memory_from(const Executor* src_exec) const override
1299  {
1300  return src_exec->verify_memory_to(self());
1301  }
1302 
1303 private:
1304  ConcreteExecutor* self() noexcept
1305  {
1306  return static_cast<ConcreteExecutor*>(this);
1307  }
1308 
1309  const ConcreteExecutor* self() const noexcept
1310  {
1311  return static_cast<const ConcreteExecutor*>(this);
1312  }
1313 };
1314 
1315 #undef GKO_DECLARE_EXECUTOR_FRIEND
1316 
1317 
1325 class EnableDeviceReset {
1326 public:
1332  GKO_DEPRECATED(
1333  "device_reset is no longer supported, call "
1334  "cudaDeviceReset/hipDeviceReset manually")
1335  void set_device_reset(bool device_reset) {}
1336 
1342  GKO_DEPRECATED(
1343  "device_reset is no longer supported, call "
1344  "cudaDeviceReset/hipDeviceReset manually")
1345  bool get_device_reset() { return false; }
1346 
1347 protected:
1353  EnableDeviceReset() {}
1354 
1355  GKO_DEPRECATED(
1356  "device_reset is no longer supported, call "
1357  "cudaDeviceReset/hipDeviceReset manually")
1358  EnableDeviceReset(bool device_reset) {}
1359 };
1360 
1361 
1362 } // namespace detail
1363 
1364 
1365 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1366  void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1367  const void* src_ptr, void* dest_ptr) const override
1368 
1369 
1370 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1371  virtual bool verify_memory_to(const dest_* other) const override \
1372  { \
1373  return bool_; \
1374  } \
1375  static_assert(true, \
1376  "This assert is used to counter the false positive extra " \
1377  "semi-colon warnings")
1378 
1379 
/**
 * The executor running kernels with OpenMP on the host CPU.
 */
class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
                    public std::enable_shared_from_this<OmpExecutor> {
    friend class detail::ExecutorBase<OmpExecutor>;

public:
    using Executor::run;

    /**
     * Creates a new OmpExecutor.
     *
     * @param alloc  the host allocator to use (CpuAllocator by default)
     */
    static std::shared_ptr<OmpExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
    }

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    /** Returns the number of cores (exec_info.num_computing_units). */
    int get_num_cores() const
    {
        return this->get_exec_info().num_computing_units;
    }

    /** Returns the number of threads per core (exec_info.num_pu_per_cu). */
    int get_num_threads_per_core() const
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    static int get_num_omp_threads();

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    std::string get_description() const override;

protected:
    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
        : alloc_{std::move(alloc)}
    {
        // qualified call: virtual dispatch is not desired during construction
        this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
    }

    void populate_exec_info(const machine_topology* mach_topo) override;

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    // host memory is directly accessible from this executor
    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);

    bool verify_memory_to(const DpcppExecutor* dest_exec) const override;

    std::shared_ptr<CpuAllocatorBase> alloc_;
};
1453 
1454 
1455 namespace kernels {
1456 namespace omp {
1457 using DefaultExecutor = OmpExecutor;
1458 } // namespace omp
1459 } // namespace kernels
1460 
1461 
1470 public:
1471  using Executor::run;
1472 
    /**
     * Creates a new ReferenceExecutor.
     *
     * @param alloc  the host allocator to use (CpuAllocator by default)
     */
    static std::shared_ptr<ReferenceExecutor> create(
        std::shared_ptr<CpuAllocatorBase> alloc =
            std::make_shared<CpuAllocator>())
    {
        return std::shared_ptr<ReferenceExecutor>(
            new ReferenceExecutor(std::move(alloc)));
    }
1480 
1481  scoped_device_id_guard get_scoped_device_id_guard() const override
1482  {
1483  return {this, 0};
1484  }
1485 
1486  std::string get_description() const override { return "ReferenceExecutor"; }
1487 
    /**
     * Runs the operation's ReferenceExecutor overload, logging launch and
     * completion events.
     */
    void run(const Operation& op) const override
    {
        this->template log<log::Logger::operation_launched>(this, &op);
        op.run(std::static_pointer_cast<const ReferenceExecutor>(
            this->shared_from_this()));
        this->template log<log::Logger::operation_completed>(this, &op);
    }
1495 
1496 protected:
1497  ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1498  : OmpExecutor{std::move(alloc)}
1499  {
1500  this->ReferenceExecutor::populate_exec_info(
1502  }
1503 
    /**
     * Ignores the machine topology and sets fixed info: no device id (-1)
     * and a single computing unit with a single processing unit.
     */
    void populate_exec_info(const machine_topology*) override
    {
        this->get_exec_info().device_id = -1;
        this->get_exec_info().num_computing_units = 1;
        this->get_exec_info().num_pu_per_cu = 1;
    }
1510 
1511  bool verify_memory_from(const Executor* src_exec) const override
1512  {
1513  return src_exec->verify_memory_to(this);
1514  }
1515 
1516  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1517 
1518  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1519 
1520  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1521 
1522  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1523 
1524  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1525 };
1526 
1527 
1528 namespace kernels {
1529 namespace reference {
1530 using DefaultExecutor = ReferenceExecutor;
1531 } // namespace reference
1532 } // namespace kernels
1533 
1534 
1541 class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1542  public std::enable_shared_from_this<CudaExecutor>,
1543  public detail::EnableDeviceReset {
1544  friend class detail::ExecutorBase<CudaExecutor>;
1545 
1546 public:
1547  using Executor::run;
1548 
1560  GKO_DEPRECATED(
1561  "calling this CudaExecutor::create method is deprecated, because"
1562  "device_reset no longer has an effect"
1563  "call CudaExecutor::create("
1564  " int device_id, std::shared_ptr<Executor> master,"
1565  " std::shared_ptr<CudaAllocatorBase> alloc,"
1566  " CUstream_st* stream);"
1567  "instead")
1568  static std::shared_ptr<CudaExecutor> create(
1569  int device_id, std::shared_ptr<Executor> master, bool device_reset,
1570  allocation_mode alloc_mode = default_cuda_alloc_mode,
1571  CUstream_st* stream = nullptr);
1572 
1582  static std::shared_ptr<CudaExecutor> create(
1583  int device_id, std::shared_ptr<Executor> master,
1584  std::shared_ptr<CudaAllocatorBase> alloc =
1585  std::make_shared<CudaAllocator>(),
1586  CUstream_st* stream = nullptr);
1587 
1588  std::shared_ptr<Executor> get_master() noexcept override;
1589 
1590  std::shared_ptr<const Executor> get_master() const noexcept override;
1591 
1592  void synchronize() const override;
1593 
1594  scoped_device_id_guard get_scoped_device_id_guard() const override;
1595 
1596  std::string get_description() const override;
1597 
1601  int get_device_id() const noexcept
1602  {
1603  return this->get_exec_info().device_id;
1604  }
1605 
1609  static int get_num_devices();
1610 
1614  int get_num_warps_per_sm() const noexcept
1615  {
1616  return this->get_exec_info().num_pu_per_cu;
1617  }
1618 
1622  int get_num_multiprocessor() const noexcept
1623  {
1624  return this->get_exec_info().num_computing_units;
1625  }
1626 
    /**
     * Returns the total number of warps: multiprocessors
     * (num_computing_units) times warps per multiprocessor (num_pu_per_cu).
     */
    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }
1635 
1639  int get_warp_size() const noexcept
1640  {
1641  return this->get_exec_info().max_subgroup_size;
1642  }
1643 
1647  int get_major_version() const noexcept
1648  {
1649  return this->get_exec_info().major;
1650  }
1651 
1655  int get_minor_version() const noexcept
1656  {
1657  return this->get_exec_info().minor;
1658  }
1659 
    /**
     * Returns the compute capability encoded as major * 10 + minor.
     */
    int get_compute_capability() const noexcept
    {
        return this->get_major_version() * 10 + this->get_minor_version();
    }
1667 
1673  GKO_DEPRECATED("use get_blas_handle() instead")
1674  cublasContext* get_cublas_handle() const { return get_blas_handle(); }
1675 
1679  cublasContext* get_blas_handle() const { return cublas_handle_.get(); }
1680 
1686  GKO_DEPRECATED("use get_sparselib_handle() instead")
1687  cusparseContext* get_cusparse_handle() const
1688  {
1689  return get_sparselib_handle();
1690  }
1691 
1695  cusparseContext* get_sparselib_handle() const
1696  {
1697  return cusparse_handle_.get();
1698  }
1699 
1705  std::vector<int> get_closest_pus() const
1706  {
1707  return this->get_exec_info().closest_pu_ids;
1708  }
1709 
1715  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1716 
1723  CUstream_st* get_stream() const { return stream_; }
1724 
1725 protected:
1726  void set_gpu_property();
1727 
1728  void init_handles();
1729 
1730  CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1731  std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1732  : master_(master), alloc_{std::move(alloc)}, stream_{stream}
1733  {
1734  this->get_exec_info().device_id = device_id;
1735  this->get_exec_info().num_computing_units = 0;
1736  this->get_exec_info().num_pu_per_cu = 0;
1737  this->CudaExecutor::populate_exec_info(
1739  this->set_gpu_property();
1740  this->init_handles();
1741  }
1742 
1743  void* raw_alloc(size_type size) const override;
1744 
1745  void raw_free(void* ptr) const noexcept override;
1746 
1747  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1748 
1749  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1750 
1751  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1752 
1753  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1754 
1755  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1756 
1757  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1758 
1759  void populate_exec_info(const machine_topology* mach_topo) override;
1760 
1761 private:
1762  std::shared_ptr<Executor> master_;
1763 
1764  template <typename T>
1765  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1766  handle_manager<cublasContext> cublas_handle_;
1767  handle_manager<cusparseContext> cusparse_handle_;
1768  std::shared_ptr<CudaAllocatorBase> alloc_;
1769  CUstream_st* stream_;
1770 };
1771 
1772 
1773 namespace kernels {
1774 namespace cuda {
1775 using DefaultExecutor = CudaExecutor;
1776 } // namespace cuda
1777 } // namespace kernels
1778 
1779 
/**
 * The executor running kernels on a HIP device.
 */
class HipExecutor : public detail::ExecutorBase<HipExecutor>,
                    public std::enable_shared_from_this<HipExecutor>,
                    public detail::EnableDeviceReset {
    friend class detail::ExecutorBase<HipExecutor>;

public:
    using Executor::run;

    /**
     * @deprecated use the overload taking an allocator and a stream;
     *             device_reset and alloc_mode no longer have an effect.
     */
    GKO_DEPRECATED(
        "device_reset is deprecated entirely, call hipDeviceReset directly. "
        "alloc_mode was replaced by the Allocator type "
        "hierarchy.")
    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master, bool device_reset,
        allocation_mode alloc_mode = default_hip_alloc_mode,
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    /**
     * Creates a new HipExecutor.
     *
     * @param device_id  the HIP device id
     * @param master  the host executor used as this executor's master
     * @param alloc  the device allocator (HipAllocator by default)
     * @param stream  the HIP stream to use
     */
    static std::shared_ptr<HipExecutor> create(
        int device_id, std::shared_ptr<Executor> master,
        std::shared_ptr<HipAllocatorBase> alloc =
            std::make_shared<HipAllocator>(),
        GKO_HIP_STREAM_STRUCT* stream = nullptr);

    std::shared_ptr<Executor> get_master() noexcept override;

    std::shared_ptr<const Executor> get_master() const noexcept override;

    void synchronize() const override;

    scoped_device_id_guard get_scoped_device_id_guard() const override;

    std::string get_description() const override;

    /** Returns the device id of this executor. */
    int get_device_id() const noexcept
    {
        return this->get_exec_info().device_id;
    }

    /** Returns the number of HIP devices. */
    static int get_num_devices();

    /** Returns the number of warps per multiprocessor (num_pu_per_cu). */
    int get_num_warps_per_sm() const noexcept
    {
        return this->get_exec_info().num_pu_per_cu;
    }

    /** Returns the number of multiprocessors (num_computing_units). */
    int get_num_multiprocessor() const noexcept
    {
        return this->get_exec_info().num_computing_units;
    }

    /** Returns the major version of the device architecture. */
    int get_major_version() const noexcept
    {
        return this->get_exec_info().major;
    }

    /** Returns the minor version of the device architecture. */
    int get_minor_version() const noexcept
    {
        return this->get_exec_info().minor;
    }

    /** Returns the total number of warps (multiprocessors * warps/MP). */
    int get_num_warps() const noexcept
    {
        return this->get_exec_info().num_computing_units *
               this->get_exec_info().num_pu_per_cu;
    }

    /** Returns the warp size (max_subgroup_size). */
    int get_warp_size() const noexcept
    {
        return this->get_exec_info().max_subgroup_size;
    }

    /** @deprecated use get_blas_handle() instead */
    GKO_DEPRECATED("use get_blas_handle() instead")
    hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }

    /** Returns the hipBLAS handle of this executor. */
    hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }

    /** @deprecated use get_sparselib_handle() instead */
    GKO_DEPRECATED("use get_sparselib_handle() instead")
    hipsparseContext* get_hipsparse_handle() const
    {
        return get_sparselib_handle();
    }

    /** Returns the hipSPARSE handle of this executor. */
    hipsparseContext* get_sparselib_handle() const
    {
        return hipsparse_handle_.get();
    }

    /** Returns the NUMA node associated with the device (exec_info). */
    int get_closest_numa() const { return this->get_exec_info().numa_node; }

    /** Returns the ids of the processing units closest to the device. */
    std::vector<int> get_closest_pus() const
    {
        return this->get_exec_info().closest_pu_ids;
    }

    /** Returns the HIP stream used by this executor. */
    GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }

protected:
    void set_gpu_property();

    void init_handles();

    HipExecutor(int device_id, std::shared_ptr<Executor> master,
                std::shared_ptr<HipAllocatorBase> alloc,
                GKO_HIP_STREAM_STRUCT* stream)
        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
    {
        this->get_exec_info().device_id = device_id;
        this->get_exec_info().num_computing_units = 0;
        this->get_exec_info().num_pu_per_cu = 0;
        // qualified call: virtual dispatch is not desired during construction
        this->HipExecutor::populate_exec_info(machine_topology::get_instance());
        this->set_gpu_property();
        this->init_handles();
    }

    void* raw_alloc(size_type size) const override;

    void raw_free(void* ptr) const noexcept override;

    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);

    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);

    bool verify_memory_to(const CudaExecutor* dest_exec) const override;

    bool verify_memory_to(const HipExecutor* dest_exec) const override;

    void populate_exec_info(const machine_topology* mach_topo) override;

private:
    std::shared_ptr<Executor> master_;

    template <typename T>
    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
    handle_manager<hipblasContext> hipblas_handle_;
    handle_manager<hipsparseContext> hipsparse_handle_;
    std::shared_ptr<HipAllocatorBase> alloc_;
    GKO_HIP_STREAM_STRUCT* stream_;
};
1989 
1990 
1991 namespace kernels {
1992 namespace hip {
1993 using DefaultExecutor = HipExecutor;
1994 } // namespace hip
1995 } // namespace kernels
1996 
1997 
2004 class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
2005  public std::enable_shared_from_this<DpcppExecutor> {
2006  friend class detail::ExecutorBase<DpcppExecutor>;
2007 
2008 public:
2009  using Executor::run;
2010 
2020  static std::shared_ptr<DpcppExecutor> create(
2021  int device_id, std::shared_ptr<Executor> master,
2022  std::string device_type = "all",
2023  dpcpp_queue_property property = dpcpp_queue_property::in_order);
2024 
2025  std::shared_ptr<Executor> get_master() noexcept override;
2026 
2027  std::shared_ptr<const Executor> get_master() const noexcept override;
2028 
2029  void synchronize() const override;
2030 
2031  scoped_device_id_guard get_scoped_device_id_guard() const override;
2032 
2033  std::string get_description() const override;
2034 
2040  int get_device_id() const noexcept
2041  {
2042  return this->get_exec_info().device_id;
2043  }
2044 
2045  sycl::queue* get_queue() const { return queue_.get(); }
2046 
2054  static int get_num_devices(std::string device_type);
2055 
2061  const std::vector<int>& get_subgroup_sizes() const noexcept
2062  {
2063  return this->get_exec_info().subgroup_sizes;
2064  }
2065 
2071  int get_num_computing_units() const noexcept
2072  {
2073  return this->get_exec_info().num_computing_units;
2074  }
2075 
2079  int get_num_subgroups() const noexcept
2080  {
2081  return this->get_exec_info().num_computing_units *
2082  this->get_exec_info().num_pu_per_cu;
2083  }
2084 
2090  const std::vector<int>& get_max_workitem_sizes() const noexcept
2091  {
2092  return this->get_exec_info().max_workitem_sizes;
2093  }
2094 
2100  int get_max_workgroup_size() const noexcept
2101  {
2102  return this->get_exec_info().max_workgroup_size;
2103  }
2104 
2110  int get_max_subgroup_size() const noexcept
2111  {
2112  return this->get_exec_info().max_subgroup_size;
2113  }
2114 
2120  std::string get_device_type() const noexcept
2121  {
2122  return this->get_exec_info().device_type;
2123  }
2124 
2125 protected:
2126  void set_device_property(
2127  dpcpp_queue_property property = dpcpp_queue_property::in_order);
2128 
2129  DpcppExecutor(
2130  int device_id, std::shared_ptr<Executor> master,
2131  std::string device_type = "all",
2132  dpcpp_queue_property property = dpcpp_queue_property::in_order)
2133  : master_(master)
2134  {
2135  std::for_each(device_type.begin(), device_type.end(),
2136  [](char& c) { c = std::tolower(c); });
2137  this->get_exec_info().device_type = std::string(device_type);
2138  this->get_exec_info().device_id = device_id;
2139  this->set_device_property(property);
2140  }
2141 
2142  void populate_exec_info(const machine_topology* mach_topo) override;
2143 
2144  void* raw_alloc(size_type size) const override;
2145 
2146  void raw_free(void* ptr) const noexcept override;
2147 
2148  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2149 
2150  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2151 
2152  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2153 
2154  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2155 
2156  bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2157 
2158  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2159 
2160 private:
2161  std::shared_ptr<Executor> master_;
2162 
2163  template <typename T>
2164  using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2165  queue_manager<sycl::queue> queue_;
2166 };
2167 
2168 
2169 namespace kernels {
2170 namespace dpcpp {
2171 using DefaultExecutor = DpcppExecutor;
2172 } // namespace dpcpp
2173 } // namespace kernels
2174 
2175 
2176 #undef GKO_OVERRIDE_RAW_COPY_TO
2177 
2178 
2179 } // namespace gko
2180 
2181 
2182 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
gko::allocation_mode
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition: executor.hpp:62
gko::CudaExecutor::get_sparselib_handle
cusparseContext * get_sparselib_handle() const
Get the cusparse handle for this executor.
Definition: executor.hpp:1695
gko::CudaExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1614
gko::executor_deleter
This is a deleter that uses an executor's free method to deallocate the data.
Definition: executor.hpp:1218
gko::HipExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1846
gko::executor_deleter::executor_deleter
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition: executor.hpp:1227
gko::CudaExecutor::get_stream
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition: executor.hpp:1723
gko::Executor::synchronize
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor::get_max_workitem_sizes
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition: executor.hpp:2090
gko::Executor::free
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition: executor.hpp:723
gko::HipExecutor::get_hipblas_handle
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition: executor.hpp:1898
gko::Executor::memory_accessible
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition: executor.hpp:893
gko::DpcppExecutor::get_device_type
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition: executor.hpp:2120
gko::HipExecutor::get_num_devices
static int get_num_devices()
Get the number of devices present on the system.
gko::HipExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1929
gko::scoped_device_id_guard
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition: scoped_device_id_guard.hpp:76
gko::DpcppExecutor::get_num_devices
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
gko::DpcppExecutor::get_num_computing_units
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition: executor.hpp:2071
gko::log_propagation_mode::automatic
Events get reported to loggers attached to the triggering object and to propagating loggers (loggers for which Logger::needs_propagation() returns true) attached to its executor.
gko::Executor::remove_logger
void remove_logger(const log::Logger *logger) override
Definition: executor.hpp:852
gko::DpcppExecutor::get_num_subgroups
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition: executor.hpp:2079
gko::ReferenceExecutor::get_description
std::string get_description() const override
Definition: executor.hpp:1486
gko::size_type
std::size_t size_type
Integral type used for allocation quantities.
Definition: types.hpp:90
gko::Executor::copy_from
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition: executor.hpp:745
gko::DpcppExecutor::get_description
std::string get_description() const override
gko::Executor::get_master
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
gko::ptr_param::get
T * get() const
Definition: utils_helper.hpp:75
gko::Executor::run
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
gko::HipExecutor
This is the Executor subclass which represents the HIP enhanced device.
Definition: executor.hpp:1786
gko::CudaExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1655
gko::DpcppExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::log_propagation_mode::never
Events only get reported at loggers attached to the triggering object.
gko::CudaExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:1622
gko::ReferenceExecutor
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels used for debugging purposes.
Definition: executor.hpp:1469
gko::CudaExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1639
gko::NotSupported
NotSupported is thrown in case it is not possible to perform the requested operation on the given object.
Definition: exception.hpp:127
gko::HipExecutor::get_device_id
int get_device_id() const noexcept
Get the HIP device id of the device associated to this executor.
Definition: executor.hpp:1833
gko::HipExecutor::get_hipsparse_handle
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition: executor.hpp:1911
gko::CudaExecutor
This is the Executor subclass which represents the CUDA device.
Definition: executor.hpp:1541
gko::log_propagation_mode
log_propagation_mode
How Logger events are propagated to their Executor.
Definition: executor.hpp:34
gko::HipExecutor::create
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
gko
The Ginkgo namespace.
Definition: abstract_factory.hpp:20
gko::Executor::add_logger
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition: executor.hpp:839
gko::HipExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::CudaExecutor::get_blas_handle
cublasContext * get_blas_handle() const
Get the cublas handle for this executor.
Definition: executor.hpp:1679
gko::executor_deleter::operator()
void operator()(pointer ptr) const
Deletes the object.
Definition: executor.hpp:1236
gko::HipExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1936
gko::DpcppExecutor::get_max_subgroup_size
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition: executor.hpp:2110
gko::log::EnableLogging
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition: logger.hpp:786
gko::Operation::get_name
virtual const char * get_name() const noexcept
Returns the operation's name.
gko::DpcppExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor
This is the Executor subclass which represents a DPC++ enhanced device.
Definition: executor.hpp:2004
gko::ptr_param
This class is used for function parameters in the place of raw pointers.
Definition: utils_helper.hpp:41
gko::log::Logger
Definition: logger.hpp:74
gko::Executor::copy_val_to_host
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition: executor.hpp:810
gko::OmpExecutor
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition: executor.hpp:1387
gko::CudaExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1705
gko::HipExecutor::get_sparselib_handle
hipsparseContext * get_sparselib_handle() const
Get the hipsparse handle for this executor.
Definition: executor.hpp:1919
gko::Executor::get_description
virtual std::string get_description() const =0
gko::stop::mode
mode
The mode for the residual norm criterion.
Definition: residual_norm.hpp:37
gko::OmpExecutor::create
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition: executor.hpp:1397
gko::HipExecutor::get_description
std::string get_description() const override
gko::Executor::alloc
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition: executor.hpp:706
gko::ReferenceExecutor::run
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition: executor.hpp:1488
gko::Executor::copy
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition: executor.hpp:795
gko::HipExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:1854
gko::Executor::should_propagate_log
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagati...
Definition: executor.hpp:880
gko::DpcppExecutor::get_device_id
int get_device_id() const noexcept
Get the DPCPP device id of the device associated to this executor.
Definition: executor.hpp:2040
gko::DpcppExecutor::get_max_workgroup_size
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition: executor.hpp:2100
gko::CudaExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1647
gko::CudaExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1630
gko::log::Logger::needs_propagation
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from objects on this executor.
Definition: logger.hpp:680
gko::CudaExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1715
gko::Executor
The first step in using the Ginkgo library consists of creating an executor.
Definition: executor.hpp:616
gko::HipExecutor::get_blas_handle
hipblasContext * get_blas_handle() const
Get the hipblas handle for this executor.
Definition: executor.hpp:1903
gko::HipExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1878
gko::CudaExecutor::get_compute_capability
int get_compute_capability() const noexcept
Get the compute capability.
Definition: executor.hpp:1663
gko::HipExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1862
gko::DpcppExecutor::create
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
gko::machine_topology::get_instance
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition: machine_topology.hpp:182
gko::HipExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::Operation
Operations can be used to define functionalities whose implementations differ among devices.
Definition: executor.hpp:258
gko::Executor::set_log_propagation_mode
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition: executor.hpp:868
gko::DpcppExecutor::get_subgroup_sizes
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition: executor.hpp:2061
gko::HipExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1887
gko::HipExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1870
gko::CudaExecutor::get_device_id
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition: executor.hpp:1601
gko::Executor::run
void run(std::string name, const ClosureReference &op_ref, const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition: executor.hpp:684