Ginkgo 1.11.0
A numerical linear algebra library targeting many-core architectures
executor.hpp
1 // SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
2 //
3 // SPDX-License-Identifier: BSD-3-Clause
4 
5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
7 
8 
9 #include <array>
10 #include <atomic>
11 #include <iostream>
12 #include <memory>
13 #include <mutex>
14 #include <sstream>
15 #include <string>
16 #include <tuple>
17 #include <type_traits>
18 #include <vector>
19 
20 #include <ginkgo/core/base/device.hpp>
21 #include <ginkgo/core/base/fwd_decls.hpp>
22 #include <ginkgo/core/base/machine_topology.hpp>
23 #include <ginkgo/core/base/memory.hpp>
24 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
25 #include <ginkgo/core/base/types.hpp>
26 #include <ginkgo/core/log/logger.hpp>
27 #include <ginkgo/core/synthesizer/containers.hpp>
28 
29 
30 namespace gko {
31 
32 
34 enum class log_propagation_mode {
40  never,
46  automatic
47 };
48 
49 
62 enum class allocation_mode { device, unified_global, unified_host };
63 
64 
65 #ifdef NDEBUG
66 
67 // When in release, prefer device allocations
68 constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
69 
70 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
71 
72 #else
73 
74 // When in debug, always use unified memory (UM) allocations.
75 constexpr allocation_mode default_cuda_alloc_mode =
76  allocation_mode::unified_global;
77 
78 #if (GINKGO_HIP_PLATFORM_HCC == 1)
79 
80 // HIP on AMD GPUs does not support UM, so always prefer device allocations.
81 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
82 
83 #else
84 
85 // HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
86 constexpr allocation_mode default_hip_alloc_mode =
87  allocation_mode::unified_global;
88 
89 #endif
90 
91 #endif
92 
93 
94 } // namespace gko
95 
96 
101 enum class dpcpp_queue_property {
105  in_order = 1,
106 
110  enable_profiling = 2
111 };
112 
113 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114  dpcpp_queue_property b)
115 {
116  return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117  static_cast<int>(b));
118 }
119 
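// A minimal usage sketch (editorial, not part of the original header): the
// enumerators are distinct bits, so several properties can be combined with
// the operator| defined above and passed to DpcppExecutor::create:
//
//     auto prop = dpcpp_queue_property::in_order |
//                 dpcpp_queue_property::enable_profiling;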
120 
121 namespace gko {
122 
123 
124 #define GKO_FORWARD_DECLARE(_type, ...) class _type
125 
126 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
127 
128 #undef GKO_FORWARD_DECLARE
129 
130 
131 class ReferenceExecutor;
132 
133 
134 namespace detail {
135 
136 
137 template <typename>
138 class ExecutorBase;
139 
140 
141 } // namespace detail
142 
143 
258 class Operation {
259 public:
260 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261  virtual void run(std::shared_ptr<const _type>) const
262 
263  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
264 
265 #undef GKO_DECLARE_RUN_OVERLOAD
266 
267  // ReferenceExecutor overload can be defaulted to OmpExecutor's
268  virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
269 
275  virtual const char* get_name() const noexcept;
276 };
277 
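// Usage sketch (editorial; MyOperation and the kernel bodies are
// hypothetical): an Operation bundles one implementation per executor type,
// and Executor::run() dispatches to the overload matching the executor. Any
// overload that is not overridden keeps the base class behavior:
//
//     class MyOperation : public gko::Operation {
//     public:
//         void run(std::shared_ptr<const gko::OmpExecutor>) const override
//         {
//             // OpenMP implementation
//         }
//         void run(std::shared_ptr<const gko::CudaExecutor>) const override
//         {
//             // CUDA implementation
//         }
//         const char* get_name() const noexcept override { return "my_op"; }
//     };
//
//     auto exec = gko::OmpExecutor::create();
//     exec->run(MyOperation{});  // calls the OmpExecutor overload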
278 
279 namespace detail {
280 
281 
291 template <typename Closure>
292 class RegisteredOperation : public Operation {
293 public:
300  RegisteredOperation(const char* name, Closure op)
301  : name_(name), op_(std::move(op))
302  {}
303 
304  const char* get_name() const noexcept override { return name_; }
305 
306  void run(std::shared_ptr<const ReferenceExecutor> exec) const override
307  {
308  op_(exec);
309  }
310 
311  void run(std::shared_ptr<const OmpExecutor> exec) const override
312  {
313  op_(exec);
314  }
315 
316  void run(std::shared_ptr<const CudaExecutor> exec) const override
317  {
318  op_(exec);
319  }
320 
321  void run(std::shared_ptr<const HipExecutor> exec) const override
322  {
323  op_(exec);
324  }
325 
326  void run(std::shared_ptr<const DpcppExecutor> exec) const override
327  {
328  op_(exec);
329  }
330 
331 private:
332  const char* name_;
333  Closure op_;
334 };
335 
336 
337 template <typename Closure>
338 RegisteredOperation<Closure> make_register_operation(const char* name,
339  Closure op)
340 {
341  return RegisteredOperation<Closure>{name, std::move(op)};
342 }
343 
344 
345 } // namespace detail
346 
347 
419 #define GKO_REGISTER_OPERATION(_name, _kernel) \
420  template <typename... Args> \
421  auto make_##_name(Args&&... args) \
422  { \
423  return ::gko::detail::make_register_operation( \
424  #_kernel, [&args...](auto exec) { \
425  using exec_type = decltype(exec); \
426  if (std::is_same< \
427  exec_type, \
428  std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
429  value) { \
430  ::gko::kernels::reference::_kernel( \
431  std::dynamic_pointer_cast< \
432  const ::gko::ReferenceExecutor>(exec), \
433  std::forward<Args>(args)...); \
434  } else if (std::is_same< \
435  exec_type, \
436  std::shared_ptr<const ::gko::OmpExecutor>>:: \
437  value) { \
438  ::gko::kernels::omp::_kernel( \
439  std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
440  exec), \
441  std::forward<Args>(args)...); \
442  } else if (std::is_same< \
443  exec_type, \
444  std::shared_ptr<const ::gko::CudaExecutor>>:: \
445  value) { \
446  ::gko::kernels::cuda::_kernel( \
447  std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
448  exec), \
449  std::forward<Args>(args)...); \
450  } else if (std::is_same< \
451  exec_type, \
452  std::shared_ptr<const ::gko::HipExecutor>>:: \
453  value) { \
454  ::gko::kernels::hip::_kernel( \
455  std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
456  exec), \
457  std::forward<Args>(args)...); \
458  } else if (std::is_same< \
459  exec_type, \
460  std::shared_ptr<const ::gko::DpcppExecutor>>:: \
461  value) { \
462  ::gko::kernels::dpcpp::_kernel( \
463  std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
464  exec), \
465  std::forward<Args>(args)...); \
466  } else { \
467  GKO_NOT_IMPLEMENTED; \
468  } \
469  }); \
470  } \
471  static_assert(true, \
472  "This assert is used to counter the false positive extra " \
473  "semi-colon warnings")
474 
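// Usage sketch (editorial; my_kernel and make_my_op are hypothetical names):
// given a kernel with the same signature in every backend namespace, the
// macro generates a make_my_op(...) factory; running its result dispatches to
// the kernel matching the executor type. All five backend namespaces must
// provide the kernel, since every branch of the generated closure is
// compiled:
//
//     namespace kernels {
//     namespace omp {
//     void my_kernel(std::shared_ptr<const gko::OmpExecutor> exec, int x);
//     }  // namespace omp
//     // ... same declaration in reference, cuda, hip and dpcpp ...
//     }  // namespace kernels
//
//     GKO_REGISTER_OPERATION(my_op, my_kernel);
//
//     auto exec = gko::OmpExecutor::create();
//     exec->run(make_my_op(42));  // ends up in kernels::omp::my_kernel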
475 
513 #define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
514  template <typename... Args> \
515  auto make_##_name(Args&&... args) \
516  { \
517  return ::gko::detail::make_register_operation( \
518  #_kernel, \
519  [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
520  } \
521  static_assert(true, \
522  "This assert is used to counter the false positive extra " \
523  "semi-colon warnings")
524 
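// Usage sketch (editorial; compute_stats is a hypothetical host function):
// in contrast to GKO_REGISTER_OPERATION, the generated closure ignores the
// executor type and always calls the same host-side function, so a single
// implementation suffices, while the call still appears in the logs under
// the registered name:
//
//     void compute_stats(int n, double* result);
//     GKO_REGISTER_HOST_OPERATION(compute_stats, compute_stats);
//
//     exec->run(make_compute_stats(n, result));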
525 
526 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
527 
615 class Executor : public log::EnableLogging<Executor> {
616  template <typename T>
617  friend class detail::ExecutorBase;
618 
619  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
620  friend class ReferenceExecutor;
621 
622 public:
623  virtual ~Executor() = default;
624 
625  Executor() = default;
626  Executor(Executor&) = delete;
627  Executor(Executor&&) = delete;
628  Executor& operator=(Executor&) = delete;
629  Executor& operator=(Executor&&) = delete;
630 
636  virtual void run(const Operation& op) const = 0;
637 
652  template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
653  typename ClosureDpcpp>
654  GKO_DEPRECATED(
655  "Please use the overload with std::string as first parameter.")
656  void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
657  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
658  {
659  LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
660  ClosureDpcpp>
661  op(op_omp, op_cuda, op_hip, op_dpcpp);
662  this->run(op);
663  }
664 
681  template <typename ClosureReference, typename ClosureOmp,
682  typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
683  void run(std::string name, const ClosureReference& op_ref,
684  const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
685  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
686  {
687  LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
688  ClosureDpcpp>
689  op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
690  this->run(op);
691  }
692 
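// Usage sketch (editorial): for one-off functionality it is often easier to
// pass lambdas directly instead of registering an Operation; the first
// closure runs on ReferenceExecutor, the others on the matching device
// executor:
//
//     exec->run(
//         "my_snippet",
//         [&] { /* reference */ }, [&] { /* omp */ }, [&] { /* cuda */ },
//         [&] { /* hip */ }, [&] { /* dpcpp */ });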
704  template <typename T>
705  T* alloc(size_type num_elems) const
706  {
707  this->template log<log::Logger::allocation_started>(
708  this, num_elems * sizeof(T));
709  T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
710  this->template log<log::Logger::allocation_completed>(
711  this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
712  return allocated;
713  }
714 
722  void free(void* ptr) const noexcept
723  {
724  this->template log<log::Logger::free_started>(
725  this, reinterpret_cast<uintptr>(ptr));
726  this->raw_free(ptr);
727  this->template log<log::Logger::free_completed>(
728  this, reinterpret_cast<uintptr>(ptr));
729  }
730 
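// Usage sketch (editorial): alloc() and free() come in pairs on the same
// executor, and both emit allocation/free logger events:
//
//     auto exec = gko::ReferenceExecutor::create();
//     auto* values = exec->alloc<double>(100);
//     // ... use values in kernels running on exec ...
//     exec->free(values);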
743  template <typename T>
744  void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
745  const T* src_ptr, T* dest_ptr) const
746  {
747  const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
748  const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
749  this->template log<log::Logger::copy_started>(
750  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
751  if (this != src_exec.get()) {
752  src_exec->template log<log::Logger::copy_started>(
753  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
754  }
755  try {
756  this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
757  dest_ptr);
758  } catch (NotSupported&) {
759 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
760  // Unoptimized copy: route the data through the master executors.
761  // Output to the log when verbose >= 1 in a debug build.
762  std::clog << "No direct copy available. Trying to copy the data "
763  "through the master executors." << std::endl;
764 #endif
765  auto src_master = src_exec->get_master().get();
766  if (num_elems > 0 && src_master != src_exec.get()) {
767  auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
768  src_master->copy_from<T>(src_exec, num_elems, src_ptr,
769  master_ptr);
770  this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
771  src_master->free(master_ptr);
772  }
773  }
774  this->template log<log::Logger::copy_completed>(
775  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
776  if (this != src_exec.get()) {
777  src_exec->template log<log::Logger::copy_completed>(
778  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
779  }
780  }
781 
793  template <typename T>
794  void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
795  {
796  this->copy_from(this, num_elems, src_ptr, dest_ptr);
797  }
798 
808  template <typename T>
809  T copy_val_to_host(const T* ptr) const
810  {
811  T out{};
812  this->get_master()->copy_from(this, 1, ptr, &out);
813  return out;
814  }
815 
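// Usage sketch (editorial; n and host_ptr are hypothetical): copy_from() is
// called on the destination executor, and copy_val_to_host() is a shortcut
// for reading a single element back on the master:
//
//     auto host = gko::OmpExecutor::create();
//     auto cuda = gko::CudaExecutor::create(0, host);
//     auto* dev_ptr = cuda->alloc<double>(n);
//     cuda->copy_from(host, n, host_ptr, dev_ptr);     // host -> device
//     double first = cuda->copy_val_to_host(dev_ptr);  // dev_ptr[0] -> host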
820  virtual std::shared_ptr<Executor> get_master() noexcept = 0;
821 
825  virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
826 
830  virtual void synchronize() const = 0;
831 
838  void add_logger(std::shared_ptr<const log::Logger> logger) override
839  {
840  this->propagating_logger_refcount_.fetch_add(
841  logger->needs_propagation() ? 1 : 0);
842  this->EnableLogging<Executor>::add_logger(logger);
843  }
844 
851  void remove_logger(const log::Logger* logger) override
852  {
853  this->propagating_logger_refcount_.fetch_sub(
854  logger->needs_propagation() ? 1 : 0);
855  this->EnableLogging<Executor>::remove_logger(logger);
856  }
857 
858  using EnableLogging<Executor>::remove_logger;
859 
867  void set_log_propagation_mode(log_propagation_mode mode)
868  {
869  log_propagation_mode_ = mode;
870  }
871 
879  bool should_propagate_log() const
880  {
881  return this->propagating_logger_refcount_.load() > 0 &&
882  log_propagation_mode_ == log_propagation_mode::automatic;
883  }
884 
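// Usage sketch (editorial; profiler_logger is a hypothetical propagating
// logger): events from objects created on this executor reach propagating
// loggers only while the propagation mode is automatic:
//
//     exec->add_logger(profiler_logger);  // needs_propagation() == true
//     exec->set_log_propagation_mode(gko::log_propagation_mode::never);
//     // from here on, only loggers attached directly to the objects
//     // themselves receive their events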
892  bool memory_accessible(const std::shared_ptr<const Executor>& other) const
893  {
894  return this->verify_memory_from(other.get());
895  }
896 
897  virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
898 
900  virtual std::string get_description() const = 0;
901 
902 protected:
907  struct exec_info {
911  int device_id = -1;
912 
916  std::string device_type;
917 
921  int numa_node = -1;
922 
931  int num_computing_units = -1;
932 
944  int num_pu_per_cu = -1;
945 
954  std::vector<int> subgroup_sizes{};
955 
964  int max_subgroup_size = -1;
965 
976  std::vector<int> max_workitem_sizes{};
977 
987  int max_workgroup_size = -1;
988 
992  int major = -1;
993 
997  int minor = -1;
998 
1004  std::string pci_bus_id = std::string(13, 'x');
1005 
1016  std::vector<int> closest_pu_ids{};
1017  };
1018 
1024  const exec_info& get_exec_info() const { return this->exec_info_; }
1025 
1035  virtual void* raw_alloc(size_type size) const = 0;
1036 
1044  virtual void raw_free(void* ptr) const noexcept = 0;
1045 
1056  virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1057  const void* src_ptr, void* dest_ptr) const = 0;
1058 
1068 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1069  virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1070  const void* src_ptr, void* dest_ptr) const = 0
1071 
1072  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1073 
1074 #undef GKO_ENABLE_RAW_COPY_TO
1075 
1083  virtual bool verify_memory_from(const Executor* src_exec) const = 0;
1084 
1094 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1095  virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1096 
1097  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1098 
1099  GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1100 
1101 #undef GKO_ENABLE_VERIFY_MEMORY_TO
1102 
1109  virtual void populate_exec_info(const machine_topology* mach_topo) = 0;
1110 
1116  exec_info& get_exec_info() { return this->exec_info_; }
1117 
1118  exec_info exec_info_;
1119 
1120  log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};
1121 
1122  std::atomic<int> propagating_logger_refcount_{};
1123 
1124 private:
1139  template <typename ClosureReference, typename ClosureOmp,
1140  typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
1141  class LambdaOperation : public Operation {
1142  public:
1143  LambdaOperation(std::string name, const ClosureReference& op_ref,
1144  const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1145  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1146  : name_(std::move(name)),
1147  op_ref_(op_ref),
1148  op_omp_(op_omp),
1149  op_cuda_(op_cuda),
1150  op_hip_(op_hip),
1151  op_dpcpp_(op_dpcpp)
1152  {}
1153 
1164  LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1165  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1166  : LambdaOperation("unnamed", op_omp, op_omp, op_cuda, op_hip,
1167  op_dpcpp)
1168  {}
1169 
1170  void run(std::shared_ptr<const OmpExecutor>) const override
1171  {
1172  op_omp_();
1173  }
1174 
1175  void run(std::shared_ptr<const ReferenceExecutor>) const override
1176  {
1177  op_ref_();
1178  }
1179 
1180  void run(std::shared_ptr<const CudaExecutor>) const override
1181  {
1182  op_cuda_();
1183  }
1184 
1185  void run(std::shared_ptr<const HipExecutor>) const override
1186  {
1187  op_hip_();
1188  }
1189 
1190  void run(std::shared_ptr<const DpcppExecutor>) const override
1191  {
1192  op_dpcpp_();
1193  }
1194 
1195  const char* get_name() const noexcept override { return name_.c_str(); }
1196 
1197  private:
1198  std::string name_;
1199  ClosureReference op_ref_;
1200  ClosureOmp op_omp_;
1201  ClosureCuda op_cuda_;
1202  ClosureHip op_hip_;
1203  ClosureDpcpp op_dpcpp_;
1204  };
1205 };
1206 
1207 
1216 template <typename T>
1217 class executor_deleter {
1218 public:
1219  using pointer = T*;
1220 
1226  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1227  : exec_{exec}
1228  {}
1229 
1235  void operator()(pointer ptr) const
1236  {
1237  if (exec_) {
1238  exec_->free(ptr);
1239  }
1240  }
1241 
1242 private:
1243  std::shared_ptr<const Executor> exec_;
1244 };
1245 
1246 // a specialization for arrays
1247 template <typename T>
1248 class executor_deleter<T[]> {
1249 public:
1250  using pointer = T[];
1251 
1252  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1253  : exec_{exec}
1254  {}
1255 
1256  void operator()(pointer ptr) const
1257  {
1258  if (exec_) {
1259  exec_->free(ptr);
1260  }
1261  }
1262 
1263 private:
1264  std::shared_ptr<const Executor> exec_;
1265 };
1266 
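// Usage sketch (editorial): executor_deleter ties a smart pointer's cleanup
// to the executor that performed the allocation:
//
//     auto exec = gko::ReferenceExecutor::create();
//     std::unique_ptr<double, gko::executor_deleter<double>> value{
//         exec->alloc<double>(1), gko::executor_deleter<double>{exec}};
//     // exec->free() is called automatically when value goes out of scope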
1267 
1268 namespace detail {
1269 
1270 
1271 template <typename ConcreteExecutor>
1272 class ExecutorBase : public Executor {
1273  // friend class is not in the nearest enclosing namespace, so we write
1274  // the full name
1275  friend class ::gko::OmpExecutor;
1276  friend class ::gko::HipExecutor;
1277  friend class ::gko::DpcppExecutor;
1278  friend class ::gko::CudaExecutor;
1279  friend class ::gko::ReferenceExecutor;
1280 
1281 public:
1282  void run(const Operation& op) const override
1283  {
1284  this->template log<log::Logger::operation_launched>(this, &op);
1285  auto scope_guard = get_scoped_device_id_guard();
1286  op.run(self()->shared_from_this());
1287  this->template log<log::Logger::operation_completed>(this, &op);
1288  }
1289 
1290 protected:
1291  void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1292  const void* src_ptr, void* dest_ptr) const override
1293  {
1294  src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1295  }
1296 
1297  virtual bool verify_memory_from(const Executor* src_exec) const override
1298  {
1299  return src_exec->verify_memory_to(self());
1300  }
1301 
1302 private:
1303  ConcreteExecutor* self() noexcept
1304  {
1305  return static_cast<ConcreteExecutor*>(this);
1306  }
1307 
1308  const ConcreteExecutor* self() const noexcept
1309  {
1310  return static_cast<const ConcreteExecutor*>(this);
1311  }
1312 };
1313 
1314 #undef GKO_DECLARE_EXECUTOR_FRIEND
1315 
1316 
1324 class EnableDeviceReset {
1325 public:
1331  GKO_DEPRECATED(
1332  "device_reset is no longer supported, call "
1333  "cudaDeviceReset/hipDeviceReset manually")
1334  void set_device_reset(bool device_reset) {}
1335 
1341  GKO_DEPRECATED(
1342  "device_reset is no longer supported, call "
1343  "cudaDeviceReset/hipDeviceReset manually")
1344  bool get_device_reset() { return false; }
1345 
1346 protected:
1352  EnableDeviceReset() {}
1353 
1354  GKO_DEPRECATED(
1355  "device_reset is no longer supported, call "
1356  "cudaDeviceReset/hipDeviceReset manually")
1357  EnableDeviceReset(bool device_reset) {}
1358 };
1359 
1360 
1361 } // namespace detail
1362 
1363 
1364 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1365  void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1366  const void* src_ptr, void* dest_ptr) const override
1367 
1368 
1369 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1370  virtual bool verify_memory_to(const dest_* other) const override \
1371  { \
1372  return bool_; \
1373  } \
1374  static_assert(true, \
1375  "This assert is used to counter the false positive extra " \
1376  "semi-colon warnings")
1377 
1378 
1386 class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
1387  public std::enable_shared_from_this<OmpExecutor> {
1388  friend class detail::ExecutorBase<OmpExecutor>;
1389 
1390 public:
1391  using Executor::run;
1392 
1396  static std::shared_ptr<OmpExecutor> create(
1397  std::shared_ptr<CpuAllocatorBase> alloc =
1398  std::make_shared<CpuAllocator>())
1399  {
1400  return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
1401  }
1402 
1403  std::shared_ptr<Executor> get_master() noexcept override;
1404 
1405  std::shared_ptr<const Executor> get_master() const noexcept override;
1406 
1407  void synchronize() const override;
1408 
1409  int get_num_cores() const
1410  {
1411  return this->get_exec_info().num_computing_units;
1412  }
1413 
1414  int get_num_threads_per_core() const
1415  {
1416  return this->get_exec_info().num_pu_per_cu;
1417  }
1418 
1419  static int get_num_omp_threads();
1420 
1421  scoped_device_id_guard get_scoped_device_id_guard() const override;
1422 
1423  std::string get_description() const override;
1424 
1425 protected:
1426  OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1427  : alloc_{std::move(alloc)}
1428  {
1429  this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
1430  }
1431 
1432  void populate_exec_info(const machine_topology* mach_topo) override;
1433 
1434  void* raw_alloc(size_type size) const override;
1435 
1436  void raw_free(void* ptr) const noexcept override;
1437 
1438  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1439 
1440  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
1441 
1442  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1443 
1444  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1445 
1446  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1447 
1448  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1449 
1450  std::shared_ptr<CpuAllocatorBase> alloc_;
1451 };
1452 
1453 
1454 namespace kernels {
1455 namespace omp {
1456 using DefaultExecutor = OmpExecutor;
1457 } // namespace omp
1458 } // namespace kernels
1459 
1460 
1468 class ReferenceExecutor : public OmpExecutor {
1469 public:
1470  using Executor::run;
1471 
1472  static std::shared_ptr<ReferenceExecutor> create(
1473  std::shared_ptr<CpuAllocatorBase> alloc =
1474  std::make_shared<CpuAllocator>())
1475  {
1476  return std::shared_ptr<ReferenceExecutor>(
1477  new ReferenceExecutor(std::move(alloc)));
1478  }
1479 
1480  scoped_device_id_guard get_scoped_device_id_guard() const override
1481  {
1482  return {this, 0};
1483  }
1484 
1485  std::string get_description() const override { return "ReferenceExecutor"; }
1486 
1487  void run(const Operation& op) const override
1488  {
1489  this->template log<log::Logger::operation_launched>(this, &op);
1490  op.run(std::static_pointer_cast<const ReferenceExecutor>(
1491  this->shared_from_this()));
1492  this->template log<log::Logger::operation_completed>(this, &op);
1493  }
1494 
1495 protected:
1496  ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1497  : OmpExecutor{std::move(alloc)}
1498  {
1499  this->ReferenceExecutor::populate_exec_info(
1500  machine_topology::get_instance());
1501  }
1502 
1503  void populate_exec_info(const machine_topology*) override
1504  {
1505  this->get_exec_info().device_id = -1;
1506  this->get_exec_info().num_computing_units = 1;
1507  this->get_exec_info().num_pu_per_cu = 1;
1508  }
1509 
1510  bool verify_memory_from(const Executor* src_exec) const override
1511  {
1512  return src_exec->verify_memory_to(this);
1513  }
1514 
1515  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1516 
1517  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1518 
1519  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1520 
1521  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1522 
1523  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1524 };
1525 
1526 
1527 namespace kernels {
1528 namespace reference {
1529 using DefaultExecutor = ReferenceExecutor;
1530 } // namespace reference
1531 } // namespace kernels
1532 
1533 
1540 class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1541  public std::enable_shared_from_this<CudaExecutor>,
1542  public detail::EnableDeviceReset {
1543  friend class detail::ExecutorBase<CudaExecutor>;
1544 
1545 public:
1546  using Executor::run;
1547 
1559  GKO_DEPRECATED(
1560  "calling this CudaExecutor::create method is deprecated, because"
1561  "device_reset no longer has an effect"
1562  "call CudaExecutor::create("
1563  " int device_id, std::shared_ptr<Executor> master,"
1564  " std::shared_ptr<CudaAllocatorBase> alloc,"
1565  " CUstream_st* stream);"
1566  "instead")
1567  static std::shared_ptr<CudaExecutor> create(
1568  int device_id, std::shared_ptr<Executor> master, bool device_reset,
1569  allocation_mode alloc_mode = default_cuda_alloc_mode,
1570  CUstream_st* stream = nullptr);
1571 
1581  static std::shared_ptr<CudaExecutor> create(
1582  int device_id, std::shared_ptr<Executor> master,
1583  std::shared_ptr<CudaAllocatorBase> alloc =
1584  std::make_shared<CudaAllocator>(),
1585  CUstream_st* stream = nullptr);
1586 
1587  std::shared_ptr<Executor> get_master() noexcept override;
1588 
1589  std::shared_ptr<const Executor> get_master() const noexcept override;
1590 
1591  void synchronize() const override;
1592 
1593  scoped_device_id_guard get_scoped_device_id_guard() const override;
1594 
1595  std::string get_description() const override;
1596 
1600  int get_device_id() const noexcept
1601  {
1602  return this->get_exec_info().device_id;
1603  }
1604 
1608  static int get_num_devices();
1609 
1613  int get_num_warps_per_sm() const noexcept
1614  {
1615  return this->get_exec_info().num_pu_per_cu;
1616  }
1617 
1621  int get_num_multiprocessor() const noexcept
1622  {
1623  return this->get_exec_info().num_computing_units;
1624  }
1625 
1629  int get_num_warps() const noexcept
1630  {
1631  return this->get_exec_info().num_computing_units *
1632  this->get_exec_info().num_pu_per_cu;
1633  }
1634 
1638  int get_warp_size() const noexcept
1639  {
1640  return this->get_exec_info().max_subgroup_size;
1641  }
1642 
1646  int get_major_version() const noexcept
1647  {
1648  return this->get_exec_info().major;
1649  }
1650 
1654  int get_minor_version() const noexcept
1655  {
1656  return this->get_exec_info().minor;
1657  }
1658 
1662  int get_compute_capability() const noexcept
1663  {
1664  return this->get_major_version() * 10 + this->get_minor_version();
1665  }
1666 
1672  GKO_DEPRECATED("use get_blas_handle() instead")
1673  cublasContext* get_cublas_handle() const { return get_blas_handle(); }
1674 
1678  cublasContext* get_blas_handle() const { return cublas_handle_.get(); }
1679 
1685  GKO_DEPRECATED("use get_sparselib_handle() instead")
1686  cusparseContext* get_cusparse_handle() const
1687  {
1688  return get_sparselib_handle();
1689  }
1690 
1694  cusparseContext* get_sparselib_handle() const
1695  {
1696  return cusparse_handle_.get();
1697  }
1698 
1704  std::vector<int> get_closest_pus() const
1705  {
1706  return this->get_exec_info().closest_pu_ids;
1707  }
1708 
1714  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1715 
1722  CUstream_st* get_stream() const { return stream_; }
1723 
1724 protected:
1725  void set_gpu_property();
1726 
1727  void init_handles();
1728 
1729  CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1730  std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1731  : master_(master), alloc_{std::move(alloc)}, stream_{stream}
1732  {
1733  this->get_exec_info().device_id = device_id;
1734  this->get_exec_info().num_computing_units = 0;
1735  this->get_exec_info().num_pu_per_cu = 0;
1736  this->CudaExecutor::populate_exec_info(
1737  machine_topology::get_instance());
1738  this->set_gpu_property();
1739  this->init_handles();
1740  }
1741 
1742  void* raw_alloc(size_type size) const override;
1743 
1744  void raw_free(void* ptr) const noexcept override;
1745 
1746  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1747 
1748  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1749 
1750  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1751 
1752  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1753 
1754  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1755 
1756  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1757 
1758  void populate_exec_info(const machine_topology* mach_topo) override;
1759 
1760 private:
1761  std::shared_ptr<Executor> master_;
1762 
1763  template <typename T>
1764  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1765  handle_manager<cublasContext> cublas_handle_;
1766  handle_manager<cusparseContext> cusparse_handle_;
1767  std::shared_ptr<CudaAllocatorBase> alloc_;
1768  CUstream_st* stream_;
1769 };
1770 
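// Usage sketch (editorial; assumes a CUDA-enabled build and at least one
// device): a CudaExecutor is created with a host executor as its master; the
// deprecated device_reset overload should be avoided in new code:
//
//     auto host = gko::OmpExecutor::create();
//     auto cuda = gko::CudaExecutor::create(0, host);  // default allocator
//                                                      // and stream
//     cuda->synchronize();  // wait for all work on the executor's stream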
1771 
1772 namespace kernels {
1773 namespace cuda {
1774 using DefaultExecutor = CudaExecutor;
1775 } // namespace cuda
1776 } // namespace kernels
1777 
1778 
1785 class HipExecutor : public detail::ExecutorBase<HipExecutor>,
1786  public std::enable_shared_from_this<HipExecutor>,
1787  public detail::EnableDeviceReset {
1788  friend class detail::ExecutorBase<HipExecutor>;
1789 
1790 public:
1791  using Executor::run;
1792 
1804  GKO_DEPRECATED(
1805  "device_reset is deprecated entirely, call hipDeviceReset directly. "
1806  "alloc_mode was replaced by the Allocator type "
1807  "hierarchy.")
1808  static std::shared_ptr<HipExecutor> create(
1809  int device_id, std::shared_ptr<Executor> master, bool device_reset,
1810  allocation_mode alloc_mode = default_hip_alloc_mode,
1811  GKO_HIP_STREAM_STRUCT* stream = nullptr);
1812 
1813  static std::shared_ptr<HipExecutor> create(
1814  int device_id, std::shared_ptr<Executor> master,
1815  std::shared_ptr<HipAllocatorBase> alloc =
1816  std::make_shared<HipAllocator>(),
1817  GKO_HIP_STREAM_STRUCT* stream = nullptr);
1818 
1819  std::shared_ptr<Executor> get_master() noexcept override;
1820 
1821  std::shared_ptr<const Executor> get_master() const noexcept override;
1822 
1823  void synchronize() const override;
1824 
1825  scoped_device_id_guard get_scoped_device_id_guard() const override;
1826 
1827  std::string get_description() const override;
1828 
1832  int get_device_id() const noexcept
1833  {
1834  return this->get_exec_info().device_id;
1835  }
1836 
1840  static int get_num_devices();
1841 
1845  int get_num_warps_per_sm() const noexcept
1846  {
1847  return this->get_exec_info().num_pu_per_cu;
1848  }
1849 
1853  int get_num_multiprocessor() const noexcept
1854  {
1855  return this->get_exec_info().num_computing_units;
1856  }
1857 
1861  int get_major_version() const noexcept
1862  {
1863  return this->get_exec_info().major;
1864  }
1865 
1869  int get_minor_version() const noexcept
1870  {
1871  return this->get_exec_info().minor;
1872  }
1873 
1877  int get_num_warps() const noexcept
1878  {
1879  return this->get_exec_info().num_computing_units *
1880  this->get_exec_info().num_pu_per_cu;
1881  }
1882 
1886  int get_warp_size() const noexcept
1887  {
1888  return this->get_exec_info().max_subgroup_size;
1889  }
1890 
1896  GKO_DEPRECATED("use get_blas_handle() instead")
1897  hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }
1898 
1902  hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }
1903 
1909  GKO_DEPRECATED("use get_sparselib_handle() instead")
1910  hipsparseContext* get_hipsparse_handle() const
1911  {
1912  return get_sparselib_handle();
1913  }
1914 
1918  hipsparseContext* get_sparselib_handle() const
1919  {
1920  return hipsparse_handle_.get();
1921  }
1922 
1928  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1929 
1935  std::vector<int> get_closest_pus() const
1936  {
1937  return this->get_exec_info().closest_pu_ids;
1938  }
1939 
1940  GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
1941 
1942 protected:
1943  void set_gpu_property();
1944 
1945  void init_handles();
1946 
1947  HipExecutor(int device_id, std::shared_ptr<Executor> master,
1948  std::shared_ptr<HipAllocatorBase> alloc,
1949  GKO_HIP_STREAM_STRUCT* stream)
1950  : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
1951  {
1952  this->get_exec_info().device_id = device_id;
1953  this->get_exec_info().num_computing_units = 0;
1954  this->get_exec_info().num_pu_per_cu = 0;
1955  this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1956  this->set_gpu_property();
1957  this->init_handles();
1958  }
1959 
1960  void* raw_alloc(size_type size) const override;
1961 
1962  void raw_free(void* ptr) const noexcept override;
1963 
1964  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1965 
1966  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1967 
1968  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1969 
1970  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1971 
1972  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1973 
1974  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1975 
1976  void populate_exec_info(const machine_topology* mach_topo) override;
1977 
1978 private:
1979  std::shared_ptr<Executor> master_;
1980 
1981  template <typename T>
1982  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1983  handle_manager<hipblasContext> hipblas_handle_;
1984  handle_manager<hipsparseContext> hipsparse_handle_;
1985  std::shared_ptr<HipAllocatorBase> alloc_;
1986  GKO_HIP_STREAM_STRUCT* stream_;
1987 };
1988 
1989 
1990 namespace kernels {
1991 namespace hip {
1992 using DefaultExecutor = HipExecutor;
1993 } // namespace hip
1994 } // namespace kernels
1995 
1996 
2003 class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
2004  public std::enable_shared_from_this<DpcppExecutor> {
2005  friend class detail::ExecutorBase<DpcppExecutor>;
2006 
2007 public:
2008  using Executor::run;
2009 
2019  static std::shared_ptr<DpcppExecutor> create(
2020  int device_id, std::shared_ptr<Executor> master,
2021  std::string device_type = "all",
2022  dpcpp_queue_property property = dpcpp_queue_property::in_order);
2023 
2024  std::shared_ptr<Executor> get_master() noexcept override;
2025 
2026  std::shared_ptr<const Executor> get_master() const noexcept override;
2027 
2028  void synchronize() const override;
2029 
2030  scoped_device_id_guard get_scoped_device_id_guard() const override;
2031 
2032  std::string get_description() const override;
2033 
2039  int get_device_id() const noexcept
2040  {
2041  return this->get_exec_info().device_id;
2042  }
2043 
2044  sycl::queue* get_queue() const { return queue_.get(); }
2045 
2053  static int get_num_devices(std::string device_type);
2054 
2060  const std::vector<int>& get_subgroup_sizes() const noexcept
2061  {
2062  return this->get_exec_info().subgroup_sizes;
2063  }
2064 
2070  int get_num_computing_units() const noexcept
2071  {
2072  return this->get_exec_info().num_computing_units;
2073  }
2074 
2078  int get_num_subgroups() const noexcept
2079  {
2080  return this->get_exec_info().num_computing_units *
2081  this->get_exec_info().num_pu_per_cu;
2082  }
2083 
2089  const std::vector<int>& get_max_workitem_sizes() const noexcept
2090  {
2091  return this->get_exec_info().max_workitem_sizes;
2092  }
2093 
2099  int get_max_workgroup_size() const noexcept
2100  {
2101  return this->get_exec_info().max_workgroup_size;
2102  }
2103 
2109  int get_max_subgroup_size() const noexcept
2110  {
2111  return this->get_exec_info().max_subgroup_size;
2112  }
2113 
2119  std::string get_device_type() const noexcept
2120  {
2121  return this->get_exec_info().device_type;
2122  }
2123 
2124 protected:
2125  void set_device_property(
2126  dpcpp_queue_property property = dpcpp_queue_property::in_order);
2127 
2128  DpcppExecutor(
2129  int device_id, std::shared_ptr<Executor> master,
2130  std::string device_type = "all",
2131  dpcpp_queue_property property = dpcpp_queue_property::in_order)
2132  : master_(master)
2133  {
2134  std::for_each(device_type.begin(), device_type.end(),
2135  [](char& c) { c = std::tolower(c); });
2136  this->get_exec_info().device_type = std::string(device_type);
2137  this->get_exec_info().device_id = device_id;
2138  this->set_device_property(property);
2139  }
2140 
2141  void populate_exec_info(const machine_topology* mach_topo) override;
2142 
2143  void* raw_alloc(size_type size) const override;
2144 
2145  void raw_free(void* ptr) const noexcept override;
2146 
2147  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2148 
2149  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2150 
2151  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2152 
2153  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2154 
2155  bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2156 
2157  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2158 
2159 private:
2160  std::shared_ptr<Executor> master_;
2161 
2162  template <typename T>
2163  using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2164  queue_manager<sycl::queue> queue_;
2165 };
2166 
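// Usage sketch (editorial; assumes a SYCL-enabled build): device_type
// filters the SYCL device list (e.g. "all", "cpu" or "gpu"), and queue
// properties can be combined with operator|:
//
//     auto host = gko::OmpExecutor::create();
//     auto dpcpp = gko::DpcppExecutor::create(
//         0, host, "gpu",
//         dpcpp_queue_property::in_order |
//             dpcpp_queue_property::enable_profiling);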
2167 
2168 namespace kernels {
2169 namespace dpcpp {
2170 using DefaultExecutor = DpcppExecutor;
2171 } // namespace dpcpp
2172 } // namespace kernels
2173 
2174 
2175 #undef GKO_OVERRIDE_RAW_COPY_TO
2176 
2177 
2178 } // namespace gko
2179 
2180 
2181 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
gko::allocation_mode
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition: executor.hpp:62
gko::CudaExecutor::get_sparselib_handle
cusparseContext * get_sparselib_handle() const
Get the cusparse handle for this executor.
Definition: executor.hpp:1694
gko::CudaExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1613
gko::executor_deleter
This is a deleter that uses an executor's free method to deallocate the data.
Definition: executor.hpp:1217
gko::HipExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1845
gko::executor_deleter::executor_deleter
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition: executor.hpp:1226
gko::CudaExecutor::get_stream
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition: executor.hpp:1722
gko::Executor::synchronize
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor::get_max_workitem_sizes
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition: executor.hpp:2089
gko::Executor::free
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition: executor.hpp:722
gko::HipExecutor::get_hipblas_handle
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition: executor.hpp:1897
gko::Executor::memory_accessible
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition: executor.hpp:892
gko::DpcppExecutor::get_device_type
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition: executor.hpp:2119
gko::HipExecutor::get_num_devices
static int get_num_devices()
Get the number of devices present on the system.
gko::HipExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1928
gko::scoped_device_id_guard
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition: scoped_device_id_guard.hpp:76
gko::DpcppExecutor::get_num_devices
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
gko::DpcppExecutor::get_num_computing_units
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition: executor.hpp:2070
gko::log_propagation_mode::automatic
Events get reported to loggers attached to the triggering object and propagating loggers (Logger::nee...
gko::Executor::remove_logger
void remove_logger(const log::Logger *logger) override
Definition: executor.hpp:851
gko::DpcppExecutor::get_num_subgroups
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition: executor.hpp:2078
gko::ReferenceExecutor::get_description
std::string get_description() const override
Definition: executor.hpp:1485
gko::size_type
std::size_t size_type
Integral type used for allocation quantities.
Definition: types.hpp:90
gko::Executor::copy_from
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition: executor.hpp:744
gko::DpcppExecutor::get_description
std::string get_description() const override
gko::Executor::get_master
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
gko::ptr_param::get
T * get() const
Definition: utils_helper.hpp:75
gko::Executor::run
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
gko::HipExecutor
This is the Executor subclass which represents the HIP enhanced device.
Definition: executor.hpp:1785
gko::CudaExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1654
gko::DpcppExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::log_propagation_mode::never
Events only get reported at loggers attached to the triggering object.
gko::CudaExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:1621
gko::ReferenceExecutor
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition: executor.hpp:1468
gko::CudaExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1638
gko::NotSupported
NotSupported is thrown in case it is not possible to perform the requested operation on the given obj...
Definition: exception.hpp:127
gko::HipExecutor::get_device_id
int get_device_id() const noexcept
Get the HIP device id of the device associated to this executor.
Definition: executor.hpp:1832
gko::HipExecutor::get_hipsparse_handle
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition: executor.hpp:1910
gko::CudaExecutor
This is the Executor subclass which represents the CUDA device.
Definition: executor.hpp:1540
gko::log_propagation_mode
log_propagation_mode
How Logger events are propagated to their Executor.
Definition: executor.hpp:34
gko::HipExecutor::create
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, GKO_HIP_STREAM_STRUCT *stream=nullptr)
Creates a new HipExecutor.
gko
The Ginkgo namespace.
Definition: abstract_factory.hpp:20
gko::Executor::add_logger
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition: executor.hpp:838
gko::HipExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::CudaExecutor::get_blas_handle
cublasContext * get_blas_handle() const
Get the cublas handle for this executor.
Definition: executor.hpp:1678
gko::executor_deleter::operator()
void operator()(pointer ptr) const
Deletes the object.
Definition: executor.hpp:1235
gko::HipExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1935
gko::DpcppExecutor::get_max_subgroup_size
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition: executor.hpp:2109
gko::log::EnableLogging
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition: logger.hpp:786
gko::Operation::get_name
virtual const char * get_name() const noexcept
Returns the operation's name.
gko::DpcppExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor
This is the Executor subclass which represents a DPC++ enhanced device.
Definition: executor.hpp:2003
gko::ptr_param
This class is used for function parameters in the place of raw pointers.
Definition: utils_helper.hpp:41
gko::log::Logger
Definition: logger.hpp:74
gko::Executor::copy_val_to_host
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition: executor.hpp:809
gko::OmpExecutor
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition: executor.hpp:1386
gko::CudaExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1704
gko::HipExecutor::get_sparselib_handle
hipsparseContext * get_sparselib_handle() const
Get the hipsparse handle for this executor.
Definition: executor.hpp:1918
gko::Executor::get_description
virtual std::string get_description() const =0
gko::OmpExecutor::create
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition: executor.hpp:1396
gko::HipExecutor::get_description
std::string get_description() const override
gko::Executor::alloc
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition: executor.hpp:705
gko::ReferenceExecutor::run
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition: executor.hpp:1487
gko::Executor::copy
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition: executor.hpp:794
gko::HipExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:1853
gko::Executor::should_propagate_log
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagati...
Definition: executor.hpp:879
gko::DpcppExecutor::get_device_id
int get_device_id() const noexcept
Get the DPCPP device id of the device associated to this executor.
Definition: executor.hpp:2039
gko::DpcppExecutor::get_max_workgroup_size
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition: executor.hpp:2099
gko::CudaExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1646
gko::CudaExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1629
gko::log::Logger::needs_propagation
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from obje...
Definition: logger.hpp:680
gko::CudaExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1714
gko::Executor
The first step in using the Ginkgo library consists of creating an executor.
Definition: executor.hpp:615
gko::HipExecutor::get_blas_handle
hipblasContext * get_blas_handle() const
Get the hipblas handle for this executor.
Definition: executor.hpp:1902
gko::HipExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1877
gko::CudaExecutor::get_compute_capability
int get_compute_capability() const noexcept
Get the compute capability.
Definition: executor.hpp:1662
gko::HipExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1861
gko::DpcppExecutor::create
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
gko::machine_topology::get_instance
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition: machine_topology.hpp:182
gko::HipExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::Operation
Operations can be used to define functionalities whose implementations differ among devices.
Definition: executor.hpp:258
gko::Executor::set_log_propagation_mode
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition: executor.hpp:867
gko::DpcppExecutor::get_subgroup_sizes
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition: executor.hpp:2060
gko::HipExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1886
gko::HipExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1869
gko::CudaExecutor::get_device_id
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition: executor.hpp:1600
gko::Executor::run
void run(std::string name, const ClosureReference &op_ref, const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition: executor.hpp:683