Ginkgo  Generated from pipelines/1554403166 branch based on develop. Ginkgo version 1.9.0
A numerical linear algebra library targeting many-core architectures
executor.hpp
1 // SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
2 //
3 // SPDX-License-Identifier: BSD-3-Clause
4 
5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
7 
8 
9 #include <array>
10 #include <atomic>
11 #include <iostream>
12 #include <memory>
13 #include <mutex>
14 #include <sstream>
15 #include <string>
16 #include <tuple>
17 #include <type_traits>
18 #include <vector>
19 
20 #include <ginkgo/core/base/device.hpp>
21 #include <ginkgo/core/base/fwd_decls.hpp>
22 #include <ginkgo/core/base/machine_topology.hpp>
23 #include <ginkgo/core/base/memory.hpp>
24 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
25 #include <ginkgo/core/base/types.hpp>
26 #include <ginkgo/core/log/logger.hpp>
27 #include <ginkgo/core/synthesizer/containers.hpp>
28 
29 
30 namespace gko {
31 
32 
40  never,
46  automatic
47 };
48 
49 
62 enum class allocation_mode { device, unified_global, unified_host };
63 
64 
65 #ifdef NDEBUG
66 
67 // When in release, prefer device allocations
68 constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
69 
70 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
71 
72 #else
73 
74 // When in debug, always UM allocations.
75 constexpr allocation_mode default_cuda_alloc_mode =
76  allocation_mode::unified_global;
77 
78 #if (GINKGO_HIP_PLATFORM_HCC == 1)
79 
80 // HIP on AMD GPUs does not support UM, so always prefer device allocations.
81 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
82 
83 #else
84 
85 // HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
86 constexpr allocation_mode default_hip_alloc_mode =
87  allocation_mode::unified_global;
88 
89 #endif
90 
91 #endif
92 
93 
94 } // namespace gko
95 
96 
101 enum class dpcpp_queue_property {
105  in_order = 1,
106 
110  enable_profiling = 2
111 };
112 
113 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114  dpcpp_queue_property b)
115 {
116  return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117  static_cast<int>(b));
118 }
119 
120 
121 namespace gko {
122 
123 
124 #define GKO_FORWARD_DECLARE(_type, ...) class _type
125 
126 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
127 
128 #undef GKO_FORWARD_DECLARE
129 
130 
131 class ReferenceExecutor;
132 
133 
134 namespace detail {
135 
136 
137 template <typename>
138 class ExecutorBase;
139 
140 
141 } // namespace detail
142 
143 
258 class Operation {
259 public:
260 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261  virtual void run(std::shared_ptr<const _type>) const
262 
263  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
264 
265 #undef GKO_DECLARE_RUN_OVERLOAD
266 
267  // ReferenceExecutor overload can be defaulted to OmpExecutor's
268  virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
269 
275  virtual const char* get_name() const noexcept;
276 };
277 
278 
279 namespace detail {
280 
281 
291 template <typename Closure>
292 class RegisteredOperation : public Operation {
293 public:
300  RegisteredOperation(const char* name, Closure op)
301  : name_(name), op_(std::move(op))
302  {}
303 
304  const char* get_name() const noexcept override { return name_; }
305 
306  void run(std::shared_ptr<const ReferenceExecutor> exec) const override
307  {
308  op_(exec);
309  }
310 
311  void run(std::shared_ptr<const OmpExecutor> exec) const override
312  {
313  op_(exec);
314  }
315 
316  void run(std::shared_ptr<const CudaExecutor> exec) const override
317  {
318  op_(exec);
319  }
320 
321  void run(std::shared_ptr<const HipExecutor> exec) const override
322  {
323  op_(exec);
324  }
325 
326  void run(std::shared_ptr<const DpcppExecutor> exec) const override
327  {
328  op_(exec);
329  }
330 
331 private:
332  const char* name_;
333  Closure op_;
334 };
335 
336 
337 template <typename Closure>
338 RegisteredOperation<Closure> make_register_operation(const char* name,
339  Closure op)
340 {
341  return RegisteredOperation<Closure>{name, std::move(op)};
342 }
343 
344 
345 } // namespace detail
346 
347 
419 #define GKO_REGISTER_OPERATION(_name, _kernel) \
420  template <typename... Args> \
421  auto make_##_name(Args&&... args) \
422  { \
423  return ::gko::detail::make_register_operation( \
424  #_kernel, [&args...](auto exec) { \
425  using exec_type = decltype(exec); \
426  if (std::is_same< \
427  exec_type, \
428  std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
429  value) { \
430  ::gko::kernels::reference::_kernel( \
431  std::dynamic_pointer_cast< \
432  const ::gko::ReferenceExecutor>(exec), \
433  std::forward<Args>(args)...); \
434  } else if (std::is_same< \
435  exec_type, \
436  std::shared_ptr<const ::gko::OmpExecutor>>:: \
437  value) { \
438  ::gko::kernels::omp::_kernel( \
439  std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
440  exec), \
441  std::forward<Args>(args)...); \
442  } else if (std::is_same< \
443  exec_type, \
444  std::shared_ptr<const ::gko::CudaExecutor>>:: \
445  value) { \
446  ::gko::kernels::cuda::_kernel( \
447  std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
448  exec), \
449  std::forward<Args>(args)...); \
450  } else if (std::is_same< \
451  exec_type, \
452  std::shared_ptr<const ::gko::HipExecutor>>:: \
453  value) { \
454  ::gko::kernels::hip::_kernel( \
455  std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
456  exec), \
457  std::forward<Args>(args)...); \
458  } else if (std::is_same< \
459  exec_type, \
460  std::shared_ptr<const ::gko::DpcppExecutor>>:: \
461  value) { \
462  ::gko::kernels::dpcpp::_kernel( \
463  std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
464  exec), \
465  std::forward<Args>(args)...); \
466  } else { \
467  GKO_NOT_IMPLEMENTED; \
468  } \
469  }); \
470  } \
471  static_assert(true, \
472  "This assert is used to counter the false positive extra " \
473  "semi-colon warnings")
474 
475 
513 #define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
514  template <typename... Args> \
515  auto make_##_name(Args&&... args) \
516  { \
517  return ::gko::detail::make_register_operation( \
518  #_kernel, \
519  [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
520  } \
521  static_assert(true, \
522  "This assert is used to counter the false positive extra " \
523  "semi-colon warnings")
524 
525 
526 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
527 
615 class Executor : public log::EnableLogging<Executor> {
616  template <typename T>
617  friend class detail::ExecutorBase;
618 
619  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
620  friend class ReferenceExecutor;
621 
622 public:
623  virtual ~Executor() = default;
624 
625  Executor() = default;
626  Executor(Executor&) = delete;
627  Executor(Executor&&) = delete;
628  Executor& operator=(Executor&) = delete;
629  Executor& operator=(Executor&&) = delete;
630 
636  virtual void run(const Operation& op) const = 0;
637 
652  template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
653  typename ClosureDpcpp>
654  GKO_DEPRECATED(
655  "Please use the overload with std::string as first parameter.")
656  void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
657  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
658  {
659  LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
660  ClosureDpcpp>
661  op(op_omp, op_cuda, op_hip, op_dpcpp);
662  this->run(op);
663  }
664 
681  template <typename ClosureReference, typename ClosureOmp,
682  typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
683  void run(std::string name, const ClosureReference& op_ref,
684  const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
685  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
686  {
687  LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
688  ClosureDpcpp>
689  op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
690  this->run(op);
691  }
692 
704  template <typename T>
705  T* alloc(size_type num_elems) const
706  {
707  this->template log<log::Logger::allocation_started>(
708  this, num_elems * sizeof(T));
709  T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
710  this->template log<log::Logger::allocation_completed>(
711  this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
712  return allocated;
713  }
714 
722  void free(void* ptr) const noexcept
723  {
724  this->template log<log::Logger::free_started>(
725  this, reinterpret_cast<uintptr>(ptr));
726  this->raw_free(ptr);
727  this->template log<log::Logger::free_completed>(
728  this, reinterpret_cast<uintptr>(ptr));
729  }
730 
743  template <typename T>
745  const T* src_ptr, T* dest_ptr) const
746  {
747  const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
748  const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
749  this->template log<log::Logger::copy_started>(
750  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
751  if (this != src_exec.get()) {
752  src_exec->template log<log::Logger::copy_started>(
753  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
754  }
755  try {
756  this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
757  dest_ptr);
758  } catch (NotSupported&) {
759 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
760  // Unoptimized copy. Try to go through the masters.
761  // output to log when verbose >= 1 and debug build
762  std::clog << "Not direct copy. Try to copy data from the masters."
763  << std::endl;
764 #endif
765  auto src_master = src_exec->get_master().get();
766  if (num_elems > 0 && src_master != src_exec.get()) {
767  auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
768  src_master->copy_from<T>(src_exec, num_elems, src_ptr,
769  master_ptr);
770  this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
771  src_master->free(master_ptr);
772  }
773  }
774  this->template log<log::Logger::copy_completed>(
775  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
776  if (this != src_exec.get()) {
777  src_exec->template log<log::Logger::copy_completed>(
778  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
779  }
780  }
781 
793  template <typename T>
794  void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
795  {
796  this->copy_from(this, num_elems, src_ptr, dest_ptr);
797  }
798 
808  template <typename T>
809  T copy_val_to_host(const T* ptr) const
810  {
811  T out{};
812  this->get_master()->copy_from(this, 1, ptr, &out);
813  return out;
814  }
815 
820  virtual std::shared_ptr<Executor> get_master() noexcept = 0;
821 
825  virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
826 
830  virtual void synchronize() const = 0;
831 
838  void add_logger(std::shared_ptr<const log::Logger> logger) override
839  {
840  this->propagating_logger_refcount_.fetch_add(
841  logger->needs_propagation() ? 1 : 0);
842  this->EnableLogging<Executor>::add_logger(logger);
843  }
844 
851  void remove_logger(const log::Logger* logger) override
852  {
853  this->propagating_logger_refcount_.fetch_sub(
854  logger->needs_propagation() ? 1 : 0);
855  this->EnableLogging<Executor>::remove_logger(logger);
856  }
857 
858  using EnableLogging<Executor>::remove_logger;
859 
868  {
869  log_propagation_mode_ = mode;
870  }
871 
879  bool should_propagate_log() const
880  {
881  return this->propagating_logger_refcount_.load() > 0 &&
882  log_propagation_mode_ == log_propagation_mode::automatic;
883  }
884 
892  bool memory_accessible(const std::shared_ptr<const Executor>& other) const
893  {
894  return this->verify_memory_from(other.get());
895  }
896 
897  virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
898 
900  virtual std::string get_description() const = 0;
901 
902 protected:
907  struct exec_info {
911  int device_id = -1;
912 
916  std::string device_type;
917 
921  int numa_node = -1;
922 
931  int num_computing_units = -1;
932 
944  int num_pu_per_cu = -1;
945 
954  std::vector<int> subgroup_sizes{};
955 
964  int max_subgroup_size = -1;
965 
976  std::vector<int> max_workitem_sizes{};
977 
987  int max_workgroup_size;
988 
992  int major = -1;
993 
997  int minor = -1;
998 
1004  std::string pci_bus_id = std::string(13, 'x');
1005 
1016  std::vector<int> closest_pu_ids{};
1017  };
1018 
1024  const exec_info& get_exec_info() const { return this->exec_info_; }
1025 
1035  virtual void* raw_alloc(size_type size) const = 0;
1036 
1044  virtual void raw_free(void* ptr) const noexcept = 0;
1045 
1056  virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1057  const void* src_ptr, void* dest_ptr) const = 0;
1058 
1068 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1069  virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1070  const void* src_ptr, void* dest_ptr) const = 0
1071 
1072  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1073 
1074 #undef GKO_ENABLE_RAW_COPY_TO
1075 
1083  virtual bool verify_memory_from(const Executor* src_exec) const = 0;
1084 
1094 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1095  virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1096 
1097  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1098 
1099  GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1100 
1101 #undef GKO_ENABLE_VERIFY_MEMORY_TO
1102 
1109  virtual void populate_exec_info(const machine_topology* mach_topo) = 0;
1110 
1116  exec_info& get_exec_info() { return this->exec_info_; }
1117 
1118  exec_info exec_info_;
1119 
1121 
1122  std::atomic<int> propagating_logger_refcount_{};
1123 
1124 private:
1139  template <typename ClosureReference, typename ClosureOmp,
1140  typename ClosureCuda, typename ClosureHip, typename ClosureDpcpp>
1141  class LambdaOperation : public Operation {
1142  public:
1143  LambdaOperation(std::string name, const ClosureReference& op_ref,
1144  const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1145  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1146  : name_(std::move(name)),
1147  op_ref_(op_ref),
1148  op_omp_(op_omp),
1149  op_cuda_(op_cuda),
1150  op_hip_(op_hip),
1151  op_dpcpp_(op_dpcpp)
1152  {}
1153 
1164  LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1165  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1166  : LambdaOperation("unnamed", op_omp, op_omp, op_cuda, op_hip,
1167  op_dpcpp)
1168  {}
1169 
1170  void run(std::shared_ptr<const OmpExecutor>) const override
1171  {
1172  op_omp_();
1173  }
1174 
1175  void run(std::shared_ptr<const ReferenceExecutor>) const override
1176  {
1177  op_ref_();
1178  }
1179 
1180  void run(std::shared_ptr<const CudaExecutor>) const override
1181  {
1182  op_cuda_();
1183  }
1184 
1185  void run(std::shared_ptr<const HipExecutor>) const override
1186  {
1187  op_hip_();
1188  }
1189 
1190  void run(std::shared_ptr<const DpcppExecutor>) const override
1191  {
1192  op_dpcpp_();
1193  }
1194 
1195  const char* get_name() const noexcept override { return name_.c_str(); }
1196 
1197  private:
1198  std::string name_;
1199  ClosureReference op_ref_;
1200  ClosureOmp op_omp_;
1201  ClosureCuda op_cuda_;
1202  ClosureHip op_hip_;
1203  ClosureDpcpp op_dpcpp_;
1204  };
1205 };
1206 
1207 
1216 template <typename T>
1218 public:
1219  using pointer = T*;
1220 
1226  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1227  : exec_{exec}
1228  {}
1229 
1235  void operator()(pointer ptr) const
1236  {
1237  if (exec_) {
1238  exec_->free(ptr);
1239  }
1240  }
1241 
1242 private:
1243  std::shared_ptr<const Executor> exec_;
1244 };
1245 
1246 // a specialization for arrays
1247 template <typename T>
1248 class executor_deleter<T[]> {
1249 public:
1250  using pointer = T[];
1251 
1252  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1253  : exec_{exec}
1254  {}
1255 
1256  void operator()(pointer ptr) const
1257  {
1258  if (exec_) {
1259  exec_->free(ptr);
1260  }
1261  }
1262 
1263 private:
1264  std::shared_ptr<const Executor> exec_;
1265 };
1266 
1267 
1268 namespace detail {
1269 
1270 
1271 template <typename ConcreteExecutor>
1272 class ExecutorBase : public Executor {
1273  // friend class is not in the nearest enclosing namesace, so we write the
1274  // full name
1275  friend class ::gko::OmpExecutor;
1276  friend class ::gko::HipExecutor;
1277  friend class ::gko::DpcppExecutor;
1278  friend class ::gko::CudaExecutor;
1279  friend class ::gko::ReferenceExecutor;
1280 
1281 public:
1282  void run(const Operation& op) const override
1283  {
1284  this->template log<log::Logger::operation_launched>(this, &op);
1285  auto scope_guard = get_scoped_device_id_guard();
1286  op.run(self()->shared_from_this());
1287  this->template log<log::Logger::operation_completed>(this, &op);
1288  }
1289 
1290 protected:
1291  void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1292  const void* src_ptr, void* dest_ptr) const override
1293  {
1294  src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1295  }
1296 
1297  virtual bool verify_memory_from(const Executor* src_exec) const override
1298  {
1299  return src_exec->verify_memory_to(self());
1300  }
1301 
1302 private:
1303  ConcreteExecutor* self() noexcept
1304  {
1305  return static_cast<ConcreteExecutor*>(this);
1306  }
1307 
1308  const ConcreteExecutor* self() const noexcept
1309  {
1310  return static_cast<const ConcreteExecutor*>(this);
1311  }
1312 };
1313 
1314 #undef GKO_DECLARE_EXECUTOR_FRIEND
1315 
1316 
1324 class EnableDeviceReset {
1325 public:
1331  GKO_DEPRECATED(
1332  "device_reset is no longer supported, call "
1333  "cudaDeviceReset/hipDeviceReset manually")
1334  void set_device_reset(bool device_reset) {}
1335 
1341  GKO_DEPRECATED(
1342  "device_reset is no longer supported, call "
1343  "cudaDeviceReset/hipDeviceReset manually")
1344  bool get_device_reset() { return false; }
1345 
1346 protected:
1352  EnableDeviceReset() {}
1353 
1354  GKO_DEPRECATED(
1355  "device_reset is no longer supported, call "
1356  "cudaDeviceReset/hipDeviceReset manually")
1357  EnableDeviceReset(bool device_reset) {}
1358 };
1359 
1360 
1361 } // namespace detail
1362 
1363 
1364 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1365  void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1366  const void* src_ptr, void* dest_ptr) const override
1367 
1368 
1369 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1370  virtual bool verify_memory_to(const dest_* other) const override \
1371  { \
1372  return bool_; \
1373  } \
1374  static_assert(true, \
1375  "This assert is used to counter the false positive extra " \
1376  "semi-colon warnings")
1377 
1378 
1386 class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
1387  public std::enable_shared_from_this<OmpExecutor> {
1388  friend class detail::ExecutorBase<OmpExecutor>;
1389 
1390 public:
1391  using Executor::run;
1392 
1396  static std::shared_ptr<OmpExecutor> create(
1397  std::shared_ptr<CpuAllocatorBase> alloc =
1398  std::make_shared<CpuAllocator>())
1399  {
1400  return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
1401  }
1402 
1403  std::shared_ptr<Executor> get_master() noexcept override;
1404 
1405  std::shared_ptr<const Executor> get_master() const noexcept override;
1406 
1407  void synchronize() const override;
1408 
1409  int get_num_cores() const
1410  {
1411  return this->get_exec_info().num_computing_units;
1412  }
1413 
1414  int get_num_threads_per_core() const
1415  {
1416  return this->get_exec_info().num_pu_per_cu;
1417  }
1418 
1419  static int get_num_omp_threads();
1420 
1421  scoped_device_id_guard get_scoped_device_id_guard() const override;
1422 
1423  std::string get_description() const override;
1424 
1425 protected:
1426  OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1427  : alloc_{std::move(alloc)}
1428  {
1429  this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
1430  }
1431 
1432  void populate_exec_info(const machine_topology* mach_topo) override;
1433 
1434  void* raw_alloc(size_type size) const override;
1435 
1436  void raw_free(void* ptr) const noexcept override;
1437 
1438  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1439 
1440  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
1441 
1442  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1443 
1444  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1445 
1446  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1447 
1448  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1449 
1450  std::shared_ptr<CpuAllocatorBase> alloc_;
1451 };
1452 
1453 
1454 namespace kernels {
1455 namespace omp {
1456 using DefaultExecutor = OmpExecutor;
1457 } // namespace omp
1458 } // namespace kernels
1459 
1460 
1469 public:
1470  using Executor::run;
1471 
1472  static std::shared_ptr<ReferenceExecutor> create(
1473  std::shared_ptr<CpuAllocatorBase> alloc =
1474  std::make_shared<CpuAllocator>())
1475  {
1476  return std::shared_ptr<ReferenceExecutor>(
1477  new ReferenceExecutor(std::move(alloc)));
1478  }
1479 
1480  scoped_device_id_guard get_scoped_device_id_guard() const override
1481  {
1482  return {this, 0};
1483  }
1484 
1485  std::string get_description() const override { return "ReferenceExecutor"; }
1486 
1487  void run(const Operation& op) const override
1488  {
1489  this->template log<log::Logger::operation_launched>(this, &op);
1490  op.run(std::static_pointer_cast<const ReferenceExecutor>(
1491  this->shared_from_this()));
1492  this->template log<log::Logger::operation_completed>(this, &op);
1493  }
1494 
1495 protected:
1496  ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1497  : OmpExecutor{std::move(alloc)}
1498  {
1499  this->ReferenceExecutor::populate_exec_info(
1501  }
1502 
1503  void populate_exec_info(const machine_topology*) override
1504  {
1505  this->get_exec_info().device_id = -1;
1506  this->get_exec_info().num_computing_units = 1;
1507  this->get_exec_info().num_pu_per_cu = 1;
1508  }
1509 
1510  bool verify_memory_from(const Executor* src_exec) const override
1511  {
1512  return src_exec->verify_memory_to(this);
1513  }
1514 
1515  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1516 
1517  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1518 
1519  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1520 
1521  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1522 
1523  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1524 };
1525 
1526 
1527 namespace kernels {
1528 namespace reference {
1529 using DefaultExecutor = ReferenceExecutor;
1530 } // namespace reference
1531 } // namespace kernels
1532 
1533 
1540 class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1541  public std::enable_shared_from_this<CudaExecutor>,
1542  public detail::EnableDeviceReset {
1543  friend class detail::ExecutorBase<CudaExecutor>;
1544 
1545 public:
1546  using Executor::run;
1547 
1559  GKO_DEPRECATED(
1560  "calling this CudaExecutor::create method is deprecated, because"
1561  "device_reset no longer has an effect"
1562  "call CudaExecutor::create("
1563  " int device_id, std::shared_ptr<Executor> master,"
1564  " std::shared_ptr<CudaAllocatorBase> alloc,"
1565  " CUstream_st* stream);"
1566  "instead")
1567  static std::shared_ptr<CudaExecutor> create(
1568  int device_id, std::shared_ptr<Executor> master, bool device_reset,
1569  allocation_mode alloc_mode = default_cuda_alloc_mode,
1570  CUstream_st* stream = nullptr);
1571 
1581  static std::shared_ptr<CudaExecutor> create(
1582  int device_id, std::shared_ptr<Executor> master,
1583  std::shared_ptr<CudaAllocatorBase> alloc =
1584  std::make_shared<CudaAllocator>(),
1585  CUstream_st* stream = nullptr);
1586 
1587  std::shared_ptr<Executor> get_master() noexcept override;
1588 
1589  std::shared_ptr<const Executor> get_master() const noexcept override;
1590 
1591  void synchronize() const override;
1592 
1593  scoped_device_id_guard get_scoped_device_id_guard() const override;
1594 
1595  std::string get_description() const override;
1596 
1600  int get_device_id() const noexcept
1601  {
1602  return this->get_exec_info().device_id;
1603  }
1604 
1608  static int get_num_devices();
1609 
1613  int get_num_warps_per_sm() const noexcept
1614  {
1615  return this->get_exec_info().num_pu_per_cu;
1616  }
1617 
1621  int get_num_multiprocessor() const noexcept
1622  {
1623  return this->get_exec_info().num_computing_units;
1624  }
1625 
1629  int get_num_warps() const noexcept
1630  {
1631  return this->get_exec_info().num_computing_units *
1632  this->get_exec_info().num_pu_per_cu;
1633  }
1634 
1638  int get_warp_size() const noexcept
1639  {
1640  return this->get_exec_info().max_subgroup_size;
1641  }
1642 
1646  int get_major_version() const noexcept
1647  {
1648  return this->get_exec_info().major;
1649  }
1650 
1654  int get_minor_version() const noexcept
1655  {
1656  return this->get_exec_info().minor;
1657  }
1658 
1664  GKO_DEPRECATED("use get_blas_handle() instead")
1665  cublasContext* get_cublas_handle() const { return get_blas_handle(); }
1666 
1670  cublasContext* get_blas_handle() const { return cublas_handle_.get(); }
1671 
1677  GKO_DEPRECATED("use get_sparselib_handle() instead")
1678  cusparseContext* get_cusparse_handle() const
1679  {
1680  return get_sparselib_handle();
1681  }
1682 
1686  cusparseContext* get_sparselib_handle() const
1687  {
1688  return cusparse_handle_.get();
1689  }
1690 
1696  std::vector<int> get_closest_pus() const
1697  {
1698  return this->get_exec_info().closest_pu_ids;
1699  }
1700 
1706  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1707 
1714  CUstream_st* get_stream() const { return stream_; }
1715 
1716 protected:
1717  void set_gpu_property();
1718 
1719  void init_handles();
1720 
1721  CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1722  std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1723  : master_(master), alloc_{std::move(alloc)}, stream_{stream}
1724  {
1725  this->get_exec_info().device_id = device_id;
1726  this->get_exec_info().num_computing_units = 0;
1727  this->get_exec_info().num_pu_per_cu = 0;
1728  this->CudaExecutor::populate_exec_info(
1730  this->set_gpu_property();
1731  this->init_handles();
1732  }
1733 
1734  void* raw_alloc(size_type size) const override;
1735 
1736  void raw_free(void* ptr) const noexcept override;
1737 
1738  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1739 
1740  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1741 
1742  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1743 
1744  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1745 
1746  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1747 
1748  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1749 
1750  void populate_exec_info(const machine_topology* mach_topo) override;
1751 
1752 private:
1753  std::shared_ptr<Executor> master_;
1754 
1755  template <typename T>
1756  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1757  handle_manager<cublasContext> cublas_handle_;
1758  handle_manager<cusparseContext> cusparse_handle_;
1759  std::shared_ptr<CudaAllocatorBase> alloc_;
1760  CUstream_st* stream_;
1761 };
1762 
1763 
1764 namespace kernels {
1765 namespace cuda {
1766 using DefaultExecutor = CudaExecutor;
1767 } // namespace cuda
1768 } // namespace kernels
1769 
1770 
1777 class HipExecutor : public detail::ExecutorBase<HipExecutor>,
1778  public std::enable_shared_from_this<HipExecutor>,
1779  public detail::EnableDeviceReset {
1780  friend class detail::ExecutorBase<HipExecutor>;
1781 
1782 public:
1783  using Executor::run;
1784 
1796  GKO_DEPRECATED(
1797  "device_reset is deprecated entirely, call hipDeviceReset directly. "
1798  "alloc_mode was replaced by the Allocator type "
1799  "hierarchy.")
1800  static std::shared_ptr<HipExecutor> create(
1801  int device_id, std::shared_ptr<Executor> master, bool device_reset,
1802  allocation_mode alloc_mode = default_hip_alloc_mode,
1803  GKO_HIP_STREAM_STRUCT* stream = nullptr);
1804 
1805  static std::shared_ptr<HipExecutor> create(
1806  int device_id, std::shared_ptr<Executor> master,
1807  std::shared_ptr<HipAllocatorBase> alloc =
1808  std::make_shared<HipAllocator>(),
1809  GKO_HIP_STREAM_STRUCT* stream = nullptr);
1810 
1811  std::shared_ptr<Executor> get_master() noexcept override;
1812 
1813  std::shared_ptr<const Executor> get_master() const noexcept override;
1814 
1815  void synchronize() const override;
1816 
1817  scoped_device_id_guard get_scoped_device_id_guard() const override;
1818 
1819  std::string get_description() const override;
1820 
1824  int get_device_id() const noexcept
1825  {
1826  return this->get_exec_info().device_id;
1827  }
1828 
1832  static int get_num_devices();
1833 
1837  int get_num_warps_per_sm() const noexcept
1838  {
1839  return this->get_exec_info().num_pu_per_cu;
1840  }
1841 
1845  int get_num_multiprocessor() const noexcept
1846  {
1847  return this->get_exec_info().num_computing_units;
1848  }
1849 
1853  int get_major_version() const noexcept
1854  {
1855  return this->get_exec_info().major;
1856  }
1857 
1861  int get_minor_version() const noexcept
1862  {
1863  return this->get_exec_info().minor;
1864  }
1865 
1869  int get_num_warps() const noexcept
1870  {
1871  return this->get_exec_info().num_computing_units *
1872  this->get_exec_info().num_pu_per_cu;
1873  }
1874 
1878  int get_warp_size() const noexcept
1879  {
1880  return this->get_exec_info().max_subgroup_size;
1881  }
1882 
1888  GKO_DEPRECATED("use get_blas_handle() instead")
1889  hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }
1890 
1894  hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }
1895 
1901  GKO_DEPRECATED("use get_sparselib_handle() instead")
1902  hipsparseContext* get_hipsparse_handle() const
1903  {
1904  return get_sparselib_handle();
1905  }
1906 
1910  hipsparseContext* get_sparselib_handle() const
1911  {
1912  return hipsparse_handle_.get();
1913  }
1914 
1920  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1921 
1927  std::vector<int> get_closest_pus() const
1928  {
1929  return this->get_exec_info().closest_pu_ids;
1930  }
1931 
1932  GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
1933 
1934 protected:
1935  void set_gpu_property();
1936 
1937  void init_handles();
1938 
1939  HipExecutor(int device_id, std::shared_ptr<Executor> master,
1940  std::shared_ptr<HipAllocatorBase> alloc,
1941  GKO_HIP_STREAM_STRUCT* stream)
1942  : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
1943  {
1944  this->get_exec_info().device_id = device_id;
1945  this->get_exec_info().num_computing_units = 0;
1946  this->get_exec_info().num_pu_per_cu = 0;
1947  this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1948  this->set_gpu_property();
1949  this->init_handles();
1950  }
1951 
1952  void* raw_alloc(size_type size) const override;
1953 
1954  void raw_free(void* ptr) const noexcept override;
1955 
1956  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1957 
1958  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1959 
1960  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1961 
1962  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1963 
1964  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1965 
1966  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1967 
1968  void populate_exec_info(const machine_topology* mach_topo) override;
1969 
1970 private:
1971  std::shared_ptr<Executor> master_;
1972 
1973  template <typename T>
1974  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1975  handle_manager<hipblasContext> hipblas_handle_;
1976  handle_manager<hipsparseContext> hipsparse_handle_;
1977  std::shared_ptr<HipAllocatorBase> alloc_;
1978  GKO_HIP_STREAM_STRUCT* stream_;
1979 };
1980 
1981 
1982 namespace kernels {
1983 namespace hip {
1984 using DefaultExecutor = HipExecutor;
1985 } // namespace hip
1986 } // namespace kernels
1987 
1988 
1995 class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
1996  public std::enable_shared_from_this<DpcppExecutor> {
1997  friend class detail::ExecutorBase<DpcppExecutor>;
1998 
1999 public:
2000  using Executor::run;
2001 
2011  static std::shared_ptr<DpcppExecutor> create(
2012  int device_id, std::shared_ptr<Executor> master,
2013  std::string device_type = "all",
2014  dpcpp_queue_property property = dpcpp_queue_property::in_order);
2015 
2016  std::shared_ptr<Executor> get_master() noexcept override;
2017 
2018  std::shared_ptr<const Executor> get_master() const noexcept override;
2019 
2020  void synchronize() const override;
2021 
2022  scoped_device_id_guard get_scoped_device_id_guard() const override;
2023 
2024  std::string get_description() const override;
2025 
2031  int get_device_id() const noexcept
2032  {
2033  return this->get_exec_info().device_id;
2034  }
2035 
2036  sycl::queue* get_queue() const { return queue_.get(); }
2037 
2045  static int get_num_devices(std::string device_type);
2046 
2052  const std::vector<int>& get_subgroup_sizes() const noexcept
2053  {
2054  return this->get_exec_info().subgroup_sizes;
2055  }
2056 
2062  int get_num_computing_units() const noexcept
2063  {
2064  return this->get_exec_info().num_computing_units;
2065  }
2066 
2070  int get_num_subgroups() const noexcept
2071  {
2072  return this->get_exec_info().num_computing_units *
2073  this->get_exec_info().num_pu_per_cu;
2074  }
2075 
2081  const std::vector<int>& get_max_workitem_sizes() const noexcept
2082  {
2083  return this->get_exec_info().max_workitem_sizes;
2084  }
2085 
2091  int get_max_workgroup_size() const noexcept
2092  {
2093  return this->get_exec_info().max_workgroup_size;
2094  }
2095 
2101  int get_max_subgroup_size() const noexcept
2102  {
2103  return this->get_exec_info().max_subgroup_size;
2104  }
2105 
2111  std::string get_device_type() const noexcept
2112  {
2113  return this->get_exec_info().device_type;
2114  }
2115 
2116 protected:
2117  void set_device_property(
2118  dpcpp_queue_property property = dpcpp_queue_property::in_order);
2119 
2120  DpcppExecutor(
2121  int device_id, std::shared_ptr<Executor> master,
2122  std::string device_type = "all",
2123  dpcpp_queue_property property = dpcpp_queue_property::in_order)
2124  : master_(master)
2125  {
2126  std::for_each(device_type.begin(), device_type.end(),
2127  [](char& c) { c = std::tolower(c); });
2128  this->get_exec_info().device_type = std::string(device_type);
2129  this->get_exec_info().device_id = device_id;
2130  this->set_device_property(property);
2131  }
2132 
2133  void populate_exec_info(const machine_topology* mach_topo) override;
2134 
2135  void* raw_alloc(size_type size) const override;
2136 
2137  void raw_free(void* ptr) const noexcept override;
2138 
2139  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2140 
2141  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2142 
2143  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2144 
2145  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2146 
2147  bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2148 
2149  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2150 
2151 private:
2152  std::shared_ptr<Executor> master_;
2153 
2154  template <typename T>
2155  using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2156  queue_manager<sycl::queue> queue_;
2157 };
2158 
2159 
2160 namespace kernels {
2161 namespace dpcpp {
2162 using DefaultExecutor = DpcppExecutor;
2163 } // namespace dpcpp
2164 } // namespace kernels
2165 
2166 
2167 #undef GKO_OVERRIDE_RAW_COPY_TO
2168 
2169 
2170 } // namespace gko
2171 
2172 
2173 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
gko::allocation_mode
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition: executor.hpp:62
gko::CudaExecutor::get_sparselib_handle
cusparseContext * get_sparselib_handle() const
Get the cusparse handle for this executor.
Definition: executor.hpp:1686
gko::CudaExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1613
gko::executor_deleter
This is a deleter that uses an executor's free method to deallocate the data.
Definition: executor.hpp:1217
gko::HipExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1837
gko::executor_deleter::executor_deleter
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition: executor.hpp:1226
gko::CudaExecutor::get_stream
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition: executor.hpp:1714
gko::Executor::synchronize
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor::get_max_workitem_sizes
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition: executor.hpp:2081
gko::Executor::free
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition: executor.hpp:722
gko::HipExecutor::get_hipblas_handle
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition: executor.hpp:1889
gko::Executor::memory_accessible
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition: executor.hpp:892
gko::DpcppExecutor::get_device_type
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition: executor.hpp:2111
gko::HipExecutor::get_num_devices
static int get_num_devices()
Get the number of devices present on the system.
gko::HipExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1920
gko::scoped_device_id_guard
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition: scoped_device_id_guard.hpp:76
gko::DpcppExecutor::get_num_devices
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
gko::DpcppExecutor::get_num_computing_units
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition: executor.hpp:2062
gko::log_propagation_mode::automatic
Events get reported to loggers attached to the triggering object and propagating loggers (Logger::nee...
gko::Executor::remove_logger
void remove_logger(const log::Logger *logger) override
Definition: executor.hpp:851
gko::DpcppExecutor::get_num_subgroups
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition: executor.hpp:2070
gko::ReferenceExecutor::get_description
std::string get_description() const override
Definition: executor.hpp:1485
gko::size_type
std::size_t size_type
Integral type used for allocation quantities.
Definition: types.hpp:86
gko::Executor::copy_from
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition: executor.hpp:744
gko::DpcppExecutor::get_description
std::string get_description() const override
gko::Executor::get_master
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
gko::ptr_param::get
T * get() const
Definition: utils_helper.hpp:75
gko::Executor::run
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
gko::HipExecutor
This is the Executor subclass which represents the HIP enhanced device.
Definition: executor.hpp:1777
gko::CudaExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1654
gko::DpcppExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::log_propagation_mode::never
Events only get reported at loggers attached to the triggering object.
gko::CudaExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:1621
gko::ReferenceExecutor
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition: executor.hpp:1468
gko::CudaExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1638
gko::NotSupported
NotSupported is thrown in case it is not possible to perform the requested operation on the given obj...
Definition: exception.hpp:127
gko::HipExecutor::get_device_id
int get_device_id() const noexcept
Get the HIP device id of the device associated to this executor.
Definition: executor.hpp:1824
gko::HipExecutor::get_hipsparse_handle
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition: executor.hpp:1902
gko::CudaExecutor
This is the Executor subclass which represents the CUDA device.
Definition: executor.hpp:1540
gko::log_propagation_mode
log_propagation_mode
How Logger events are propagated to their Executor.
Definition: executor.hpp:34
gko::HipExecutor::create
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
gko
The Ginkgo namespace.
Definition: abstract_factory.hpp:20
gko::Executor::add_logger
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition: executor.hpp:838
gko::HipExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::CudaExecutor::get_blas_handle
cublasContext * get_blas_handle() const
Get the cublas handle for this executor.
Definition: executor.hpp:1670
gko::executor_deleter::operator()
void operator()(pointer ptr) const
Deletes the object.
Definition: executor.hpp:1235
gko::HipExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1927
gko::DpcppExecutor::get_max_subgroup_size
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition: executor.hpp:2101
gko::log::EnableLogging
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition: logger.hpp:748
gko::Operation::get_name
virtual const char * get_name() const noexcept
Returns the operation's name.
gko::DpcppExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor
This is the Executor subclass which represents a DPC++ enhanced device.
Definition: executor.hpp:1995
gko::ptr_param
This class is used for function parameters in the place of raw pointers.
Definition: utils_helper.hpp:41
gko::log::Logger
Definition: logger.hpp:75
gko::Executor::copy_val_to_host
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition: executor.hpp:809
gko::OmpExecutor
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition: executor.hpp:1386
gko::CudaExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1696
gko::HipExecutor::get_sparselib_handle
hipsparseContext * get_sparselib_handle() const
Get the hipsparse handle for this executor.
Definition: executor.hpp:1910
gko::Executor::get_description
virtual std::string get_description() const =0
gko::stop::mode
mode
The mode for the residual norm criterion.
Definition: residual_norm.hpp:36
gko::OmpExecutor::create
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition: executor.hpp:1396
gko::HipExecutor::get_description
std::string get_description() const override
gko::Executor::alloc
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition: executor.hpp:705
gko::ReferenceExecutor::run
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition: executor.hpp:1487
gko::Executor::copy
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition: executor.hpp:794
gko::HipExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:1845
gko::Executor::should_propagate_log
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagati...
Definition: executor.hpp:879
gko::DpcppExecutor::get_device_id
int get_device_id() const noexcept
Get the DPCPP device id of the device associated to this executor.
Definition: executor.hpp:2031
gko::DpcppExecutor::get_max_workgroup_size
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition: executor.hpp:2091
gko::CudaExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1646
gko::CudaExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1629
gko::log::Logger::needs_propagation
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from obje...
Definition: logger.hpp:642
gko::CudaExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1706
gko::Executor
The first step in using the Ginkgo library consists of creating an executor.
Definition: executor.hpp:615
gko::HipExecutor::get_blas_handle
hipblasContext * get_blas_handle() const
Get the hipblas handle for this executor.
Definition: executor.hpp:1894
gko::HipExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1869
gko::HipExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1853
gko::DpcppExecutor::create
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
gko::machine_topology::get_instance
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition: machine_topology.hpp:182
gko::HipExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::Operation
Operations can be used to define functionalities whose implementations differ among devices.
Definition: executor.hpp:258
gko::Executor::set_log_propagation_mode
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition: executor.hpp:867
gko::DpcppExecutor::get_subgroup_sizes
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition: executor.hpp:2052
gko::HipExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1878
gko::HipExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1861
gko::CudaExecutor::get_device_id
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition: executor.hpp:1600
gko::Executor::run
void run(std::string name, const ClosureReference &op_ref, const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition: executor.hpp:683