Ginkgo 1.9.0
A numerical linear algebra library targeting many-core architectures
executor.hpp
1 // SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
2 //
3 // SPDX-License-Identifier: BSD-3-Clause
4 
5 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
7 
8 
9 #include <array>
10 #include <atomic>
11 #include <iostream>
12 #include <memory>
13 #include <mutex>
14 #include <sstream>
15 #include <string>
16 #include <tuple>
17 #include <type_traits>
18 #include <vector>
19 
20 #include <ginkgo/core/base/device.hpp>
21 #include <ginkgo/core/base/fwd_decls.hpp>
22 #include <ginkgo/core/base/machine_topology.hpp>
23 #include <ginkgo/core/base/memory.hpp>
24 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
25 #include <ginkgo/core/base/types.hpp>
26 #include <ginkgo/core/log/logger.hpp>
27 #include <ginkgo/core/synthesizer/containers.hpp>
28 
29 
30 namespace gko {
31 
32 
34 enum class log_propagation_mode {
40  never,
46  automatic
47 };
48 
49 
62 enum class allocation_mode { device, unified_global, unified_host };
63 
64 
65 #ifdef NDEBUG
66 
67 // When in release, prefer device allocations
68 constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
69 
70 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
71 
72 #else
73 
74 // When in debug, always use UM allocations.
75 constexpr allocation_mode default_cuda_alloc_mode =
76  allocation_mode::unified_global;
77 
78 #if (GINKGO_HIP_PLATFORM_HCC == 1)
79 
80 // HIP on AMD GPUs does not support UM, so always prefer device allocations.
81 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
82 
83 #else
84 
85 // HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
86 constexpr allocation_mode default_hip_alloc_mode =
87  allocation_mode::unified_global;
88 
89 #endif
90 
91 #endif
92 
93 
94 } // namespace gko
95 
96 
101 enum class dpcpp_queue_property {
105  in_order = 1,
106 
110  enable_profiling = 2
111 };
112 
113 GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114  dpcpp_queue_property b)
115 {
116  return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
117  static_cast<int>(b));
118 }
119 
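Because the enumerators occupy distinct bits, several properties can be combined through the overloaded operator| above; a minimal sketch:

    // request an in-order queue that also records profiling information
    auto props = dpcpp_queue_property::in_order |
                 dpcpp_queue_property::enable_profiling;

The combined value can later be passed to DpcppExecutor::create, declared further below.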
120 
121 namespace gko {
122 
123 
124 #define GKO_FORWARD_DECLARE(_type, ...) class _type
125 
126 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
127 
128 #undef GKO_FORWARD_DECLARE
129 
130 
131 class ReferenceExecutor;
132 
133 
134 namespace detail {
135 
136 
137 template <typename>
138 class ExecutorBase;
139 
140 
141 } // namespace detail
142 
143 
258 class Operation {
259 public:
260 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261  virtual void run(std::shared_ptr<const _type>) const
262 
263  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
264 
265 #undef GKO_DECLARE_RUN_OVERLOAD
266 
267  // ReferenceExecutor overload can be defaulted to OmpExecutor's
268  virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
269 
275  virtual const char* get_name() const noexcept;
276 };
277 
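For device-specific functionality, a user can derive from Operation and override only the overloads that are actually supported; a sketch with a hypothetical my_op class (overloads that are not overridden keep the base-class behavior):

    class my_op : public gko::Operation {
    public:
        void run(std::shared_ptr<const gko::OmpExecutor>) const override
        {
            // OpenMP implementation goes here
        }
        void run(std::shared_ptr<const gko::CudaExecutor>) const override
        {
            // CUDA implementation goes here
        }
        const char* get_name() const noexcept override { return "my_op"; }
    };

An instance is then passed to Executor::run(const Operation&), which dispatches to the overload matching the executor's dynamic type.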
278 
279 namespace detail {
280 
281 
291 template <typename Closure>
292 class RegisteredOperation : public Operation {
293 public:
300  RegisteredOperation(const char* name, Closure op)
301  : name_(name), op_(std::move(op))
302  {}
303 
304  const char* get_name() const noexcept override { return name_; }
305 
306  void run(std::shared_ptr<const ReferenceExecutor> exec) const override
307  {
308  op_(exec);
309  }
310 
311  void run(std::shared_ptr<const OmpExecutor> exec) const override
312  {
313  op_(exec);
314  }
315 
316  void run(std::shared_ptr<const CudaExecutor> exec) const override
317  {
318  op_(exec);
319  }
320 
321  void run(std::shared_ptr<const HipExecutor> exec) const override
322  {
323  op_(exec);
324  }
325 
326  void run(std::shared_ptr<const DpcppExecutor> exec) const override
327  {
328  op_(exec);
329  }
330 
331 private:
332  const char* name_;
333  Closure op_;
334 };
335 
336 
337 template <typename Closure>
338 RegisteredOperation<Closure> make_register_operation(const char* name,
339  Closure op)
340 {
341  return RegisteredOperation<Closure>{name, std::move(op)};
342 }
343 
344 
345 } // namespace detail
346 
347 
419 #define GKO_REGISTER_OPERATION(_name, _kernel) \
420  template <typename... Args> \
421  auto make_##_name(Args&&... args) \
422  { \
423  return ::gko::detail::make_register_operation( \
424  #_kernel, [&args...](auto exec) { \
425  using exec_type = decltype(exec); \
426  if (std::is_same< \
427  exec_type, \
428  std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
429  value) { \
430  ::gko::kernels::reference::_kernel( \
431  std::dynamic_pointer_cast< \
432  const ::gko::ReferenceExecutor>(exec), \
433  std::forward<Args>(args)...); \
434  } else if (std::is_same< \
435  exec_type, \
436  std::shared_ptr<const ::gko::OmpExecutor>>:: \
437  value) { \
438  ::gko::kernels::omp::_kernel( \
439  std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
440  exec), \
441  std::forward<Args>(args)...); \
442  } else if (std::is_same< \
443  exec_type, \
444  std::shared_ptr<const ::gko::CudaExecutor>>:: \
445  value) { \
446  ::gko::kernels::cuda::_kernel( \
447  std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
448  exec), \
449  std::forward<Args>(args)...); \
450  } else if (std::is_same< \
451  exec_type, \
452  std::shared_ptr<const ::gko::HipExecutor>>:: \
453  value) { \
454  ::gko::kernels::hip::_kernel( \
455  std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
456  exec), \
457  std::forward<Args>(args)...); \
458  } else if (std::is_same< \
459  exec_type, \
460  std::shared_ptr<const ::gko::DpcppExecutor>>:: \
461  value) { \
462  ::gko::kernels::dpcpp::_kernel( \
463  std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
464  exec), \
465  std::forward<Args>(args)...); \
466  } else { \
467  GKO_NOT_IMPLEMENTED; \
468  } \
469  }); \
470  } \
471  static_assert(true, \
472  "This assert is used to counter the false positive extra " \
473  "semi-colon warnings")
474 
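The macro is typically used as follows; my_kernel is a hypothetical kernel assumed to be implemented in each of the gko::kernels::{reference,omp,cuda,hip,dpcpp} namespaces:

    namespace my_ns {
    GKO_REGISTER_OPERATION(my_kernel_op, my_kernel);
    }  // namespace my_ns

    void my_function(std::shared_ptr<const gko::Executor> exec, int arg)
    {
        // runs the my_kernel variant matching the executor's type
        exec->run(my_ns::make_my_kernel_op(arg));
    }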
475 
513 #define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
514  template <typename... Args> \
515  auto make_##_name(Args&&... args) \
516  { \
517  return ::gko::detail::make_register_operation( \
518  #_kernel, \
519  [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
520  } \
521  static_assert(true, \
522  "This assert is used to counter the false positive extra " \
523  "semi-colon warnings")
524 
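This variant wraps a plain host-side function so that running it is still logged like any other Operation; a sketch with a hypothetical function compute_on_host:

    void compute_on_host(int arg);  // hypothetical host function

    GKO_REGISTER_HOST_OPERATION(compute_on_host, compute_on_host);

    void my_function(std::shared_ptr<const gko::Executor> exec, int arg)
    {
        exec->run(make_compute_on_host(arg));  // executes on the host
    }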
525 
526 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
527 
615 class Executor : public log::EnableLogging<Executor> {
616  template <typename T>
617  friend class detail::ExecutorBase;
618 
619  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
620  friend class ReferenceExecutor;
621 
622 public:
623  virtual ~Executor() = default;
624 
625  Executor() = default;
626  Executor(Executor&) = delete;
627  Executor(Executor&&) = delete;
628  Executor& operator=(Executor&) = delete;
629  Executor& operator=(Executor&&) = delete;
630 
636  virtual void run(const Operation& op) const = 0;
637 
652  template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
653  typename ClosureDpcpp>
654  void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
655  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
656  {
657  LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
658  op_omp, op_cuda, op_hip, op_dpcpp);
659  this->run(op);
660  }
661 
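For one-off operations, this closure overload avoids defining a full Operation subclass; a minimal sketch, assuming exec is any std::shared_ptr<const gko::Executor>:

    exec->run(
        [&] { /* OpenMP and Reference path */ },
        [&] { /* CUDA path */ },
        [&] { /* HIP path */ },
        [&] { /* DPC++ path */ });

Note that the OpenMP closure is also used on a ReferenceExecutor, mirroring the LambdaOperation class further below.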
673  template <typename T>
674  T* alloc(size_type num_elems) const
675  {
676  this->template log<log::Logger::allocation_started>(
677  this, num_elems * sizeof(T));
678  T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
679  this->template log<log::Logger::allocation_completed>(
680  this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
681  return allocated;
682  }
683 
691  void free(void* ptr) const noexcept
692  {
693  this->template log<log::Logger::free_started>(
694  this, reinterpret_cast<uintptr>(ptr));
695  this->raw_free(ptr);
696  this->template log<log::Logger::free_completed>(
697  this, reinterpret_cast<uintptr>(ptr));
698  }
699 
712  template <typename T>
713  void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
714  const T* src_ptr, T* dest_ptr) const
715  {
716  const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
717  const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
718  this->template log<log::Logger::copy_started>(
719  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
720  if (this != src_exec.get()) {
721  src_exec->template log<log::Logger::copy_started>(
722  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
723  }
724  try {
725  this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
726  dest_ptr);
727  } catch (NotSupported&) {
728 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
729  // Unoptimized copy. Try to go through the masters.
730  // output to log when verbose >= 1 and debug build
731  std::clog << "Not direct copy. Try to copy data from the masters."
732  << std::endl;
733 #endif
734  auto src_master = src_exec->get_master().get();
735  if (num_elems > 0 && src_master != src_exec.get()) {
736  auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
737  src_master->copy_from<T>(src_exec, num_elems, src_ptr,
738  master_ptr);
739  this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
740  src_master->free(master_ptr);
741  }
742  }
743  this->template log<log::Logger::copy_completed>(
744  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
745  if (this != src_exec.get()) {
746  src_exec->template log<log::Logger::copy_completed>(
747  src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
748  }
749  }
750 
762  template <typename T>
763  void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
764  {
765  this->copy_from(this, num_elems, src_ptr, dest_ptr);
766  }
767 
777  template <typename T>
778  T copy_val_to_host(const T* ptr) const
779  {
780  T out{};
781  this->get_master()->copy_from(this, 1, ptr, &out);
782  return out;
783  }
784 
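A round trip through these memory primitives, sketched with a ReferenceExecutor standing in for an arbitrary device executor:

    std::shared_ptr<const gko::Executor> host = gko::ReferenceExecutor::create();
    std::shared_ptr<const gko::Executor> exec = host;  // could be a CudaExecutor
    int src[3] = {1, 2, 3};
    int* ptr = exec->alloc<int>(3);           // allocate 3 ints on exec
    exec->copy_from(host, 3, src, ptr);       // copy from host memory to exec
    int first = exec->copy_val_to_host(ptr);  // read back a single element
    exec->free(ptr);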
789  virtual std::shared_ptr<Executor> get_master() noexcept = 0;
790 
794  virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
795 
799  virtual void synchronize() const = 0;
800 
807  void add_logger(std::shared_ptr<const log::Logger> logger) override
808  {
809  this->propagating_logger_refcount_.fetch_add(
810  logger->needs_propagation() ? 1 : 0);
811  this->EnableLogging<Executor>::add_logger(logger);
812  }
813 
820  void remove_logger(const log::Logger* logger) override
821  {
822  this->propagating_logger_refcount_.fetch_sub(
823  logger->needs_propagation() ? 1 : 0);
824  this->EnableLogging<Executor>::remove_logger(logger);
825  }
826 
827  using EnableLogging<Executor>::remove_logger;
828 
836  void set_log_propagation_mode(log_propagation_mode mode)
837  {
838  log_propagation_mode_ = mode;
839  }
840 
848  bool should_propagate_log() const
849  {
850  return this->propagating_logger_refcount_.load() > 0 &&
851  log_propagation_mode_ == log_propagation_mode::automatic;
852  }
853 
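In other words, propagation can be switched off wholesale per executor, regardless of how many propagating loggers are attached; a small sketch:

    auto exec = gko::OmpExecutor::create();
    // events on objects created via exec are no longer forwarded to the
    // loggers attached to exec
    exec->set_log_propagation_mode(gko::log_propagation_mode::never);
    bool propagates = exec->should_propagate_log();  // now false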
861  bool memory_accessible(const std::shared_ptr<const Executor>& other) const
862  {
863  return this->verify_memory_from(other.get());
864  }
865 
866  virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
867 
869  virtual std::string get_description() const = 0;
870 
871 protected:
876  struct exec_info {
880  int device_id = -1;
881 
885  std::string device_type;
886 
890  int numa_node = -1;
891 
900  int num_computing_units = -1;
901 
913  int num_pu_per_cu = -1;
914 
923  std::vector<int> subgroup_sizes{};
924 
933  int max_subgroup_size = -1;
934 
945  std::vector<int> max_workitem_sizes{};
946 
956  int max_workgroup_size = -1;
957 
961  int major = -1;
962 
966  int minor = -1;
967 
973  std::string pci_bus_id = std::string(13, 'x');
974 
985  std::vector<int> closest_pu_ids{};
986  };
987 
993  const exec_info& get_exec_info() const { return this->exec_info_; }
994 
1004  virtual void* raw_alloc(size_type size) const = 0;
1005 
1013  virtual void raw_free(void* ptr) const noexcept = 0;
1014 
1025  virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1026  const void* src_ptr, void* dest_ptr) const = 0;
1027 
1037 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1038  virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1039  const void* src_ptr, void* dest_ptr) const = 0
1040 
1041  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1042 
1043 #undef GKO_ENABLE_RAW_COPY_TO
1044 
1052  virtual bool verify_memory_from(const Executor* src_exec) const = 0;
1053 
1063 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1064  virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1065 
1066  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1067 
1068  GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1069 
1070 #undef GKO_ENABLE_VERIFY_MEMORY_TO
1071 
1078  virtual void populate_exec_info(const machine_topology* mach_topo) = 0;
1079 
1085  exec_info& get_exec_info() { return this->exec_info_; }
1086 
1087  exec_info exec_info_;
1088 
1089  log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};
1090 
1091  std::atomic<int> propagating_logger_refcount_{};
1092 
1093 private:
1108  template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
1109  typename ClosureDpcpp>
1110  class LambdaOperation : public Operation {
1111  public:
1122  LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1123  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1124  : op_omp_(op_omp),
1125  op_cuda_(op_cuda),
1126  op_hip_(op_hip),
1127  op_dpcpp_(op_dpcpp)
1128  {}
1129 
1130  void run(std::shared_ptr<const OmpExecutor>) const override
1131  {
1132  op_omp_();
1133  }
1134 
1135  void run(std::shared_ptr<const ReferenceExecutor>) const override
1136  {
1137  op_omp_();
1138  }
1139 
1140  void run(std::shared_ptr<const CudaExecutor>) const override
1141  {
1142  op_cuda_();
1143  }
1144 
1145  void run(std::shared_ptr<const HipExecutor>) const override
1146  {
1147  op_hip_();
1148  }
1149 
1150  void run(std::shared_ptr<const DpcppExecutor>) const override
1151  {
1152  op_dpcpp_();
1153  }
1154 
1155  private:
1156  ClosureOmp op_omp_;
1157  ClosureCuda op_cuda_;
1158  ClosureHip op_hip_;
1159  ClosureDpcpp op_dpcpp_;
1160  };
1161 };
1162 
1163 
1172 template <typename T>
1173 class executor_deleter {
1174 public:
1175  using pointer = T*;
1176 
1182  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1183  : exec_{exec}
1184  {}
1185 
1191  void operator()(pointer ptr) const
1192  {
1193  if (exec_) {
1194  exec_->free(ptr);
1195  }
1196  }
1197 
1198 private:
1199  std::shared_ptr<const Executor> exec_;
1200 };
1201 
1202 // a specialization for arrays
1203 template <typename T>
1204 class executor_deleter<T[]> {
1205 public:
1206  using pointer = T[];
1207 
1208  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1209  : exec_{exec}
1210  {}
1211 
1212  void operator()(pointer ptr) const
1213  {
1214  if (exec_) {
1215  exec_->free(ptr);
1216  }
1217  }
1218 
1219 private:
1220  std::shared_ptr<const Executor> exec_;
1221 };
1222 
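The deleter makes executor allocations compatible with standard smart pointers; a sketch using the scalar specialization (since the deleter calls Executor::free rather than delete, a single unique_ptr can own a whole buffer):

    auto exec = gko::ReferenceExecutor::create();
    std::unique_ptr<int, gko::executor_deleter<int>> buffer{
        exec->alloc<int>(16), gko::executor_deleter<int>{exec}};
    // buffer.get() points to 16 ints; freed via exec->free() on scope exit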
1223 
1224 namespace detail {
1225 
1226 
1227 template <typename ConcreteExecutor>
1228 class ExecutorBase : public Executor {
1229  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
1230  friend class ReferenceExecutor;
1231 
1232 public:
1233  using Executor::run;
1234 
1235  void run(const Operation& op) const override
1236  {
1237  this->template log<log::Logger::operation_launched>(this, &op);
1238  auto scope_guard = get_scoped_device_id_guard();
1239  op.run(self()->shared_from_this());
1240  this->template log<log::Logger::operation_completed>(this, &op);
1241  }
1242 
1243 protected:
1244  void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1245  const void* src_ptr, void* dest_ptr) const override
1246  {
1247  src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1248  }
1249 
1250  virtual bool verify_memory_from(const Executor* src_exec) const override
1251  {
1252  return src_exec->verify_memory_to(self());
1253  }
1254 
1255 private:
1256  ConcreteExecutor* self() noexcept
1257  {
1258  return static_cast<ConcreteExecutor*>(this);
1259  }
1260 
1261  const ConcreteExecutor* self() const noexcept
1262  {
1263  return static_cast<const ConcreteExecutor*>(this);
1264  }
1265 };
1266 
1267 #undef GKO_DECLARE_EXECUTOR_FRIEND
1268 
1269 
1277 class EnableDeviceReset {
1278 public:
1284  GKO_DEPRECATED(
1285  "device_reset is no longer supported, call "
1286  "cudaDeviceReset/hipDeviceReset manually")
1287  void set_device_reset(bool device_reset) {}
1288 
1294  GKO_DEPRECATED(
1295  "device_reset is no longer supported, call "
1296  "cudaDeviceReset/hipDeviceReset manually")
1297  bool get_device_reset() { return false; }
1298 
1299 protected:
1305  EnableDeviceReset() {}
1306 
1307  GKO_DEPRECATED(
1308  "device_reset is no longer supported, call "
1309  "cudaDeviceReset/hipDeviceReset manually")
1310  EnableDeviceReset(bool device_reset) {}
1311 };
1312 
1313 
1314 } // namespace detail
1315 
1316 
1317 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1318  void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1319  const void* src_ptr, void* dest_ptr) const override
1320 
1321 
1322 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1323  virtual bool verify_memory_to(const dest_* other) const override \
1324  { \
1325  return bool_; \
1326  } \
1327  static_assert(true, \
1328  "This assert is used to counter the false positive extra " \
1329  "semi-colon warnings")
1330 
1331 
1339 class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
1340  public std::enable_shared_from_this<OmpExecutor> {
1341  friend class detail::ExecutorBase<OmpExecutor>;
1342 
1343 public:
1347  static std::shared_ptr<OmpExecutor> create(
1348  std::shared_ptr<CpuAllocatorBase> alloc =
1349  std::make_shared<CpuAllocator>())
1350  {
1351  return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
1352  }
1353 
1354  std::shared_ptr<Executor> get_master() noexcept override;
1355 
1356  std::shared_ptr<const Executor> get_master() const noexcept override;
1357 
1358  void synchronize() const override;
1359 
1360  int get_num_cores() const
1361  {
1362  return this->get_exec_info().num_computing_units;
1363  }
1364 
1365  int get_num_threads_per_core() const
1366  {
1367  return this->get_exec_info().num_pu_per_cu;
1368  }
1369 
1370  static int get_num_omp_threads();
1371 
1372  scoped_device_id_guard get_scoped_device_id_guard() const override;
1373 
1374  std::string get_description() const override;
1375 
1376 protected:
1377  OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1378  : alloc_{std::move(alloc)}
1379  {
1380  this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
1381  }
1382 
1383  void populate_exec_info(const machine_topology* mach_topo) override;
1384 
1385  void* raw_alloc(size_type size) const override;
1386 
1387  void raw_free(void* ptr) const noexcept override;
1388 
1389  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1390 
1391  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
1392 
1393  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1394 
1395  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1396 
1397  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1398 
1399  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1400 
1401  std::shared_ptr<CpuAllocatorBase> alloc_;
1402 };
1403 
1404 
1405 namespace kernels {
1406 namespace omp {
1407 using DefaultExecutor = OmpExecutor;
1408 } // namespace omp
1409 } // namespace kernels
1410 
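Creating the OpenMP executor requires no device information; it also serves as its own master. A minimal sketch:

    auto omp = gko::OmpExecutor::create();
    int cores = omp->get_num_cores();
    int threads = omp->get_num_threads_per_core();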
1411 
1419 class ReferenceExecutor : public OmpExecutor {
1420 public:
1421  static std::shared_ptr<ReferenceExecutor> create(
1422  std::shared_ptr<CpuAllocatorBase> alloc =
1423  std::make_shared<CpuAllocator>())
1424  {
1425  return std::shared_ptr<ReferenceExecutor>(
1426  new ReferenceExecutor(std::move(alloc)));
1427  }
1428 
1429  scoped_device_id_guard get_scoped_device_id_guard() const override
1430  {
1431  return {this, 0};
1432  }
1433 
1434  std::string get_description() const override { return "ReferenceExecutor"; }
1435 
1436  void run(const Operation& op) const override
1437  {
1438  this->template log<log::Logger::operation_launched>(this, &op);
1439  op.run(std::static_pointer_cast<const ReferenceExecutor>(
1440  this->shared_from_this()));
1441  this->template log<log::Logger::operation_completed>(this, &op);
1442  }
1443 
1444 protected:
1445  ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1446  : OmpExecutor{std::move(alloc)}
1447  {
1448  this->ReferenceExecutor::populate_exec_info(
1449  machine_topology::get_instance());
1450  }
1451 
1452  void populate_exec_info(const machine_topology*) override
1453  {
1454  this->get_exec_info().device_id = -1;
1455  this->get_exec_info().num_computing_units = 1;
1456  this->get_exec_info().num_pu_per_cu = 1;
1457  }
1458 
1459  bool verify_memory_from(const Executor* src_exec) const override
1460  {
1461  return src_exec->verify_memory_to(this);
1462  }
1463 
1464  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1465 
1466  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1467 
1468  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1469 
1470  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1471 
1472  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1473 };
1474 
1475 
1476 namespace kernels {
1477 namespace reference {
1478 using DefaultExecutor = ReferenceExecutor;
1479 } // namespace reference
1480 } // namespace kernels
1481 
1482 
1489 class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1490  public std::enable_shared_from_this<CudaExecutor>,
1491  public detail::EnableDeviceReset {
1492  friend class detail::ExecutorBase<CudaExecutor>;
1493 
1494 public:
1506  GKO_DEPRECATED(
1507  "calling this CudaExecutor::create method is deprecated, because "
1508  "device_reset no longer has an effect; "
1509  "call CudaExecutor::create("
1510  " int device_id, std::shared_ptr<Executor> master,"
1511  " std::shared_ptr<CudaAllocatorBase> alloc,"
1512  " CUstream_st* stream) "
1513  "instead")
1514  static std::shared_ptr<CudaExecutor> create(
1515  int device_id, std::shared_ptr<Executor> master, bool device_reset,
1516  allocation_mode alloc_mode = default_cuda_alloc_mode,
1517  CUstream_st* stream = nullptr);
1518 
1528  static std::shared_ptr<CudaExecutor> create(
1529  int device_id, std::shared_ptr<Executor> master,
1530  std::shared_ptr<CudaAllocatorBase> alloc =
1531  std::make_shared<CudaAllocator>(),
1532  CUstream_st* stream = nullptr);
1533 
1534  std::shared_ptr<Executor> get_master() noexcept override;
1535 
1536  std::shared_ptr<const Executor> get_master() const noexcept override;
1537 
1538  void synchronize() const override;
1539 
1540  scoped_device_id_guard get_scoped_device_id_guard() const override;
1541 
1542  std::string get_description() const override;
1543 
1547  int get_device_id() const noexcept
1548  {
1549  return this->get_exec_info().device_id;
1550  }
1551 
1555  static int get_num_devices();
1556 
1560  int get_num_warps_per_sm() const noexcept
1561  {
1562  return this->get_exec_info().num_pu_per_cu;
1563  }
1564 
1568  int get_num_multiprocessor() const noexcept
1569  {
1570  return this->get_exec_info().num_computing_units;
1571  }
1572 
1576  int get_num_warps() const noexcept
1577  {
1578  return this->get_exec_info().num_computing_units *
1579  this->get_exec_info().num_pu_per_cu;
1580  }
1581 
1585  int get_warp_size() const noexcept
1586  {
1587  return this->get_exec_info().max_subgroup_size;
1588  }
1589 
1593  int get_major_version() const noexcept
1594  {
1595  return this->get_exec_info().major;
1596  }
1597 
1601  int get_minor_version() const noexcept
1602  {
1603  return this->get_exec_info().minor;
1604  }
1605 
1611  GKO_DEPRECATED("use get_blas_handle() instead")
1612  cublasContext* get_cublas_handle() const { return get_blas_handle(); }
1613 
1617  cublasContext* get_blas_handle() const { return cublas_handle_.get(); }
1618 
1624  GKO_DEPRECATED("use get_sparselib_handle() instead")
1625  cusparseContext* get_cusparse_handle() const
1626  {
1627  return get_sparselib_handle();
1628  }
1629 
1633  cusparseContext* get_sparselib_handle() const
1634  {
1635  return cusparse_handle_.get();
1636  }
1637 
1643  std::vector<int> get_closest_pus() const
1644  {
1645  return this->get_exec_info().closest_pu_ids;
1646  }
1647 
1653  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1654 
1661  CUstream_st* get_stream() const { return stream_; }
1662 
1663 protected:
1664  void set_gpu_property();
1665 
1666  void init_handles();
1667 
1668  CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1669  std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1670  : alloc_{std::move(alloc)}, master_(master), stream_{stream}
1671  {
1672  this->get_exec_info().device_id = device_id;
1673  this->get_exec_info().num_computing_units = 0;
1674  this->get_exec_info().num_pu_per_cu = 0;
1675  this->CudaExecutor::populate_exec_info(
1676  machine_topology::get_instance());
1677  this->set_gpu_property();
1678  this->init_handles();
1679  }
1680 
1681  void* raw_alloc(size_type size) const override;
1682 
1683  void raw_free(void* ptr) const noexcept override;
1684 
1685  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1686 
1687  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1688 
1689  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1690 
1691  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1692 
1693  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1694 
1695  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1696 
1697  void populate_exec_info(const machine_topology* mach_topo) override;
1698 
1699 private:
1700  std::shared_ptr<Executor> master_;
1701 
1702  template <typename T>
1703  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1704  handle_manager<cublasContext> cublas_handle_;
1705  handle_manager<cusparseContext> cusparse_handle_;
1706  std::shared_ptr<CudaAllocatorBase> alloc_;
1707  CUstream_st* stream_;
1708 };
1709 
1710 
1711 namespace kernels {
1712 namespace cuda {
1713 using DefaultExecutor = CudaExecutor;
1714 } // namespace cuda
1715 } // namespace kernels
1716 
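Device executors are created with a device id and a host (master) executor; the allocator and stream arguments default as declared above. A sketch (HipExecutor below follows the same pattern):

    auto cuda = gko::CudaExecutor::create(0 /* device id */,
                                          gko::OmpExecutor::create());
    auto handle = cuda->get_blas_handle();  // cuBLAS handle owned by cuda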
1717 
1724 class HipExecutor : public detail::ExecutorBase<HipExecutor>,
1725  public std::enable_shared_from_this<HipExecutor>,
1726  public detail::EnableDeviceReset {
1727  friend class detail::ExecutorBase<HipExecutor>;
1728 
1729 public:
1741  GKO_DEPRECATED(
1742  "device_reset is deprecated entirely, call hipDeviceReset directly. "
1743  "alloc_mode was replaced by the Allocator type "
1744  "hierarchy.")
1745  static std::shared_ptr<HipExecutor> create(
1746  int device_id, std::shared_ptr<Executor> master, bool device_reset,
1747  allocation_mode alloc_mode = default_hip_alloc_mode,
1748  GKO_HIP_STREAM_STRUCT* stream = nullptr);
1749 
1750  static std::shared_ptr<HipExecutor> create(
1751  int device_id, std::shared_ptr<Executor> master,
1752  std::shared_ptr<HipAllocatorBase> alloc =
1753  std::make_shared<HipAllocator>(),
1754  GKO_HIP_STREAM_STRUCT* stream = nullptr);
1755 
1756  std::shared_ptr<Executor> get_master() noexcept override;
1757 
1758  std::shared_ptr<const Executor> get_master() const noexcept override;
1759 
1760  void synchronize() const override;
1761 
1762  scoped_device_id_guard get_scoped_device_id_guard() const override;
1763 
1764  std::string get_description() const override;
1765 
1769  int get_device_id() const noexcept
1770  {
1771  return this->get_exec_info().device_id;
1772  }
1773 
1777  static int get_num_devices();
1778 
1782  int get_num_warps_per_sm() const noexcept
1783  {
1784  return this->get_exec_info().num_pu_per_cu;
1785  }
1786 
1790  int get_num_multiprocessor() const noexcept
1791  {
1792  return this->get_exec_info().num_computing_units;
1793  }
1794 
1798  int get_major_version() const noexcept
1799  {
1800  return this->get_exec_info().major;
1801  }
1802 
1806  int get_minor_version() const noexcept
1807  {
1808  return this->get_exec_info().minor;
1809  }
1810 
1814  int get_num_warps() const noexcept
1815  {
1816  return this->get_exec_info().num_computing_units *
1817  this->get_exec_info().num_pu_per_cu;
1818  }
1819 
1823  int get_warp_size() const noexcept
1824  {
1825  return this->get_exec_info().max_subgroup_size;
1826  }
1827 
1833  GKO_DEPRECATED("use get_blas_handle() instead")
1834  hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }
1835 
1839  hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }
1840 
1846  GKO_DEPRECATED("use get_sparselib_handle() instead")
1847  hipsparseContext* get_hipsparse_handle() const
1848  {
1849  return get_sparselib_handle();
1850  }
1851 
1855  hipsparseContext* get_sparselib_handle() const
1856  {
1857  return hipsparse_handle_.get();
1858  }
1859 
1865  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1866 
1872  std::vector<int> get_closest_pus() const
1873  {
1874  return this->get_exec_info().closest_pu_ids;
1875  }
1876 
1877  GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
1878 
1879 protected:
1880  void set_gpu_property();
1881 
1882  void init_handles();
1883 
1884  HipExecutor(int device_id, std::shared_ptr<Executor> master,
1885  std::shared_ptr<HipAllocatorBase> alloc,
1886  GKO_HIP_STREAM_STRUCT* stream)
1887  : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
1888  {
1889  this->get_exec_info().device_id = device_id;
1890  this->get_exec_info().num_computing_units = 0;
1891  this->get_exec_info().num_pu_per_cu = 0;
1892  this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1893  this->set_gpu_property();
1894  this->init_handles();
1895  }
1896 
1897  void* raw_alloc(size_type size) const override;
1898 
1899  void raw_free(void* ptr) const noexcept override;
1900 
1901  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1902 
1903  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1904 
1905  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1906 
1907  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1908 
1909  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1910 
1911  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1912 
1913  void populate_exec_info(const machine_topology* mach_topo) override;
1914 
1915 private:
1916  std::shared_ptr<Executor> master_;
1917 
1918  template <typename T>
1919  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1920  handle_manager<hipblasContext> hipblas_handle_;
1921  handle_manager<hipsparseContext> hipsparse_handle_;
1922  std::shared_ptr<HipAllocatorBase> alloc_;
1923  GKO_HIP_STREAM_STRUCT* stream_;
1924 };
1925 
1926 
1927 namespace kernels {
1928 namespace hip {
1929 using DefaultExecutor = HipExecutor;
1930 } // namespace hip
1931 } // namespace kernels
1932 
1933 
1940 class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
1941  public std::enable_shared_from_this<DpcppExecutor> {
1942  friend class detail::ExecutorBase<DpcppExecutor>;
1943 
1944 public:
1954  static std::shared_ptr<DpcppExecutor> create(
1955  int device_id, std::shared_ptr<Executor> master,
1956  std::string device_type = "all",
1957  dpcpp_queue_property property = dpcpp_queue_property::in_order);
1958 
1959  std::shared_ptr<Executor> get_master() noexcept override;
1960 
1961  std::shared_ptr<const Executor> get_master() const noexcept override;
1962 
1963  void synchronize() const override;
1964 
1965  scoped_device_id_guard get_scoped_device_id_guard() const override;
1966 
1967  std::string get_description() const override;
1968 
1974  int get_device_id() const noexcept
1975  {
1976  return this->get_exec_info().device_id;
1977  }
1978 
1979  sycl::queue* get_queue() const { return queue_.get(); }
1980 
1988  static int get_num_devices(std::string device_type);
1989 
1995  const std::vector<int>& get_subgroup_sizes() const noexcept
1996  {
1997  return this->get_exec_info().subgroup_sizes;
1998  }
1999 
2005  int get_num_computing_units() const noexcept
2006  {
2007  return this->get_exec_info().num_computing_units;
2008  }
2009 
2013  int get_num_subgroups() const noexcept
2014  {
2015  return this->get_exec_info().num_computing_units *
2016  this->get_exec_info().num_pu_per_cu;
2017  }
2018 
2024  const std::vector<int>& get_max_workitem_sizes() const noexcept
2025  {
2026  return this->get_exec_info().max_workitem_sizes;
2027  }
2028 
2034  int get_max_workgroup_size() const noexcept
2035  {
2036  return this->get_exec_info().max_workgroup_size;
2037  }
2038 
2044  int get_max_subgroup_size() const noexcept
2045  {
2046  return this->get_exec_info().max_subgroup_size;
2047  }
2048 
2054  std::string get_device_type() const noexcept
2055  {
2056  return this->get_exec_info().device_type;
2057  }
2058 
2059 protected:
2060  void set_device_property(
2061  dpcpp_queue_property property = dpcpp_queue_property::in_order);
2062 
2063  DpcppExecutor(
2064  int device_id, std::shared_ptr<Executor> master,
2065  std::string device_type = "all",
2066  dpcpp_queue_property property = dpcpp_queue_property::in_order)
2067  : master_(master)
2068  {
2069  std::for_each(device_type.begin(), device_type.end(),
2070  [](char& c) { c = std::tolower(c); });
2071  this->get_exec_info().device_type = std::string(device_type);
2072  this->get_exec_info().device_id = device_id;
2073  this->set_device_property(property);
2074  }
2075 
2076  void populate_exec_info(const machine_topology* mach_topo) override;
2077 
2078  void* raw_alloc(size_type size) const override;
2079 
2080  void raw_free(void* ptr) const noexcept override;
2081 
2082  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2083 
2084  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2085 
2086  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2087 
2088  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2089 
2090  bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2091 
2092  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2093 
2094 private:
2095  std::shared_ptr<Executor> master_;
2096 
2097  template <typename T>
2098  using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2099  queue_manager<sycl::queue> queue_;
2100 };
2101 
2102 
2103 namespace kernels {
2104 namespace dpcpp {
2105 using DefaultExecutor = DpcppExecutor;
2106 } // namespace dpcpp
2107 } // namespace kernels
2108 
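A sketch selecting the first SYCL device of type "gpu" and combining queue properties with the operator| defined near the top of this header:

    auto dpcpp = gko::DpcppExecutor::create(
        0, gko::OmpExecutor::create(), "gpu",
        dpcpp_queue_property::in_order | dpcpp_queue_property::enable_profiling);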
2109 
2110 #undef GKO_OVERRIDE_RAW_COPY_TO
2111 
2112 
2113 } // namespace gko
2114 
2115 
2116 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_