Ginkgo 1.5.0, generated from pipelines/375350765 (branch based on develop)
A numerical linear algebra library targeting many-core architectures
executor.hpp
1 /*******************************<GINKGO LICENSE>******************************
2 Copyright (c) 2017-2021, the Ginkgo authors
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8 
9 1. Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11 
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 
16 3. Neither the name of the copyright holder nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ******************************<GINKGO LICENSE>*******************************/
32 
33 #ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
34 #define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
35 
36 
37 #include <array>
38 #include <iostream>
39 #include <memory>
40 #include <mutex>
41 #include <sstream>
42 #include <string>
43 #include <tuple>
44 #include <type_traits>
45 #include <vector>
46 
47 
48 #include <ginkgo/core/base/device.hpp>
49 #include <ginkgo/core/base/machine_topology.hpp>
50 #include <ginkgo/core/base/types.hpp>
51 #include <ginkgo/core/log/logger.hpp>
52 #include <ginkgo/core/synthesizer/containers.hpp>
53 
54 
55 namespace gko {
56 
57 
70 enum class allocation_mode { device, unified_global, unified_host };
71 
72 
73 #ifdef NDEBUG
74 
75 // When in release, prefer device allocations
76 constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
77 
78 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
79 
80 #else
81 
82 // When in debug, always use UM allocations.
83 constexpr allocation_mode default_cuda_alloc_mode =
84  allocation_mode::unified_global;
85 
86 #if (GINKGO_HIP_PLATFORM_HCC == 1)
87 
88 // HIP on AMD GPUs does not support UM, so always prefer device allocations.
89 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
90 
91 #else
92 
93 // HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
94 constexpr allocation_mode default_hip_alloc_mode =
95  allocation_mode::unified_global;
96 
97 #endif
98 
99 #endif
100 
101 
102 } // namespace gko
103 
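A usage sketch: the selected allocation_mode is passed to the device executors' create() functions declared further below. Assuming a CUDA build with at least one device visible (device id 0 is an assumption), the release-mode default can be overridden like this:

    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        // Force unified-memory allocations even in a release build.
        auto host = gko::OmpExecutor::create();
        auto cuda = gko::CudaExecutor::create(
            0, host, /*device_reset=*/false,
            gko::allocation_mode::unified_global);
    }
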
104 inline namespace cl {
105 namespace sycl {
106 
107 class queue;
108 
109 } // namespace sycl
110 } // namespace cl
111 
112 
113 struct cublasContext;
114 
115 struct cusparseContext;
116 
117 struct hipblasContext;
118 
119 struct hipsparseContext;
120 
121 
122 namespace gko {
123 
124 
125 #define GKO_FORWARD_DECLARE(_type, ...) class _type
126 
127 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
128 
129 #undef GKO_FORWARD_DECLARE
130 
131 
132 class ReferenceExecutor;
133 
134 
135 namespace detail {
136 
137 
138 template <typename>
139 class ExecutorBase;
140 
141 
142 } // namespace detail
143 
144 
259 class Operation {
260 public:
261 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
262  virtual void run(std::shared_ptr<const _type>) const
263 
264  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
265 
266 #undef GKO_DECLARE_RUN_OVERLOAD
267 
268  // ReferenceExecutor overload can be defaulted to OmpExecutor's
269  virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
270 
276  virtual const char* get_name() const noexcept;
277 };
278 
279 
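To illustrate the double dispatch that Operation provides, a hand-written operation can override only the overloads it supports; the remaining overloads keep their base-class behavior. The class name and kernel bodies below are placeholders:

    // Placeholder example: per-backend implementations of one operation.
    class MyOperation : public gko::Operation {
    public:
        void run(std::shared_ptr<const gko::OmpExecutor>) const override
        {
            // OpenMP implementation would go here
        }
        void run(std::shared_ptr<const gko::CudaExecutor>) const override
        {
            // CUDA implementation would go here
        }
        const char* get_name() const noexcept override { return "MyOperation"; }
    };

    // Executor::run selects the matching overload at run time:
    //     exec->run(MyOperation{});
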
280 namespace detail {
281 
282 
292 template <typename Closure>
293 class RegisteredOperation : public Operation {
294 public:
301  RegisteredOperation(const char* name, int num_params, Closure op)
302  : name_(name), num_params_(num_params), op_(std::move(op))
303  {}
304 
305  const char* get_name() const noexcept override
306  {
307  static auto name = [this] {
308  std::ostringstream oss;
309  oss << name_ << '#' << num_params_;
310  return oss.str();
311  }();
312  return name.c_str();
313  }
314 
315  void run(std::shared_ptr<const ReferenceExecutor> exec) const override
316  {
317  op_(exec);
318  }
319 
320  void run(std::shared_ptr<const OmpExecutor> exec) const override
321  {
322  op_(exec);
323  }
324 
325  void run(std::shared_ptr<const CudaExecutor> exec) const override
326  {
327  op_(exec);
328  }
329 
330  void run(std::shared_ptr<const HipExecutor> exec) const override
331  {
332  op_(exec);
333  }
334 
335  void run(std::shared_ptr<const DpcppExecutor> exec) const override
336  {
337  op_(exec);
338  }
339 
340 private:
341  const char* name_;
342  int num_params_;
343  Closure op_;
344 };
345 
346 
347 template <typename Closure>
348 RegisteredOperation<Closure> make_register_operation(const char* name,
349  int num_params, Closure op)
350 {
351  return RegisteredOperation<Closure>{name, num_params, std::move(op)};
352 }
353 
354 
355 } // namespace detail
356 
357 
429 #define GKO_REGISTER_OPERATION(_name, _kernel) \
430  template <typename... Args> \
431  auto make_##_name(Args&&... args) \
432  { \
433  return ::gko::detail::make_register_operation( \
434  #_name, sizeof...(Args), [&args...](auto exec) { \
435  using exec_type = decltype(exec); \
436  if (std::is_same< \
437  exec_type, \
438  std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
439  value) { \
440  ::gko::kernels::reference::_kernel( \
441  std::dynamic_pointer_cast< \
442  const ::gko::ReferenceExecutor>(exec), \
443  std::forward<Args>(args)...); \
444  } else if (std::is_same< \
445  exec_type, \
446  std::shared_ptr<const ::gko::OmpExecutor>>:: \
447  value) { \
448  ::gko::kernels::omp::_kernel( \
449  std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
450  exec), \
451  std::forward<Args>(args)...); \
452  } else if (std::is_same< \
453  exec_type, \
454  std::shared_ptr<const ::gko::CudaExecutor>>:: \
455  value) { \
456  ::gko::kernels::cuda::_kernel( \
457  std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
458  exec), \
459  std::forward<Args>(args)...); \
460  } else if (std::is_same< \
461  exec_type, \
462  std::shared_ptr<const ::gko::HipExecutor>>:: \
463  value) { \
464  ::gko::kernels::hip::_kernel( \
465  std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
466  exec), \
467  std::forward<Args>(args)...); \
468  } else if (std::is_same< \
469  exec_type, \
470  std::shared_ptr<const ::gko::DpcppExecutor>>:: \
471  value) { \
472  ::gko::kernels::dpcpp::_kernel( \
473  std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
474  exec), \
475  std::forward<Args>(args)...); \
476  } else { \
477  GKO_NOT_IMPLEMENTED; \
478  } \
479  }); \
480  } \
481  static_assert(true, \
482  "This assert is used to counter the false positive extra " \
483  "semi-colon warnings")
484 
485 
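GKO_REGISTER_OPERATION generates a make_<name> factory whose result dispatches to a kernel defined once per backend namespace. Note that the macro branches with a run-time if rather than if constexpr, so the kernel must be declared for every backend, even those without a real implementation. A sketch in which my_kernel and its int parameter are assumptions:

    namespace gko {
    namespace kernels {
    namespace omp {
    // One backend shown; reference, cuda, hip and dpcpp need analogous
    // definitions of my_kernel.
    void my_kernel(std::shared_ptr<const OmpExecutor> exec, int x)
    {
        // OpenMP implementation
    }
    }  // namespace omp
    }  // namespace kernels

    GKO_REGISTER_OPERATION(my_kernel, my_kernel);
    }  // namespace gko

    // exec->run(gko::make_my_kernel(5)) then runs the backend version
    // matching the executor's dynamic type.
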
486 #define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
487 
575 class Executor : public log::EnableLogging<Executor> {
576  template <typename T>
577  friend class detail::ExecutorBase;
578 
579  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
580  friend class ReferenceExecutor;
581 
582 public:
583  virtual ~Executor() = default;
584 
585  Executor() = default;
586  Executor(Executor&) = delete;
587  Executor(Executor&&) = default;
588  Executor& operator=(Executor&) = delete;
589  Executor& operator=(Executor&&) = default;
590 
596  virtual void run(const Operation& op) const = 0;
597 
612  template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
613  typename ClosureDpcpp>
614  void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
615  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
616  {
617  LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
618  op_omp, op_cuda, op_hip, op_dpcpp);
619  this->run(op);
620  }
621 
633  template <typename T>
634  T* alloc(size_type num_elems) const
635  {
636  this->template log<log::Logger::allocation_started>(
637  this, num_elems * sizeof(T));
638  T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
639  this->template log<log::Logger::allocation_completed>(
640  this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
641  return allocated;
642  }
643 
651  void free(void* ptr) const noexcept
652  {
653  this->template log<log::Logger::free_started>(
654  this, reinterpret_cast<uintptr>(ptr));
655  this->raw_free(ptr);
656  this->template log<log::Logger::free_completed>(
657  this, reinterpret_cast<uintptr>(ptr));
658  }
659 
672  template <typename T>
673  void copy_from(const Executor* src_exec, size_type num_elems,
674  const T* src_ptr, T* dest_ptr) const
675  {
676  this->template log<log::Logger::copy_started>(
677  src_exec, this, reinterpret_cast<uintptr>(src_ptr),
678  reinterpret_cast<uintptr>(dest_ptr), num_elems * sizeof(T));
679  try {
680  this->raw_copy_from(src_exec, num_elems * sizeof(T), src_ptr,
681  dest_ptr);
682  } catch (NotSupported&) {
683 #if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
684  // Unoptimized copy: route the data through the master executors.
685  // Logged only when verbose >= 1 in a debug build.
686  std::clog << "Not direct copy. Try to copy data from the masters."
687  << std::endl;
688 #endif
689  auto src_master = src_exec->get_master().get();
690  if (num_elems > 0 && src_master != src_exec) {
691  auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
692  src_master->copy_from<T>(src_exec, num_elems, src_ptr,
693  master_ptr);
694  this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
695  src_master->free(master_ptr);
696  }
697  }
698  this->template log<log::Logger::copy_completed>(
699  src_exec, this, reinterpret_cast<uintptr>(src_ptr),
700  reinterpret_cast<uintptr>(dest_ptr), num_elems * sizeof(T));
701  }
702 
714  template <typename T>
715  void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
716  {
717  this->copy_from(this, num_elems, src_ptr, dest_ptr);
718  }
719 
729  template <typename T>
730  T copy_val_to_host(const T* ptr) const
731  {
732  T out{};
733  this->get_master()->copy_from(this, 1, ptr, &out);
734  return out;
735  }
736 
741  virtual std::shared_ptr<Executor> get_master() noexcept = 0;
742 
746  virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
747 
751  virtual void synchronize() const = 0;
752 
760  bool memory_accessible(const std::shared_ptr<const Executor>& other) const
761  {
762  return this->verify_memory_from(other.get());
763  }
764 
765 protected:
770  struct exec_info {
774  int device_id = -1;
775 
779  std::string device_type;
780 
784  int numa_node = -1;
785 
794  int num_computing_units = -1;
795 
805  int num_pu_per_cu = -1;
806 
815  std::vector<int> subgroup_sizes{};
816 
825  int max_subgroup_size = -1;
826 
837  std::vector<int> max_workitem_sizes{};
838 
848  int max_workgroup_size = -1;
849 
853  int major = -1;
854 
858  int minor = -1;
859 
865  std::string pci_bus_id = std::string(13, 'x');
866 
877  std::vector<int> closest_pu_ids{};
878  };
879 
885  const exec_info& get_exec_info() const { return this->exec_info_; }
886 
896  virtual void* raw_alloc(size_type size) const = 0;
897 
905  virtual void raw_free(void* ptr) const noexcept = 0;
906 
917  virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
918  const void* src_ptr, void* dest_ptr) const = 0;
919 
929 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
930  virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
931  const void* src_ptr, void* dest_ptr) const = 0
932 
933  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
934 
935 #undef GKO_ENABLE_RAW_COPY_TO
936 
944  virtual bool verify_memory_from(const Executor* src_exec) const = 0;
945 
955 #define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
956  virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
957 
958  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
959 
960  GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
961 
962 #undef GKO_ENABLE_VERIFY_MEMORY_TO
963 
970  virtual void populate_exec_info(const MachineTopology* mach_topo) = 0;
971 
977  exec_info& get_exec_info() { return this->exec_info_; }
978 
979  exec_info exec_info_;
980 
981 private:
996  template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
997  typename ClosureDpcpp>
998  class LambdaOperation : public Operation {
999  public:
1010  LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1011  const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1012  : op_omp_(op_omp),
1013  op_cuda_(op_cuda),
1014  op_hip_(op_hip),
1015  op_dpcpp_(op_dpcpp)
1016  {}
1017 
1018  void run(std::shared_ptr<const OmpExecutor>) const override
1019  {
1020  op_omp_();
1021  }
1022 
1023  void run(std::shared_ptr<const CudaExecutor>) const override
1024  {
1025  op_cuda_();
1026  }
1027 
1028  void run(std::shared_ptr<const HipExecutor>) const override
1029  {
1030  op_hip_();
1031  }
1032 
1033  void run(std::shared_ptr<const DpcppExecutor>) const override
1034  {
1035  op_dpcpp_();
1036  }
1037 
1038  private:
1039  ClosureOmp op_omp_;
1040  ClosureCuda op_cuda_;
1041  ClosureHip op_hip_;
1042  ClosureDpcpp op_dpcpp_;
1043  };
1044 };
1045 
1046 
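A sketch of the typical allocate / run / read-back cycle using the Executor interface above (the device closure bodies are placeholders):

    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        auto exec = gko::OmpExecutor::create();
        auto* data = exec->alloc<double>(100);
        exec->run([&] { data[0] = 1.0; },                 // OpenMP version
                  [&] { /* CUDA kernel launch */ },       // CUDA version
                  [&] { /* HIP kernel launch */ },        // HIP version
                  [&] { /* DPC++ queue submission */ });  // DPC++ version
        double first = exec->copy_val_to_host(data);      // first == 1.0
        exec->free(data);
    }
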
1055 template <typename T>
1056 class executor_deleter {
1057 public:
1058  using pointer = T*;
1059 
1065  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1066  : exec_{exec}
1067  {}
1068 
1074  void operator()(pointer ptr) const
1075  {
1076  if (exec_) {
1077  exec_->free(ptr);
1078  }
1079  }
1080 
1081 private:
1082  std::shared_ptr<const Executor> exec_;
1083 };
1084 
1085 // a specialization for arrays
1086 template <typename T>
1087 class executor_deleter<T[]> {
1088 public:
1089  using pointer = T[];
1090 
1091  explicit executor_deleter(std::shared_ptr<const Executor> exec)
1092  : exec_{exec}
1093  {}
1094 
1095  void operator()(pointer ptr) const
1096  {
1097  if (exec_) {
1098  exec_->free(ptr);
1099  }
1100  }
1101 
1102 private:
1103  std::shared_ptr<const Executor> exec_;
1104 };
1105 
1106 
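A sketch of pairing executor_deleter with a standard smart pointer, so that memory allocated through an Executor is also released through it:

    #include <memory>

    auto exec = gko::OmpExecutor::create();
    std::unique_ptr<double, gko::executor_deleter<double>> value{
        exec->alloc<double>(1), gko::executor_deleter<double>{exec}};
    // exec->free() runs automatically when `value` goes out of scope.
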
1107 namespace detail {
1108 
1109 
1110 template <typename ConcreteExecutor>
1111 class ExecutorBase : public Executor {
1112  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
1113  friend class ReferenceExecutor;
1114 
1115 public:
1116  void run(const Operation& op) const override
1117  {
1118  this->template log<log::Logger::operation_launched>(this, &op);
1119  op.run(self()->shared_from_this());
1120  this->template log<log::Logger::operation_completed>(this, &op);
1121  }
1122 
1123 protected:
1124  void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1125  const void* src_ptr, void* dest_ptr) const override
1126  {
1127  src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1128  }
1129 
1130  virtual bool verify_memory_from(const Executor* src_exec) const override
1131  {
1132  return src_exec->verify_memory_to(self());
1133  }
1134 
1135 private:
1136  ConcreteExecutor* self() noexcept
1137  {
1138  return static_cast<ConcreteExecutor*>(this);
1139  }
1140 
1141  const ConcreteExecutor* self() const noexcept
1142  {
1143  return static_cast<const ConcreteExecutor*>(this);
1144  }
1145 };
1146 
1147 #undef GKO_DECLARE_EXECUTOR_FRIEND
1148 
1149 
1157 class EnableDeviceReset {
1158 public:
1164  void set_device_reset(bool device_reset) { device_reset_ = device_reset; }
1165 
1171  bool get_device_reset() { return device_reset_; }
1172 
1173 protected:
1179  EnableDeviceReset(bool device_reset = false) : device_reset_{device_reset}
1180  {}
1181 
1182 private:
1183  bool device_reset_{};
1184 };
1185 
1186 
1187 } // namespace detail
1188 
1189 
1190 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1191  void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1192  const void* src_ptr, void* dest_ptr) const override
1193 
1194 
1195 #define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1196  virtual bool verify_memory_to(const dest_* other) const override \
1197  { \
1198  return bool_; \
1199  } \
1200  static_assert(true, \
1201  "This assert is used to counter the false positive extra " \
1202  "semi-colon warnings")
1203 
1204 
1212 class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
1213  public std::enable_shared_from_this<OmpExecutor> {
1214  friend class detail::ExecutorBase<OmpExecutor>;
1215 
1216 public:
1220  static std::shared_ptr<OmpExecutor> create()
1221  {
1222  return std::shared_ptr<OmpExecutor>(new OmpExecutor());
1223  }
1224 
1225  std::shared_ptr<Executor> get_master() noexcept override;
1226 
1227  std::shared_ptr<const Executor> get_master() const noexcept override;
1228 
1229  void synchronize() const override;
1230 
1231  int get_num_cores() const
1232  {
1233  return this->get_exec_info().num_computing_units;
1234  }
1235 
1236  int get_num_threads_per_core() const
1237  {
1238  return this->get_exec_info().num_pu_per_cu;
1239  }
1240 
1241 protected:
1242  OmpExecutor()
1243  {
1244  this->OmpExecutor::populate_exec_info(MachineTopology::get_instance());
1245  }
1246 
1247  void populate_exec_info(const MachineTopology* mach_topo) override;
1248 
1249  void* raw_alloc(size_type size) const override;
1250 
1251  void raw_free(void* ptr) const noexcept override;
1252 
1253  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1254 
1255  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
1256 
1257  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1258 
1259  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1260 
1261  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1262 
1263  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1264 };
1265 
1266 
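A short usage sketch: an OmpExecutor is its own master, and the topology queries return the exec_info populated at construction:

    #include <cassert>

    auto omp = gko::OmpExecutor::create();
    assert(omp->get_master() == omp);  // the OmpExecutor is its own master
    int cores = omp->get_num_cores();
    int pus = cores * omp->get_num_threads_per_core();
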
1267 namespace kernels {
1268 namespace omp {
1269 using DefaultExecutor = OmpExecutor;
1270 } // namespace omp
1271 } // namespace kernels
1272 
1273 
1281 class ReferenceExecutor : public OmpExecutor {
1282 public:
1283  static std::shared_ptr<ReferenceExecutor> create()
1284  {
1285  return std::shared_ptr<ReferenceExecutor>(new ReferenceExecutor());
1286  }
1287 
1288  void run(const Operation& op) const override
1289  {
1290  this->template log<log::Logger::operation_launched>(this, &op);
1291  op.run(std::static_pointer_cast<const ReferenceExecutor>(
1292  this->shared_from_this()));
1293  this->template log<log::Logger::operation_completed>(this, &op);
1294  }
1295 
1296 protected:
1297  ReferenceExecutor()
1298  {
1299  this->ReferenceExecutor::populate_exec_info(
1300  MachineTopology::get_instance());
1301  }
1302 
1303  void populate_exec_info(const MachineTopology*) override
1304  {
1305  this->get_exec_info().device_id = -1;
1306  this->get_exec_info().num_computing_units = 1;
1307  this->get_exec_info().num_pu_per_cu = 1;
1308  }
1309 
1310  bool verify_memory_from(const Executor* src_exec) const override
1311  {
1312  return src_exec->verify_memory_to(this);
1313  }
1314 
1315  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1316 
1317  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1318 
1319  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1320 
1321  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1322 
1323  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1324 };
1325 
1326 
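A sketch showing that, although both run on the host, ReferenceExecutor and OmpExecutor are deliberately treated as having separate memory by the verify_memory overrides above:

    auto ref = gko::ReferenceExecutor::create();
    auto omp = gko::OmpExecutor::create();
    bool shared = ref->memory_accessible(omp);  // false, per the tables above
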
1327 namespace kernels {
1328 namespace reference {
1329 using DefaultExecutor = ReferenceExecutor;
1330 } // namespace reference
1331 } // namespace kernels
1332 
1333 
1340 class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1341  public std::enable_shared_from_this<CudaExecutor>,
1342  public detail::EnableDeviceReset {
1343  friend class detail::ExecutorBase<CudaExecutor>;
1344 
1345 public:
1357  static std::shared_ptr<CudaExecutor> create(
1358  int device_id, std::shared_ptr<Executor> master,
1359  bool device_reset = false,
1360  allocation_mode alloc_mode = default_cuda_alloc_mode);
1361 
1362  std::shared_ptr<Executor> get_master() noexcept override;
1363 
1364  std::shared_ptr<const Executor> get_master() const noexcept override;
1365 
1366  void synchronize() const override;
1367 
1368  void run(const Operation& op) const override;
1369 
1373  int get_device_id() const noexcept
1374  {
1375  return this->get_exec_info().device_id;
1376  }
1377 
1381  static int get_num_devices();
1382 
1386  int get_num_warps_per_sm() const noexcept
1387  {
1388  return this->get_exec_info().num_pu_per_cu;
1389  }
1390 
1394  int get_num_multiprocessor() const noexcept
1395  {
1396  return this->get_exec_info().num_computing_units;
1397  }
1398 
1402  int get_num_warps() const noexcept
1403  {
1404  return this->get_exec_info().num_computing_units *
1405  this->get_exec_info().num_pu_per_cu;
1406  }
1407 
1411  int get_warp_size() const noexcept
1412  {
1413  return this->get_exec_info().max_subgroup_size;
1414  }
1415 
1419  int get_major_version() const noexcept
1420  {
1421  return this->get_exec_info().major;
1422  }
1423 
1427  int get_minor_version() const noexcept
1428  {
1429  return this->get_exec_info().minor;
1430  }
1431 
1437  cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }
1438 
1444  cusparseContext* get_cusparse_handle() const
1445  {
1446  return cusparse_handle_.get();
1447  }
1448 
1454  std::vector<int> get_closest_pus() const
1455  {
1456  return this->get_exec_info().closest_pu_ids;
1457  }
1458 
1464  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1465 
1466 protected:
1467  void set_gpu_property();
1468 
1469  void init_handles();
1470 
1471  CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1472  bool device_reset = false,
1473  allocation_mode alloc_mode = default_cuda_alloc_mode)
1474  : EnableDeviceReset{device_reset},
1475  alloc_mode_{alloc_mode},
1476  master_(master)
1477  {
1478  this->get_exec_info().device_id = device_id;
1479  this->get_exec_info().num_computing_units = 0;
1480  this->get_exec_info().num_pu_per_cu = 0;
1481  this->CudaExecutor::populate_exec_info(MachineTopology::get_instance());
1482  if (this->get_exec_info().closest_pu_ids.size()) {
1483  MachineTopology::get_instance()->bind_to_pus(
1484  this->get_closest_pus());
1485  }
1486  // This only queries device attributes, so it is not affected by
1487  // DeviceReset.
1488  this->set_gpu_property();
1489  // Increase the executor count before any operation that may be
1490  // affected by DeviceReset.
1491  increase_num_execs(this->get_exec_info().device_id);
1492  this->init_handles();
1493  }
1494 
1495  void* raw_alloc(size_type size) const override;
1496 
1497  void raw_free(void* ptr) const noexcept override;
1498 
1499  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1500 
1501  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1502 
1503  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1504 
1505  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1506 
1507  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1508 
1509  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1510 
1511  static void increase_num_execs(unsigned device_id);
1512 
1513  static void decrease_num_execs(unsigned device_id);
1514 
1515  static unsigned get_num_execs(unsigned device_id);
1516 
1517  void populate_exec_info(const MachineTopology* mach_topo) override;
1518 
1519 private:
1520  std::shared_ptr<Executor> master_;
1521 
1522  template <typename T>
1523  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1524  handle_manager<cublasContext> cublas_handle_;
1525  handle_manager<cusparseContext> cusparse_handle_;
1526 
1527  allocation_mode alloc_mode_;
1528 };
1529 
1530 
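A creation sketch, assuming at least one CUDA device is visible; the device properties queried below are populated by the constructor via set_gpu_property():

    if (gko::CudaExecutor::get_num_devices() > 0) {
        auto cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
        int sms = cuda->get_num_multiprocessor();
        int warps = cuda->get_num_warps();  // sms * warps per SM
        cuda->synchronize();
    }
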
1531 namespace kernels {
1532 namespace cuda {
1533 using DefaultExecutor = CudaExecutor;
1534 } // namespace cuda
1535 } // namespace kernels
1536 
1537 
1544 class HipExecutor : public detail::ExecutorBase<HipExecutor>,
1545  public std::enable_shared_from_this<HipExecutor>,
1546  public detail::EnableDeviceReset {
1547  friend class detail::ExecutorBase<HipExecutor>;
1548 
1549 public:
1561  static std::shared_ptr<HipExecutor> create(
1562  int device_id, std::shared_ptr<Executor> master,
1563  bool device_reset = false,
1564  allocation_mode alloc_mode = default_hip_alloc_mode);
1565 
1566  std::shared_ptr<Executor> get_master() noexcept override;
1567 
1568  std::shared_ptr<const Executor> get_master() const noexcept override;
1569 
1570  void synchronize() const override;
1571 
1572  void run(const Operation& op) const override;
1573 
1577  int get_device_id() const noexcept
1578  {
1579  return this->get_exec_info().device_id;
1580  }
1581 
1585  static int get_num_devices();
1586 
1590  int get_num_warps_per_sm() const noexcept
1591  {
1592  return this->get_exec_info().num_pu_per_cu;
1593  }
1594 
1598  int get_num_multiprocessor() const noexcept
1599  {
1600  return this->get_exec_info().num_computing_units;
1601  }
1602 
1606  int get_major_version() const noexcept
1607  {
1608  return this->get_exec_info().major;
1609  }
1610 
1614  int get_minor_version() const noexcept
1615  {
1616  return this->get_exec_info().minor;
1617  }
1618 
1622  int get_num_warps() const noexcept
1623  {
1624  return this->get_exec_info().num_computing_units *
1625  this->get_exec_info().num_pu_per_cu;
1626  }
1627 
1631  int get_warp_size() const noexcept
1632  {
1633  return this->get_exec_info().max_subgroup_size;
1634  }
1635 
1641  hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); }
1642 
1648  hipsparseContext* get_hipsparse_handle() const
1649  {
1650  return hipsparse_handle_.get();
1651  }
1652 
1658  int get_closest_numa() const { return this->get_exec_info().numa_node; }
1659 
1665  std::vector<int> get_closest_pus() const
1666  {
1667  return this->get_exec_info().closest_pu_ids;
1668  }
1669 
1670 protected:
1671  void set_gpu_property();
1672 
1673  void init_handles();
1674 
1675  HipExecutor(int device_id, std::shared_ptr<Executor> master,
1676  bool device_reset = false,
1677  allocation_mode alloc_mode = default_hip_alloc_mode)
1678  : EnableDeviceReset{device_reset},
1679  alloc_mode_(alloc_mode),
1680  master_(master)
1681  {
1682  this->get_exec_info().device_id = device_id;
1683  this->get_exec_info().num_computing_units = 0;
1684  this->get_exec_info().num_pu_per_cu = 0;
1685  this->HipExecutor::populate_exec_info(MachineTopology::get_instance());
1686  if (this->get_exec_info().closest_pu_ids.size()) {
1687  MachineTopology::get_instance()->bind_to_pus(
1688  this->get_closest_pus());
1689  }
1690  // This only queries device attributes, so it is not affected by
1691  // DeviceReset.
1692  this->set_gpu_property();
1693  // Increase the executor count before any operation that may be
1694  // affected by DeviceReset.
1695  increase_num_execs(this->get_exec_info().device_id);
1696  this->init_handles();
1697  }
1698 
1699  void* raw_alloc(size_type size) const override;
1700 
1701  void raw_free(void* ptr) const noexcept override;
1702 
1703  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1704 
1705  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1706 
1707  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1708 
1709  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1710 
1711  bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1712 
1713  bool verify_memory_to(const HipExecutor* dest_exec) const override;
1714 
1715  static void increase_num_execs(int device_id);
1716 
1717  static void decrease_num_execs(int device_id);
1718 
1719  static int get_num_execs(int device_id);
1720 
1721  void populate_exec_info(const MachineTopology* mach_topo) override;
1722 
1723 private:
1724  std::shared_ptr<Executor> master_;
1725 
1726  template <typename T>
1727  using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1728  handle_manager<hipblasContext> hipblas_handle_;
1729  handle_manager<hipsparseContext> hipsparse_handle_;
1730 
1731  allocation_mode alloc_mode_;
1732 };
1733 
1734 
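HIP executor creation mirrors the CUDA interface; a sketch with the optional arguments spelled out:

    if (gko::HipExecutor::get_num_devices() > 0) {
        auto hip = gko::HipExecutor::create(0, gko::OmpExecutor::create(),
                                            /*device_reset=*/false,
                                            gko::default_hip_alloc_mode);
    }
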
1735 namespace kernels {
1736 namespace hip {
1737 using DefaultExecutor = HipExecutor;
1738 } // namespace hip
1739 } // namespace kernels
1740 
1741 
1748 class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
1749  public std::enable_shared_from_this<DpcppExecutor> {
1750  friend class detail::ExecutorBase<DpcppExecutor>;
1751 
1752 public:
1762  static std::shared_ptr<DpcppExecutor> create(
1763  int device_id, std::shared_ptr<Executor> master,
1764  std::string device_type = "all");
1765 
1766  std::shared_ptr<Executor> get_master() noexcept override;
1767 
1768  std::shared_ptr<const Executor> get_master() const noexcept override;
1769 
1770  void synchronize() const override;
1771 
1772  void run(const Operation& op) const override;
1773 
1779  int get_device_id() const noexcept
1780  {
1781  return this->get_exec_info().device_id;
1782  }
1783 
1784  ::cl::sycl::queue* get_queue() const { return queue_.get(); }
1785 
1793  static int get_num_devices(std::string device_type);
1794 
1800  const std::vector<int>& get_subgroup_sizes() const noexcept
1801  {
1802  return this->get_exec_info().subgroup_sizes;
1803  }
1804 
1810  int get_num_computing_units() const noexcept
1811  {
1812  return this->get_exec_info().num_computing_units;
1813  }
1814 
1820  const std::vector<int>& get_max_workitem_sizes() const noexcept
1821  {
1822  return this->get_exec_info().max_workitem_sizes;
1823  }
1824 
1830  int get_max_workgroup_size() const noexcept
1831  {
1832  return this->get_exec_info().max_workgroup_size;
1833  }
1834 
1840  int get_max_subgroup_size() const noexcept
1841  {
1842  return this->get_exec_info().max_subgroup_size;
1843  }
1844 
1850  std::string get_device_type() const noexcept
1851  {
1852  return this->get_exec_info().device_type;
1853  }
1854 
1855 protected:
1856  void set_device_property();
1857 
1858  DpcppExecutor(int device_id, std::shared_ptr<Executor> master,
1859  std::string device_type = "all")
1860  : master_(master)
1861  {
1862  std::for_each(device_type.begin(), device_type.end(),
1863  [](char& c) { c = std::tolower(c); });
1864  this->get_exec_info().device_type = std::string(device_type);
1865  this->get_exec_info().device_id = device_id;
1866  this->set_device_property();
1867  }
1868 
1869  void populate_exec_info(const MachineTopology* mach_topo) override;
1870 
1871  void* raw_alloc(size_type size) const override;
1872 
1873  void raw_free(void* ptr) const noexcept override;
1874 
1875  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1876 
1877  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1878 
1879  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1880 
1881  GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1882 
1883  bool verify_memory_to(const OmpExecutor* dest_exec) const override;
1884 
1885  bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1886 
1887 private:
1888  std::shared_ptr<Executor> master_;
1889 
1890  template <typename T>
1891  using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
1892  queue_manager<::cl::sycl::queue> queue_;
1893 };
1894 
1895 
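A DPC++ creation sketch; the device_type filter defaults to "all", and "gpu" below is an assumed value:

    if (gko::DpcppExecutor::get_num_devices("gpu") > 0) {
        auto dpcpp =
            gko::DpcppExecutor::create(0, gko::OmpExecutor::create(), "gpu");
        int max_wg = dpcpp->get_max_workgroup_size();
    }
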
1896 namespace kernels {
1897 namespace dpcpp {
1898 using DefaultExecutor = DpcppExecutor;
1899 } // namespace dpcpp
1900 } // namespace kernels
1901 
1902 
1903 #undef GKO_OVERRIDE_RAW_COPY_TO
1904 
1905 
1906 } // namespace gko
1907 
1908 
1909 #endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
gko::allocation_mode
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition: executor.hpp:70
gko::CudaExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1386
gko::executor_deleter
This is a deleter that uses an executor's free method to deallocate the data.
Definition: executor.hpp:1056
gko::HipExecutor::get_num_warps_per_sm
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition: executor.hpp:1590
gko::executor_deleter::executor_deleter
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition: executor.hpp:1065
gko::Executor::synchronize
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor::get_max_workitem_sizes
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition: executor.hpp:1820
gko::Executor::free
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition: executor.hpp:651
gko::MachineTopology::get_instance
static MachineTopology * get_instance()
Returns an instance of the MachineTopology object.
Definition: machine_topology.hpp:211
gko::HipExecutor::get_hipblas_handle
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition: executor.hpp:1641
gko::Executor::memory_accessible
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition: executor.hpp:760
gko::DpcppExecutor::get_device_type
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition: executor.hpp:1850
gko::HipExecutor::get_num_devices
static int get_num_devices()
Get the number of devices present on the system.
gko::HipExecutor::run
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
gko::HipExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1658
gko::DpcppExecutor::get_num_devices
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
gko::DpcppExecutor::get_num_computing_units
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition: executor.hpp:1810
gko::OmpExecutor::create
static std::shared_ptr< OmpExecutor > create()
Creates a new OmpExecutor.
Definition: executor.hpp:1220
gko::size_type
std::size_t size_type
Integral type used for allocation quantities.
Definition: types.hpp:105
gko::Executor::get_master
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
gko::Executor::run
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
gko::HipExecutor
This is the Executor subclass which represents the HIP enhanced device.
Definition: executor.hpp:1544
gko::CudaExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1427
gko::Executor::copy_from
void copy_from(const Executor *src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition: executor.hpp:673
gko::DpcppExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::DpcppExecutor::run
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
gko::CudaExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessors of this executor.
Definition: executor.hpp:1394
gko::ReferenceExecutor
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition: executor.hpp:1281
gko::CudaExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1411
gko::NotSupported
NotSupported is thrown in case it is not possible to perform the requested operation on the given obj...
Definition: exception.hpp:156
gko::HipExecutor::get_device_id
int get_device_id() const noexcept
Get the HIP device id of the device associated to this executor.
Definition: executor.hpp:1577
gko::HipExecutor::get_hipsparse_handle
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition: executor.hpp:1648
gko::CudaExecutor
This is the Executor subclass which represents the CUDA device.
Definition: executor.hpp:1340
gko
The Ginkgo namespace.
Definition: abstract_factory.hpp:45
gko::CudaExecutor::get_cublas_handle
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition: executor.hpp:1437
gko::HipExecutor::get_master
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
gko::executor_deleter::operator()
void operator()(pointer ptr) const
Deletes the object.
Definition: executor.hpp:1074
gko::HipExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1665
gko::DpcppExecutor::get_max_subgroup_size
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition: executor.hpp:1840
gko::log::EnableLogging
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition: logger.hpp:587
gko::Operation::get_name
virtual const char * get_name() const noexcept
Returns the operation's name.
gko::DpcppExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::DpcppExecutor
This is the Executor subclass which represents a DPC++ enhanced device.
Definition: executor.hpp:1748
gko::DpcppExecutor::create
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all")
Creates a new DpcppExecutor.
gko::Executor::copy_val_to_host
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition: executor.hpp:730
gko::OmpExecutor
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition: executor.hpp:1212
gko::Executor::run
void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition: executor.hpp:614
gko::CudaExecutor::get_closest_pus
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition: executor.hpp:1454
gko::CudaExecutor::get_cusparse_handle
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition: executor.hpp:1444
gko::Executor::alloc
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition: executor.hpp:634
gko::ReferenceExecutor::run
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition: executor.hpp:1288
gko::Executor::copy
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition: executor.hpp:715
gko::HipExecutor::get_num_multiprocessor
int get_num_multiprocessor() const noexcept
Get the number of multiprocessors of this executor.
Definition: executor.hpp:1598
gko::DpcppExecutor::get_device_id
int get_device_id() const noexcept
Get the DPCPP device id of the device associated to this executor.
Definition: executor.hpp:1779
gko::DpcppExecutor::get_max_workgroup_size
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition: executor.hpp:1830
gko::CudaExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1419
gko::CudaExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1402
gko::CudaExecutor::get_closest_numa
int get_closest_numa() const
Get the closest NUMA node.
Definition: executor.hpp:1464
gko::Executor
The first step in using the Ginkgo library consists of creating an executor.
Definition: executor.hpp:575
gko::HipExecutor::get_num_warps
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:1622
gko::HipExecutor::get_major_version
int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:1606
gko::HipExecutor::create
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset=false, allocation_mode alloc_mode=default_hip_alloc_mode)
Creates a new HipExecutor.
gko::HipExecutor::synchronize
void synchronize() const override
Synchronize the operations launched on the executor with its master.
gko::Operation
Operations can be used to define functionalities whose implementations differ among devices.
Definition: executor.hpp:259
gko::DpcppExecutor::get_subgroup_sizes
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition: executor.hpp:1800
gko::MachineTopology::bind_to_pus
void bind_to_pus(const std::vector< int > &ids, const bool singlify=true) const
Bind the calling process to PUs associated with the ids.
Definition: machine_topology.hpp:257
gko::HipExecutor::get_warp_size
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition: executor.hpp:1631
gko::HipExecutor::get_minor_version
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:1614
gko::CudaExecutor::get_device_id
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition: executor.hpp:1373