33 #ifndef GKO_CORE_EXECUTOR_HPP_ 34 #define GKO_CORE_EXECUTOR_HPP_ 41 #include <type_traits> 44 #include <ginkgo/core/base/types.hpp> 45 #include <ginkgo/core/log/logger.hpp> 46 #include <ginkgo/core/synthesizer/containers.hpp> 51 struct cusparseContext;
57 #define GKO_FORWARD_DECLARE(_type, ...) class _type 59 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
61 #undef GKO_FORWARD_DECLARE 64 class ReferenceExecutor;
175 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \ 176 virtual void run(std::shared_ptr<const _type>) const 178 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
180 #undef GKO_DECLARE_RUN_OVERLOAD 183 virtual void run(std::shared_ptr<const ReferenceExecutor> executor)
const;
190 virtual const char *get_name()
const noexcept;
193 #define GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(_type, _namespace, _kernel) \ 195 void run(std::shared_ptr<const ::gko::_type> exec) const override \ 197 this->call(counts{}, exec); \ 201 template <int... Ns> \ 202 void call(::gko::syn::value_list<int, Ns...>, \ 203 std::shared_ptr<const ::gko::_type> exec) const \ 205 ::gko::kernels::_namespace::_kernel( \ 206 exec, std::forward<Args>(std::get<Ns>(data))...); \ 208 static_assert(true, \ 209 "This assert is used to counter the false positive extra " \ 210 "semi-colon warnings") 212 #define GKO_DETAIL_DEFINE_RUN_OVERLOAD(_type, _namespace, _kernel, ...) \ 214 void run(std::shared_ptr<const ::gko::_type> exec) const override \ 216 this->call(counts{}, exec); \ 220 template <int... Ns> \ 221 void call(::gko::syn::value_list<int, Ns...>, \ 222 std::shared_ptr<const ::gko::_type> exec) const \ 224 ::gko::kernels::_namespace::_kernel( \ 225 exec, std::forward<Args>(std::get<Ns>(data))...); \ 227 static_assert(true, \ 228 "This assert is used to counter the false positive extra " \ 229 "semi-colon warnings") 288 #define GKO_REGISTER_OPERATION(_name, _kernel) \ 289 template <typename... Args> \ 290 class _name##_operation : public Operation { \ 292 ::gko::syn::as_list<::gko::syn::range<0, sizeof...(Args)>>; \ 295 _name##_operation(Args &&... args) : data(std::forward<Args>(args)...) \ 298 const char *get_name() const noexcept override \ 300 static auto name = [this] { \ 301 std::ostringstream oss; \ 302 oss << #_kernel << '#' << sizeof...(Args); \ 305 return name.c_str(); \ 308 GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(OmpExecutor, omp, _kernel); \ 309 GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(CudaExecutor, cuda, _kernel); \ 310 GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(ReferenceExecutor, reference, \ 314 mutable std::tuple<Args &&...> data; \ 317 template <typename... Args> \ 318 static _name##_operation<Args...> make_##_name(Args &&... 
args) \ 320 return _name##_operation<Args...>(std::forward<Args>(args)...); \ 322 static_assert(true, \ 323 "This assert is used to counter the false positive extra " \ 324 "semi-colon warnings") 411 template <
typename T>
412 friend class detail::ExecutorBase;
428 virtual void run(
const Operation &op)
const = 0;
440 template <
typename ClosureOmp,
typename ClosureCuda>
441 void run(
const ClosureOmp &op_omp,
const ClosureCuda &op_cuda)
const 443 LambdaOperation<ClosureOmp, ClosureCuda> op(op_omp, op_cuda);
458 template <
typename T>
461 this->
template log<log::Logger::allocation_started>(
462 this, num_elems *
sizeof(T));
463 T *allocated =
static_cast<T *
>(this->raw_alloc(num_elems *
sizeof(T)));
464 this->
template log<log::Logger::allocation_completed>(
465 this, num_elems *
sizeof(T), reinterpret_cast<uintptr>(allocated));
476 void free(
void *ptr)
const noexcept
478 this->
template log<log::Logger::free_started>(
479 this,
reinterpret_cast<uintptr
>(ptr));
481 this->
template log<log::Logger::free_completed>(
482 this,
reinterpret_cast<uintptr
>(ptr));
497 template <
typename T>
499 const T *src_ptr, T *dest_ptr)
const 501 this->
template log<log::Logger::copy_started>(
502 src_exec,
this,
reinterpret_cast<uintptr
>(src_ptr),
503 reinterpret_cast<uintptr>(dest_ptr), num_elems *
sizeof(T));
504 this->raw_copy_from(src_exec, num_elems *
sizeof(T), src_ptr, dest_ptr);
505 this->
template log<log::Logger::copy_completed>(
506 src_exec,
this,
reinterpret_cast<uintptr
>(src_ptr),
507 reinterpret_cast<uintptr>(dest_ptr), num_elems *
sizeof(T));
514 virtual std::shared_ptr<Executor> get_master() noexcept = 0;
519 virtual std::shared_ptr<const Executor> get_master()
const noexcept = 0;
524 virtual void synchronize()
const = 0;
536 virtual void *raw_alloc(
size_type size)
const = 0;
545 virtual void raw_free(
void *ptr)
const noexcept = 0;
558 const void *src_ptr,
void *dest_ptr)
const = 0;
569 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \ 570 virtual void raw_copy_to(const _exec_type *dest_exec, size_type n_bytes, \ 571 const void *src_ptr, void *dest_ptr) const = 0 573 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
575 #undef GKO_ENABLE_RAW_COPY_TO 588 template <
typename ClosureOmp,
typename ClosureCuda>
589 class LambdaOperation :
public Operation {
598 LambdaOperation(
const ClosureOmp &op_omp,
const ClosureCuda &op_cuda)
599 : op_omp_(op_omp), op_cuda_(op_cuda)
602 void run(std::shared_ptr<const OmpExecutor>)
const override 607 void run(std::shared_ptr<const CudaExecutor>)
const override 614 ClosureCuda op_cuda_;
627 template <
typename T>
654 std::shared_ptr<const Executor> exec_;
658 template <
typename T>
667 void operator()(pointer ptr)
const 675 std::shared_ptr<const Executor> exec_;
682 template <
typename ConcreteExecutor>
683 class ExecutorBase :
public Executor {
685 void run(
const Operation &op)
const override 687 this->
template log<log::Logger::operation_launched>(
this, &op);
688 op.run(
self()->shared_from_this());
689 this->
template log<log::Logger::operation_completed>(
this, &op);
694 const void *src_ptr,
void *dest_ptr)
const override 696 src_exec->raw_copy_to(
self(), n_bytes, src_ptr, dest_ptr);
700 ConcreteExecutor *
self() noexcept
702 return static_cast<ConcreteExecutor *
>(
this);
705 const ConcreteExecutor *
self()
const noexcept
707 return static_cast<const ConcreteExecutor *
>(
this);
715 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \ 716 void raw_copy_to(const _executor_type *dest_exec, size_type n_bytes, \ 717 const void *src_ptr, void *dest_ptr) const override 728 public std::enable_shared_from_this<OmpExecutor> {
729 friend class detail::ExecutorBase<OmpExecutor>;
735 static std::shared_ptr<OmpExecutor>
create()
737 return std::shared_ptr<OmpExecutor>(
new OmpExecutor());
740 std::shared_ptr<Executor> get_master() noexcept
override;
742 std::shared_ptr<const Executor> get_master()
const noexcept
override;
744 void synchronize()
const override;
749 void *raw_alloc(
size_type size)
const override;
751 void raw_free(
void *ptr)
const noexcept
override;
753 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
773 static std::shared_ptr<ReferenceExecutor> create()
780 this->
template log<log::Logger::operation_launched>(
this, &op);
781 op.run(std::static_pointer_cast<const ReferenceExecutor>(
782 this->shared_from_this()));
783 this->
template log<log::Logger::operation_completed>(
this, &op);
792 namespace reference {
805 public std::enable_shared_from_this<CudaExecutor> {
806 friend class ExecutorBase<CudaExecutor>;
816 static std::shared_ptr<CudaExecutor> create(
817 int device_id, std::shared_ptr<Executor> master);
819 ~
CudaExecutor() { decrease_num_execs(this->device_id_); }
821 std::shared_ptr<Executor> get_master() noexcept
override;
823 std::shared_ptr<const Executor> get_master()
const noexcept
override;
825 void synchronize()
const override;
827 void run(
const Operation &op)
const override;
837 static int get_num_devices();
854 constexpr
uint32 warp_size = 32;
855 auto warps_per_sm = num_cores_per_sm_ / warp_size;
856 return num_multiprocessor_ * warps_per_sm;
883 return cusparse_handle_.get();
887 void set_gpu_property();
891 CudaExecutor(
int device_id, std::shared_ptr<Executor> master)
892 : device_id_(device_id),
894 num_cores_per_sm_(0),
895 num_multiprocessor_(0),
899 assert(device_id < max_devices);
900 this->set_gpu_property();
901 this->init_handles();
902 increase_num_execs(device_id);
905 void *raw_alloc(
size_type size)
const override;
907 void raw_free(
void *ptr)
const noexcept
override;
909 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
911 static void increase_num_execs(
int device_id)
913 std::lock_guard<std::mutex> guard(mutex[device_id]);
914 num_execs[device_id]++;
917 static void decrease_num_execs(
int device_id)
919 std::lock_guard<std::mutex> guard(mutex[device_id]);
920 num_execs[device_id]--;
923 static int get_num_execs(
int device_id)
925 std::lock_guard<std::mutex> guard(mutex[device_id]);
926 return num_execs[device_id];
931 std::shared_ptr<Executor> master_;
932 int num_cores_per_sm_;
933 int num_multiprocessor_;
937 template <
typename T>
938 using handle_manager = std::unique_ptr<T, std::function<void(T *)>>;
939 handle_manager<cublasContext> cublas_handle_;
940 handle_manager<cusparseContext> cusparse_handle_;
942 static constexpr
int max_devices = 64;
943 static int num_execs[max_devices];
944 static std::mutex mutex[max_devices];
955 #undef GKO_OVERRIDE_RAW_COPY_TO 961 #endif // GKO_CORE_EXECUTOR_HPP_ int get_major_version() const noexcept
Get the major version of compute capability.
Definition: executor.hpp:862
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition: executor.hpp:832
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition: executor.hpp:637
Definition: executor.hpp:659
std::uint32_t uint32
32-bit unsigned integral type.
Definition: types.hpp:134
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition: executor.hpp:778
void copy_from(const Executor *src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition: executor.hpp:498
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition: executor.hpp:459
This is a deleter that uses an executor's free method to deallocate the data.
Definition: executor.hpp:628
std::size_t size_type
Integral type used for allocation quantities.
Definition: types.hpp:94
int get_num_cores_per_sm() const noexcept
Get the number of cores per SM of this executor.
Definition: executor.hpp:842
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:847
The Ginkgo namespace.
Definition: abstract_factory.hpp:45
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition: executor.hpp:771
void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda) const
Runs one of the passed in functors, depending on the Executor type.
Definition: executor.hpp:441
static std::shared_ptr< OmpExecutor > create()
Creates a new OmpExecutor.
Definition: executor.hpp:735
EnableLogging is a mixin which should be inherited by any class which wants to enable logging...
Definition: logger.hpp:521
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition: executor.hpp:874
This is the Executor subclass which represents the CUDA device.
Definition: executor.hpp:804
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition: executor.hpp:476
Operations can be used to define functionalities whose implementations differ among devices...
Definition: executor.hpp:173
This is the Executor subclass which represents the OpenMP device (typically CPU). ...
Definition: executor.hpp:727
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition: executor.hpp:881
void operator()(pointer ptr) const
Deletes the object.
Definition: executor.hpp:646
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:852
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition: executor.hpp:867
The first step in using the Ginkgo library consists of creating an executor.
Definition: executor.hpp:410