Ginkgo — generated from tag v1.0.0 (branch based on master), Ginkgo version 1.0.0.
A numerical linear algebra library targeting many-core architectures.
executor.hpp
1 /*******************************<GINKGO LICENSE>******************************
2 Copyright (c) 2017-2019, the Ginkgo authors
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8 
9 1. Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11 
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 
16 3. Neither the name of the copyright holder nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ******************************<GINKGO LICENSE>*******************************/
32 
33 #ifndef GKO_CORE_EXECUTOR_HPP_
34 #define GKO_CORE_EXECUTOR_HPP_
35 
36 
#include <cassert>
#include <functional>
#include <memory>
#include <mutex>
#include <sstream>
#include <tuple>
#include <type_traits>
42 
43 
44 #include <ginkgo/core/base/types.hpp>
45 #include <ginkgo/core/log/logger.hpp>
46 #include <ginkgo/core/synthesizer/containers.hpp>
47 
48 
49 struct cublasContext;
50 
51 struct cusparseContext;
52 
53 
54 namespace gko {
55 
56 
57 #define GKO_FORWARD_DECLARE(_type, ...) class _type
58 
59 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
60 
61 #undef GKO_FORWARD_DECLARE
62 
63 
64 class ReferenceExecutor;
65 
66 
67 namespace detail {
68 
69 
70 template <typename>
71 class ExecutorBase;
72 
73 
74 } // namespace detail
75 
76 
173 class Operation {
174 public:
175 #define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
176  virtual void run(std::shared_ptr<const _type>) const
177 
178  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
179 
180 #undef GKO_DECLARE_RUN_OVERLOAD
181 
182  // ReferenceExecutor overload can be defaulted to OmpExecutor's
183  virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
184 
190  virtual const char *get_name() const noexcept;
191 };
192 
193 #define GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(_type, _namespace, _kernel) \
194 public: \
195  void run(std::shared_ptr<const ::gko::_type> exec) const override \
196  { \
197  this->call(counts{}, exec); \
198  } \
199  \
200 private: \
201  template <int... Ns> \
202  void call(::gko::syn::value_list<int, Ns...>, \
203  std::shared_ptr<const ::gko::_type> exec) const \
204  { \
205  ::gko::kernels::_namespace::_kernel( \
206  exec, std::forward<Args>(std::get<Ns>(data))...); \
207  } \
208  static_assert(true, \
209  "This assert is used to counter the false positive extra " \
210  "semi-colon warnings")
211 
212 #define GKO_DETAIL_DEFINE_RUN_OVERLOAD(_type, _namespace, _kernel, ...) \
213 public: \
214  void run(std::shared_ptr<const ::gko::_type> exec) const override \
215  { \
216  this->call(counts{}, exec); \
217  } \
218  \
219 private: \
220  template <int... Ns> \
221  void call(::gko::syn::value_list<int, Ns...>, \
222  std::shared_ptr<const ::gko::_type> exec) const \
223  { \
224  ::gko::kernels::_namespace::_kernel( \
225  exec, std::forward<Args>(std::get<Ns>(data))...); \
226  } \
227  static_assert(true, \
228  "This assert is used to counter the false positive extra " \
229  "semi-colon warnings")
230 
231 
288 #define GKO_REGISTER_OPERATION(_name, _kernel) \
289  template <typename... Args> \
290  class _name##_operation : public Operation { \
291  using counts = \
292  ::gko::syn::as_list<::gko::syn::range<0, sizeof...(Args)>>; \
293  \
294  public: \
295  _name##_operation(Args &&... args) : data(std::forward<Args>(args)...) \
296  {} \
297  \
298  const char *get_name() const noexcept override \
299  { \
300  static auto name = [this] { \
301  std::ostringstream oss; \
302  oss << #_kernel << '#' << sizeof...(Args); \
303  return oss.str(); \
304  }(); \
305  return name.c_str(); \
306  } \
307  \
308  GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(OmpExecutor, omp, _kernel); \
309  GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(CudaExecutor, cuda, _kernel); \
310  GKO_KERNEL_DETAIL_DEFINE_RUN_OVERLOAD(ReferenceExecutor, reference, \
311  _kernel); \
312  \
313  private: \
314  mutable std::tuple<Args &&...> data; \
315  }; \
316  \
317  template <typename... Args> \
318  static _name##_operation<Args...> make_##_name(Args &&... args) \
319  { \
320  return _name##_operation<Args...>(std::forward<Args>(args)...); \
321  } \
322  static_assert(true, \
323  "This assert is used to counter the false positive extra " \
324  "semi-colon warnings")
325 
326 
410 class Executor : public log::EnableLogging<Executor> {
411  template <typename T>
412  friend class detail::ExecutorBase;
413 
414 public:
415  virtual ~Executor() = default;
416 
417  Executor() = default;
418  Executor(Executor &) = delete;
419  Executor(Executor &&) = default;
420  Executor &operator=(Executor &) = delete;
421  Executor &operator=(Executor &&) = default;
422 
428  virtual void run(const Operation &op) const = 0;
429 
440  template <typename ClosureOmp, typename ClosureCuda>
441  void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda) const
442  {
443  LambdaOperation<ClosureOmp, ClosureCuda> op(op_omp, op_cuda);
444  this->run(op);
445  }
446 
458  template <typename T>
459  T *alloc(size_type num_elems) const
460  {
461  this->template log<log::Logger::allocation_started>(
462  this, num_elems * sizeof(T));
463  T *allocated = static_cast<T *>(this->raw_alloc(num_elems * sizeof(T)));
464  this->template log<log::Logger::allocation_completed>(
465  this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
466  return allocated;
467  }
468 
476  void free(void *ptr) const noexcept
477  {
478  this->template log<log::Logger::free_started>(
479  this, reinterpret_cast<uintptr>(ptr));
480  this->raw_free(ptr);
481  this->template log<log::Logger::free_completed>(
482  this, reinterpret_cast<uintptr>(ptr));
483  }
484 
497  template <typename T>
498  void copy_from(const Executor *src_exec, size_type num_elems,
499  const T *src_ptr, T *dest_ptr) const
500  {
501  this->template log<log::Logger::copy_started>(
502  src_exec, this, reinterpret_cast<uintptr>(src_ptr),
503  reinterpret_cast<uintptr>(dest_ptr), num_elems * sizeof(T));
504  this->raw_copy_from(src_exec, num_elems * sizeof(T), src_ptr, dest_ptr);
505  this->template log<log::Logger::copy_completed>(
506  src_exec, this, reinterpret_cast<uintptr>(src_ptr),
507  reinterpret_cast<uintptr>(dest_ptr), num_elems * sizeof(T));
508  }
509 
514  virtual std::shared_ptr<Executor> get_master() noexcept = 0;
515 
519  virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
520 
524  virtual void synchronize() const = 0;
525 
526 protected:
536  virtual void *raw_alloc(size_type size) const = 0;
537 
545  virtual void raw_free(void *ptr) const noexcept = 0;
546 
557  virtual void raw_copy_from(const Executor *src_exec, size_type n_bytes,
558  const void *src_ptr, void *dest_ptr) const = 0;
559 
569 #define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
570  virtual void raw_copy_to(const _exec_type *dest_exec, size_type n_bytes, \
571  const void *src_ptr, void *dest_ptr) const = 0
572 
573  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
574 
575 #undef GKO_ENABLE_RAW_COPY_TO
576 
577 private:
588  template <typename ClosureOmp, typename ClosureCuda>
589  class LambdaOperation : public Operation {
590  public:
598  LambdaOperation(const ClosureOmp &op_omp, const ClosureCuda &op_cuda)
599  : op_omp_(op_omp), op_cuda_(op_cuda)
600  {}
601 
602  void run(std::shared_ptr<const OmpExecutor>) const override
603  {
604  op_omp_();
605  }
606 
607  void run(std::shared_ptr<const CudaExecutor>) const override
608  {
609  op_cuda_();
610  }
611 
612  private:
613  ClosureOmp op_omp_;
614  ClosureCuda op_cuda_;
615  };
616 };
617 
618 
627 template <typename T>
629 public:
630  using pointer = T *;
631 
637  explicit executor_deleter(std::shared_ptr<const Executor> exec)
638  : exec_{exec}
639  {}
640 
646  void operator()(pointer ptr) const
647  {
648  if (exec_) {
649  exec_->free(ptr);
650  }
651  }
652 
653 private:
654  std::shared_ptr<const Executor> exec_;
655 };
656 
657 // a specialization for arrays
658 template <typename T>
660 public:
661  using pointer = T[];
662 
663  explicit executor_deleter(std::shared_ptr<const Executor> exec)
664  : exec_{exec}
665  {}
666 
667  void operator()(pointer ptr) const
668  {
669  if (exec_) {
670  exec_->free(ptr);
671  }
672  }
673 
674 private:
675  std::shared_ptr<const Executor> exec_;
676 };
677 
678 
679 namespace detail {
680 
681 
682 template <typename ConcreteExecutor>
683 class ExecutorBase : public Executor {
684 public:
685  void run(const Operation &op) const override
686  {
687  this->template log<log::Logger::operation_launched>(this, &op);
688  op.run(self()->shared_from_this());
689  this->template log<log::Logger::operation_completed>(this, &op);
690  }
691 
692 protected:
693  void raw_copy_from(const Executor *src_exec, size_type n_bytes,
694  const void *src_ptr, void *dest_ptr) const override
695  {
696  src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
697  }
698 
699 private:
700  ConcreteExecutor *self() noexcept
701  {
702  return static_cast<ConcreteExecutor *>(this);
703  }
704 
705  const ConcreteExecutor *self() const noexcept
706  {
707  return static_cast<const ConcreteExecutor *>(this);
708  }
709 };
710 
711 
712 } // namespace detail
713 
714 
715 #define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
716  void raw_copy_to(const _executor_type *dest_exec, size_type n_bytes, \
717  const void *src_ptr, void *dest_ptr) const override
718 
719 
727 class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
728  public std::enable_shared_from_this<OmpExecutor> {
729  friend class detail::ExecutorBase<OmpExecutor>;
730 
731 public:
735  static std::shared_ptr<OmpExecutor> create()
736  {
737  return std::shared_ptr<OmpExecutor>(new OmpExecutor());
738  }
739 
740  std::shared_ptr<Executor> get_master() noexcept override;
741 
742  std::shared_ptr<const Executor> get_master() const noexcept override;
743 
744  void synchronize() const override;
745 
746 protected:
747  OmpExecutor() = default;
748 
749  void *raw_alloc(size_type size) const override;
750 
751  void raw_free(void *ptr) const noexcept override;
752 
753  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
754 };
755 
756 
757 namespace kernels {
758 namespace omp {
759 using DefaultExecutor = OmpExecutor;
760 } // namespace omp
761 } // namespace kernels
762 
763 
772 public:
773  static std::shared_ptr<ReferenceExecutor> create()
774  {
775  return std::shared_ptr<ReferenceExecutor>(new ReferenceExecutor());
776  }
777 
778  void run(const Operation &op) const override
779  {
780  this->template log<log::Logger::operation_launched>(this, &op);
781  op.run(std::static_pointer_cast<const ReferenceExecutor>(
782  this->shared_from_this()));
783  this->template log<log::Logger::operation_completed>(this, &op);
784  }
785 
786 protected:
787  ReferenceExecutor() = default;
788 };
789 
790 
791 namespace kernels {
792 namespace reference {
793 using DefaultExecutor = ReferenceExecutor;
794 } // namespace reference
795 } // namespace kernels
796 
797 
804 class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
805  public std::enable_shared_from_this<CudaExecutor> {
806  friend class ExecutorBase<CudaExecutor>;
807 
808 public:
816  static std::shared_ptr<CudaExecutor> create(
817  int device_id, std::shared_ptr<Executor> master);
818 
819  ~CudaExecutor() { decrease_num_execs(this->device_id_); }
820 
821  std::shared_ptr<Executor> get_master() noexcept override;
822 
823  std::shared_ptr<const Executor> get_master() const noexcept override;
824 
825  void synchronize() const override;
826 
827  void run(const Operation &op) const override;
828 
832  int get_device_id() const noexcept { return device_id_; }
833 
837  static int get_num_devices();
838 
842  int get_num_cores_per_sm() const noexcept { return num_cores_per_sm_; }
843 
847  int get_num_multiprocessor() const noexcept { return num_multiprocessor_; }
848 
852  int get_num_warps() const noexcept
853  {
854  constexpr uint32 warp_size = 32;
855  auto warps_per_sm = num_cores_per_sm_ / warp_size;
856  return num_multiprocessor_ * warps_per_sm;
857  }
858 
862  int get_major_version() const noexcept { return major_; }
863 
867  int get_minor_version() const noexcept { return minor_; }
868 
874  cublasContext *get_cublas_handle() const { return cublas_handle_.get(); }
875 
881  cusparseContext *get_cusparse_handle() const
882  {
883  return cusparse_handle_.get();
884  }
885 
886 protected:
887  void set_gpu_property();
888 
889  void init_handles();
890 
891  CudaExecutor(int device_id, std::shared_ptr<Executor> master)
892  : device_id_(device_id),
893  master_(master),
894  num_cores_per_sm_(0),
895  num_multiprocessor_(0),
896  major_(0),
897  minor_(0)
898  {
899  assert(device_id < max_devices);
900  this->set_gpu_property();
901  this->init_handles();
902  increase_num_execs(device_id);
903  }
904 
905  void *raw_alloc(size_type size) const override;
906 
907  void raw_free(void *ptr) const noexcept override;
908 
909  GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
910 
911  static void increase_num_execs(int device_id)
912  {
913  std::lock_guard<std::mutex> guard(mutex[device_id]);
914  num_execs[device_id]++;
915  }
916 
917  static void decrease_num_execs(int device_id)
918  {
919  std::lock_guard<std::mutex> guard(mutex[device_id]);
920  num_execs[device_id]--;
921  }
922 
923  static int get_num_execs(int device_id)
924  {
925  std::lock_guard<std::mutex> guard(mutex[device_id]);
926  return num_execs[device_id];
927  }
928 
929 private:
930  int device_id_;
931  std::shared_ptr<Executor> master_;
932  int num_cores_per_sm_;
933  int num_multiprocessor_;
934  int major_;
935  int minor_;
936 
937  template <typename T>
938  using handle_manager = std::unique_ptr<T, std::function<void(T *)>>;
939  handle_manager<cublasContext> cublas_handle_;
940  handle_manager<cusparseContext> cusparse_handle_;
941 
942  static constexpr int max_devices = 64;
943  static int num_execs[max_devices];
944  static std::mutex mutex[max_devices];
945 };
946 
947 
948 namespace kernels {
949 namespace cuda {
950 using DefaultExecutor = CudaExecutor;
951 } // namespace cuda
952 } // namespace kernels
953 
954 
955 #undef GKO_OVERRIDE_RAW_COPY_TO
956 
957 
958 } // namespace gko
959 
960 
961 #endif // GKO_CORE_EXECUTOR_HPP_
int get_major_version() const noexcept
Get the major version of the compute capability.
Definition: executor.hpp:862
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition: executor.hpp:832
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition: executor.hpp:637
Definition: executor.hpp:659
std::uint32_t uint32
32-bit unsigned integral type.
Definition: types.hpp:134
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition: executor.hpp:778
void copy_from(const Executor *src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition: executor.hpp:498
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition: executor.hpp:459
This is a deleter that uses an executor's free method to deallocate the data.
Definition: executor.hpp:628
std::size_t size_type
Integral type used for allocation quantities.
Definition: types.hpp:94
int get_num_cores_per_sm() const noexcept
Get the number of cores per SM of this executor.
Definition: executor.hpp:842
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition: executor.hpp:847
The Ginkgo namespace.
Definition: abstract_factory.hpp:45
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition: executor.hpp:771
void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda) const
Runs one of the passed in functors, depending on the Executor type.
Definition: executor.hpp:441
static std::shared_ptr< OmpExecutor > create()
Creates a new OmpExecutor.
Definition: executor.hpp:735
EnableLogging is a mixin which should be inherited by any class which wants to enable logging...
Definition: logger.hpp:521
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition: executor.hpp:874
This is the Executor subclass which represents the CUDA device.
Definition: executor.hpp:804
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition: executor.hpp:476
Operations can be used to define functionalities whose implementations differ among devices...
Definition: executor.hpp:173
This is the Executor subclass which represents the OpenMP device (typically CPU). ...
Definition: executor.hpp:727
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition: executor.hpp:881
void operator()(pointer ptr) const
Deletes the object.
Definition: executor.hpp:646
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition: executor.hpp:852
int get_minor_version() const noexcept
Get the minor version of the compute capability.
Definition: executor.hpp:867
The first step in using the Ginkgo library consists of creating an executor.
Definition: executor.hpp:410