Ginkgo 1.10.0
A numerical linear algebra library targeting many-core architectures
mpi.hpp
1 // SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
2 //
3 // SPDX-License-Identifier: BSD-3-Clause
4 
5 #ifndef GKO_PUBLIC_CORE_BASE_MPI_HPP_
6 #define GKO_PUBLIC_CORE_BASE_MPI_HPP_
7 
8 
9 #include <memory>
10 #include <type_traits>
11 #include <utility>
12 
13 #include <ginkgo/config.hpp>
14 #include <ginkgo/core/base/exception.hpp>
15 #include <ginkgo/core/base/exception_helpers.hpp>
16 #include <ginkgo/core/base/executor.hpp>
17 #include <ginkgo/core/base/half.hpp>
18 #include <ginkgo/core/base/types.hpp>
19 #include <ginkgo/core/base/utils_helper.hpp>
20 
21 
22 #if GINKGO_BUILD_MPI
23 
24 
25 #include <mpi.h>
26 
27 
28 namespace gko {
29 namespace experimental {
36 namespace mpi {
37 
38 
42 inline constexpr bool is_gpu_aware()
43 {
44 #if GINKGO_HAVE_GPU_AWARE_MPI
45  return true;
46 #else
47  return false;
48 #endif
49 }
50 
51 
59 int map_rank_to_device_id(MPI_Comm comm, int num_devices);
60 
61 
62 #define GKO_REGISTER_MPI_TYPE(input_type, mpi_type) \
63  template <> \
64  struct type_impl<input_type> { \
65  static MPI_Datatype get_type() { return mpi_type; } \
66  }
67 
76 template <typename T>
77 struct type_impl {};
78 
79 
80 GKO_REGISTER_MPI_TYPE(char, MPI_CHAR);
81 GKO_REGISTER_MPI_TYPE(unsigned char, MPI_UNSIGNED_CHAR);
82 GKO_REGISTER_MPI_TYPE(unsigned, MPI_UNSIGNED);
83 GKO_REGISTER_MPI_TYPE(int, MPI_INT);
84 GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT);
85 GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG);
86 GKO_REGISTER_MPI_TYPE(long, MPI_LONG);
87 GKO_REGISTER_MPI_TYPE(long long, MPI_LONG_LONG_INT);
88 GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG);
89 GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT);
90 GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE);
91 GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE);
92 #if GINKGO_ENABLE_HALF
93 // OpenMPI 5.0 and MPICH v3.4a1 provide MPIX_C_FLOAT16.
94 // Only OpenMPI supports complex half.
95 // TODO: use the native type when MPI is configured with half support
96 GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT);
97 GKO_REGISTER_MPI_TYPE(std::complex<half>, MPI_FLOAT);
98 #endif // GINKGO_ENABLE_HALF
99 GKO_REGISTER_MPI_TYPE(std::complex<float>, MPI_C_FLOAT_COMPLEX);
100 GKO_REGISTER_MPI_TYPE(std::complex<double>, MPI_C_DOUBLE_COMPLEX);
101 
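A minimal usage sketch (illustration only, not part of the header): generic code resolves the MPI datatype matching a C++ type through type_impl, using the specializations registered above.

    #include <ginkgo/core/base/mpi.hpp>

    void datatype_lookup_sketch()
    {
        // Resolves to MPI_DOUBLE through the specialization registered above.
        MPI_Datatype t = gko::experimental::mpi::type_impl<double>::get_type();
        // A type without a GKO_REGISTER_MPI_TYPE specialization hits the empty
        // primary template, so calling get_type() on it fails to compile.
        (void)t;
    }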
102 
109 class contiguous_type {
110 public:
117  contiguous_type(int count, MPI_Datatype old_type) : type_(MPI_DATATYPE_NULL)
118  {
119  GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_contiguous(count, old_type, &type_));
120  GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_commit(&type_));
121  }
122 
126  contiguous_type() : type_(MPI_DATATYPE_NULL) {}
127 
131  contiguous_type(const contiguous_type&) = delete;
132 
136  contiguous_type& operator=(const contiguous_type&) = delete;
137 
143  contiguous_type(contiguous_type&& other) noexcept : type_(MPI_DATATYPE_NULL)
144  {
145  *this = std::move(other);
146  }
147 
155  contiguous_type& operator=(contiguous_type&& other) noexcept
156  {
157  if (this != &other) {
158  this->type_ = std::exchange(other.type_, MPI_DATATYPE_NULL);
159  }
160  return *this;
161  }
162 
166  ~contiguous_type()
167  {
168  if (type_ != MPI_DATATYPE_NULL) {
169  MPI_Type_free(&type_);
170  }
171  }
172 
178  MPI_Datatype get() const { return type_; }
179 
180 private:
181  MPI_Datatype type_;
182 };
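A usage sketch for contiguous_type, assuming an initialized MPI environment and at least two ranks; the destination rank and tag below are placeholders for illustration.

    #include <ginkgo/core/base/mpi.hpp>

    void contiguous_type_sketch()
    {
        // Wrap three doubles as a single committed MPI datatype.
        gko::experimental::mpi::contiguous_type vec3(3, MPI_DOUBLE);
        double payload[3] = {1.0, 2.0, 3.0};
        // Send one element of the derived type (placeholder rank and tag).
        MPI_Send(payload, 1, vec3.get(), /*dest=*/1, /*tag=*/0, MPI_COMM_WORLD);
        // The wrapped datatype is freed via MPI_Type_free when vec3 leaves scope.
    }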
183 
184 
189 enum class thread_type {
190  serialized = MPI_THREAD_SERIALIZED,
191  funneled = MPI_THREAD_FUNNELED,
192  single = MPI_THREAD_SINGLE,
193  multiple = MPI_THREAD_MULTIPLE
194 };
195 
196 
206 class environment {
207 public:
208  static bool is_finalized()
209  {
210  int flag = 0;
211  GKO_ASSERT_NO_MPI_ERRORS(MPI_Finalized(&flag));
212  return flag;
213  }
214 
215  static bool is_initialized()
216  {
217  int flag = 0;
218  GKO_ASSERT_NO_MPI_ERRORS(MPI_Initialized(&flag));
219  return flag;
220  }
221 
227  int get_provided_thread_support() const { return provided_thread_support_; }
228 
237  environment(int& argc, char**& argv,
238  const thread_type thread_t = thread_type::serialized)
239  {
240  this->required_thread_support_ = static_cast<int>(thread_t);
241  GKO_ASSERT_NO_MPI_ERRORS(
242  MPI_Init_thread(&argc, &argv, this->required_thread_support_,
243  &(this->provided_thread_support_)));
244  }
245 
249  ~environment() { MPI_Finalize(); }
250 
251  environment(const environment&) = delete;
252  environment(environment&&) = delete;
253  environment& operator=(const environment&) = delete;
254  environment& operator=(environment&&) = delete;
255 
256 private:
257  int required_thread_support_;
258  int provided_thread_support_;
259 };
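The intended RAII pattern, as a sketch: construct a single environment at the top of main and let its destructor finalize MPI.

    #include <ginkgo/core/base/mpi.hpp>

    int main(int argc, char* argv[])
    {
        // Calls MPI_Init_thread, requesting MPI_THREAD_SERIALIZED by default.
        gko::experimental::mpi::environment env(argc, argv);
        // ... all MPI communication happens inside this scope ...
        return 0;
    }  // env's destructor calls MPI_Finalize here.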
260 
261 
262 namespace {
263 
264 
269 class comm_deleter {
270 public:
271  using pointer = MPI_Comm*;
272  void operator()(pointer comm) const
273  {
274  GKO_ASSERT(*comm != MPI_COMM_NULL);
275  GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_free(comm));
276  delete comm;
277  }
278 };
279 
280 
281 } // namespace
282 
283 
287 struct status {
291  status() : status_(MPI_Status{}) {}
292 
298  MPI_Status* get() { return &this->status_; }
299 
310  template <typename T>
311  int get_count(const T* data) const
312  {
313  int count;
314  MPI_Get_count(&status_, type_impl<T>::get_type(), &count);
315  return count;
316  }
317 
318 private:
319  MPI_Status status_;
320 };
321 
322 
327 class request {
328 public:
333  request() : req_(MPI_REQUEST_NULL) {}
334 
335  request(const request&) = delete;
336 
337  request& operator=(const request&) = delete;
338 
339  request(request&& o) noexcept { *this = std::move(o); }
340 
341  request& operator=(request&& o) noexcept
342  {
343  if (this != &o) {
344  this->req_ = std::exchange(o.req_, MPI_REQUEST_NULL);
345  }
346  return *this;
347  }
348 
349  ~request()
350  {
351  if (req_ != MPI_REQUEST_NULL) {
352  if (MPI_Request_free(&req_) != MPI_SUCCESS) {
353  std::terminate(); // since we can't throw in destructors, we
354  // have to terminate the program
355  }
356  }
357  }
358 
364  MPI_Request* get() { return &this->req_; }
365 
372  status wait()
373  {
374  status status;
375  GKO_ASSERT_NO_MPI_ERRORS(MPI_Wait(&req_, status.get()));
376  return status;
377  }
378 
379 
380 private:
381  MPI_Request req_;
382 };
383 
384 
392 inline std::vector<status> wait_all(std::vector<request>& req)
393 {
394  std::vector<status> stat;
395  for (std::size_t i = 0; i < req.size(); ++i) {
396  stat.emplace_back(req[i].wait());
397  }
398  return stat;
399 }
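A sketch of overlapping two non-blocking receives and collecting their statuses; the communicator, executor, buffers, and counts are assumed to be provided by the caller.

    #include <memory>
    #include <vector>
    #include <ginkgo/core/base/mpi.hpp>

    void overlap_recv_sketch(const gko::experimental::mpi::communicator& comm,
                             std::shared_ptr<const gko::Executor> exec,
                             double* buf_a, int count_a, double* buf_b,
                             int count_b)
    {
        std::vector<gko::experimental::mpi::request> reqs;
        reqs.push_back(comm.i_recv(exec, buf_a, count_a, /*source=*/0, /*tag=*/0));
        reqs.push_back(comm.i_recv(exec, buf_b, count_b, /*source=*/1, /*tag=*/0));
        // Blocks until both receives complete; returns one status per request.
        auto stats = gko::experimental::mpi::wait_all(reqs);
        // status::get_count reports how many elements actually arrived.
        int received = stats[0].get_count(buf_a);
        (void)received;
    }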
400 
401 
416 class communicator {
417 public:
428  communicator(const MPI_Comm& comm, bool force_host_buffer = false)
429  : comm_(), force_host_buffer_(force_host_buffer)
430  {
431  this->comm_.reset(new MPI_Comm(comm));
432  }
433 
442  communicator(const MPI_Comm& comm, int color, int key)
443  {
444  MPI_Comm comm_out;
445  GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_split(comm, color, key, &comm_out));
446  this->comm_.reset(new MPI_Comm(comm_out), comm_deleter{});
447  }
448 
457  communicator(const communicator& comm, int color, int key)
458  {
459  MPI_Comm comm_out;
460  GKO_ASSERT_NO_MPI_ERRORS(
461  MPI_Comm_split(comm.get(), color, key, &comm_out));
462  this->comm_.reset(new MPI_Comm(comm_out), comm_deleter{});
463  }
464 
470  const MPI_Comm& get() const { return *(this->comm_.get()); }
471 
472  bool force_host_buffer() const { return force_host_buffer_; }
473 
479  int size() const { return get_num_ranks(); }
480 
486  int rank() const { return get_my_rank(); };
487 
493  int node_local_rank() const { return get_node_local_rank(); };
494 
500  bool operator==(const communicator& rhs) const
501  {
502  return compare(rhs.get());
503  }
504 
510  bool operator!=(const communicator& rhs) const { return !(*this == rhs); }
511 
516  void synchronize() const
517  {
518  GKO_ASSERT_NO_MPI_ERRORS(MPI_Barrier(this->get()));
519  }
520 
534  template <typename SendType>
535  void send(std::shared_ptr<const Executor> exec, const SendType* send_buffer,
536  const int send_count, const int destination_rank,
537  const int send_tag) const
538  {
539  auto guard = exec->get_scoped_device_id_guard();
540  GKO_ASSERT_NO_MPI_ERRORS(
541  MPI_Send(send_buffer, send_count, type_impl<SendType>::get_type(),
542  destination_rank, send_tag, this->get()));
543  }
544 
561  template <typename SendType>
562  request i_send(std::shared_ptr<const Executor> exec,
563  const SendType* send_buffer, const int send_count,
564  const int destination_rank, const int send_tag) const
565  {
566  auto guard = exec->get_scoped_device_id_guard();
567  request req;
568  GKO_ASSERT_NO_MPI_ERRORS(
569  MPI_Isend(send_buffer, send_count, type_impl<SendType>::get_type(),
570  destination_rank, send_tag, this->get(), req.get()));
571  return req;
572  }
573 
589  template <typename RecvType>
590  status recv(std::shared_ptr<const Executor> exec, RecvType* recv_buffer,
591  const int recv_count, const int source_rank,
592  const int recv_tag) const
593  {
594  auto guard = exec->get_scoped_device_id_guard();
595  status st;
596  GKO_ASSERT_NO_MPI_ERRORS(
597  MPI_Recv(recv_buffer, recv_count, type_impl<RecvType>::get_type(),
598  source_rank, recv_tag, this->get(), st.get()));
599  return st;
600  }
601 
617  template <typename RecvType>
618  request i_recv(std::shared_ptr<const Executor> exec, RecvType* recv_buffer,
619  const int recv_count, const int source_rank,
620  const int recv_tag) const
621  {
622  auto guard = exec->get_scoped_device_id_guard();
623  request req;
624  GKO_ASSERT_NO_MPI_ERRORS(
625  MPI_Irecv(recv_buffer, recv_count, type_impl<RecvType>::get_type(),
626  source_rank, recv_tag, this->get(), req.get()));
627  return req;
628  }
629 
642  template <typename BroadcastType>
643  void broadcast(std::shared_ptr<const Executor> exec, BroadcastType* buffer,
644  int count, int root_rank) const
645  {
646  auto guard = exec->get_scoped_device_id_guard();
647  GKO_ASSERT_NO_MPI_ERRORS(MPI_Bcast(buffer, count,
648  type_impl<BroadcastType>::get_type(),
649  root_rank, this->get()));
650  }
651 
667  template <typename BroadcastType>
668  request i_broadcast(std::shared_ptr<const Executor> exec,
669  BroadcastType* buffer, int count, int root_rank) const
670  {
671  auto guard = exec->get_scoped_device_id_guard();
672  request req;
673  GKO_ASSERT_NO_MPI_ERRORS(
674  MPI_Ibcast(buffer, count, type_impl<BroadcastType>::get_type(),
675  root_rank, this->get(), req.get()));
676  return req;
677  }
678 
693  template <typename ReduceType>
694  void reduce(std::shared_ptr<const Executor> exec,
695  const ReduceType* send_buffer, ReduceType* recv_buffer,
696  int count, MPI_Op operation, int root_rank) const
697  {
698  auto guard = exec->get_scoped_device_id_guard();
699  GKO_ASSERT_NO_MPI_ERRORS(MPI_Reduce(send_buffer, recv_buffer, count,
700  type_impl<ReduceType>::get_type(),
701  operation, root_rank, this->get()));
702  }
703 
720  template <typename ReduceType>
721  request i_reduce(std::shared_ptr<const Executor> exec,
722  const ReduceType* send_buffer, ReduceType* recv_buffer,
723  int count, MPI_Op operation, int root_rank) const
724  {
725  auto guard = exec->get_scoped_device_id_guard();
726  request req;
727  GKO_ASSERT_NO_MPI_ERRORS(MPI_Ireduce(
728  send_buffer, recv_buffer, count, type_impl<ReduceType>::get_type(),
729  operation, root_rank, this->get(), req.get()));
730  return req;
731  }
732 
746  template <typename ReduceType>
747  void all_reduce(std::shared_ptr<const Executor> exec,
748  ReduceType* recv_buffer, int count, MPI_Op operation) const
749  {
750  auto guard = exec->get_scoped_device_id_guard();
751  GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce(
752  MPI_IN_PLACE, recv_buffer, count, type_impl<ReduceType>::get_type(),
753  operation, this->get()));
754  }
755 
771  template <typename ReduceType>
772  request i_all_reduce(std::shared_ptr<const Executor> exec,
773  ReduceType* recv_buffer, int count,
774  MPI_Op operation) const
775  {
776  auto guard = exec->get_scoped_device_id_guard();
777  request req;
778  GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce(
779  MPI_IN_PLACE, recv_buffer, count, type_impl<ReduceType>::get_type(),
780  operation, this->get(), req.get()));
781  return req;
782  }
783 
798  template <typename ReduceType>
799  void all_reduce(std::shared_ptr<const Executor> exec,
800  const ReduceType* send_buffer, ReduceType* recv_buffer,
801  int count, MPI_Op operation) const
802  {
803  auto guard = exec->get_scoped_device_id_guard();
804  GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce(
805  send_buffer, recv_buffer, count, type_impl<ReduceType>::get_type(),
806  operation, this->get()));
807  }
808 
825  template <typename ReduceType>
826  request i_all_reduce(std::shared_ptr<const Executor> exec,
827  const ReduceType* send_buffer, ReduceType* recv_buffer,
828  int count, MPI_Op operation) const
829  {
830  auto guard = exec->get_scoped_device_id_guard();
831  request req;
832  GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce(
833  send_buffer, recv_buffer, count, type_impl<ReduceType>::get_type(),
834  operation, this->get(), req.get()));
835  return req;
836  }
837 
854  template <typename SendType, typename RecvType>
855  void gather(std::shared_ptr<const Executor> exec,
856  const SendType* send_buffer, const int send_count,
857  RecvType* recv_buffer, const int recv_count,
858  int root_rank) const
859  {
860  auto guard = exec->get_scoped_device_id_guard();
861  GKO_ASSERT_NO_MPI_ERRORS(
862  MPI_Gather(send_buffer, send_count, type_impl<SendType>::get_type(),
863  recv_buffer, recv_count, type_impl<RecvType>::get_type(),
864  root_rank, this->get()));
865  }
866 
886  template <typename SendType, typename RecvType>
887  request i_gather(std::shared_ptr<const Executor> exec,
888  const SendType* send_buffer, const int send_count,
889  RecvType* recv_buffer, const int recv_count,
890  int root_rank) const
891  {
892  auto guard = exec->get_scoped_device_id_guard();
893  request req;
894  GKO_ASSERT_NO_MPI_ERRORS(MPI_Igather(
895  send_buffer, send_count, type_impl<SendType>::get_type(),
896  recv_buffer, recv_count, type_impl<RecvType>::get_type(), root_rank,
897  this->get(), req.get()));
898  return req;
899  }
900 
919  template <typename SendType, typename RecvType>
920  void gather_v(std::shared_ptr<const Executor> exec,
921  const SendType* send_buffer, const int send_count,
922  RecvType* recv_buffer, const int* recv_counts,
923  const int* displacements, int root_rank) const
924  {
925  auto guard = exec->get_scoped_device_id_guard();
926  GKO_ASSERT_NO_MPI_ERRORS(MPI_Gatherv(
927  send_buffer, send_count, type_impl<SendType>::get_type(),
928  recv_buffer, recv_counts, displacements,
929  type_impl<RecvType>::get_type(), root_rank, this->get()));
930  }
931 
952  template <typename SendType, typename RecvType>
953  request i_gather_v(std::shared_ptr<const Executor> exec,
954  const SendType* send_buffer, const int send_count,
955  RecvType* recv_buffer, const int* recv_counts,
956  const int* displacements, int root_rank) const
957  {
958  auto guard = exec->get_scoped_device_id_guard();
959  request req;
960  GKO_ASSERT_NO_MPI_ERRORS(MPI_Igatherv(
961  send_buffer, send_count, type_impl<SendType>::get_type(),
962  recv_buffer, recv_counts, displacements,
963  type_impl<RecvType>::get_type(), root_rank, this->get(),
964  req.get()));
965  return req;
966  }
967 
983  template <typename SendType, typename RecvType>
984  void all_gather(std::shared_ptr<const Executor> exec,
985  const SendType* send_buffer, const int send_count,
986  RecvType* recv_buffer, const int recv_count) const
987  {
988  auto guard = exec->get_scoped_device_id_guard();
989  GKO_ASSERT_NO_MPI_ERRORS(MPI_Allgather(
990  send_buffer, send_count, type_impl<SendType>::get_type(),
991  recv_buffer, recv_count, type_impl<RecvType>::get_type(),
992  this->get()));
993  }
994 
1013  template <typename SendType, typename RecvType>
1014  request i_all_gather(std::shared_ptr<const Executor> exec,
1015  const SendType* send_buffer, const int send_count,
1016  RecvType* recv_buffer, const int recv_count) const
1017  {
1018  auto guard = exec->get_scoped_device_id_guard();
1019  request req;
1020  GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallgather(
1021  send_buffer, send_count, type_impl<SendType>::get_type(),
1022  recv_buffer, recv_count, type_impl<RecvType>::get_type(),
1023  this->get(), req.get()));
1024  return req;
1025  }
1026 
1042  template <typename SendType, typename RecvType>
1043  void scatter(std::shared_ptr<const Executor> exec,
1044  const SendType* send_buffer, const int send_count,
1045  RecvType* recv_buffer, const int recv_count,
1046  int root_rank) const
1047  {
1048  auto guard = exec->get_scoped_device_id_guard();
1049  GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatter(
1050  send_buffer, send_count, type_impl<SendType>::get_type(),
1051  recv_buffer, recv_count, type_impl<RecvType>::get_type(), root_rank,
1052  this->get()));
1053  }
1054 
1073  template <typename SendType, typename RecvType>
1074  request i_scatter(std::shared_ptr<const Executor> exec,
1075  const SendType* send_buffer, const int send_count,
1076  RecvType* recv_buffer, const int recv_count,
1077  int root_rank) const
1078  {
1079  auto guard = exec->get_scoped_device_id_guard();
1080  request req;
1081  GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscatter(
1082  send_buffer, send_count, type_impl<SendType>::get_type(),
1083  recv_buffer, recv_count, type_impl<RecvType>::get_type(), root_rank,
1084  this->get(), req.get()));
1085  return req;
1086  }
1087 
1106  template <typename SendType, typename RecvType>
1107  void scatter_v(std::shared_ptr<const Executor> exec,
1108  const SendType* send_buffer, const int* send_counts,
1109  const int* displacements, RecvType* recv_buffer,
1110  const int recv_count, int root_rank) const
1111  {
1112  auto guard = exec->get_scoped_device_id_guard();
1113  GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatterv(
1114  send_buffer, send_counts, displacements,
1115  type_impl<SendType>::get_type(), recv_buffer, recv_count,
1116  type_impl<RecvType>::get_type(), root_rank, this->get()));
1117  }
1118 
1139  template <typename SendType, typename RecvType>
1140  request i_scatter_v(std::shared_ptr<const Executor> exec,
1141  const SendType* send_buffer, const int* send_counts,
1142  const int* displacements, RecvType* recv_buffer,
1143  const int recv_count, int root_rank) const
1144  {
1145  auto guard = exec->get_scoped_device_id_guard();
1146  request req;
1147  GKO_ASSERT_NO_MPI_ERRORS(
1148  MPI_Iscatterv(send_buffer, send_counts, displacements,
1149  type_impl<SendType>::get_type(), recv_buffer,
1150  recv_count, type_impl<RecvType>::get_type(),
1151  root_rank, this->get(), req.get()));
1152  return req;
1153  }
1154 
1171  template <typename RecvType>
1172  void all_to_all(std::shared_ptr<const Executor> exec, RecvType* recv_buffer,
1173  const int recv_count) const
1174  {
1175  auto guard = exec->get_scoped_device_id_guard();
1176  GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall(
1177  MPI_IN_PLACE, recv_count, type_impl<RecvType>::get_type(),
1178  recv_buffer, recv_count, type_impl<RecvType>::get_type(),
1179  this->get()));
1180  }
1181 
1200  template <typename RecvType>
1201  request i_all_to_all(std::shared_ptr<const Executor> exec,
1202  RecvType* recv_buffer, const int recv_count) const
1203  {
1204  auto guard = exec->get_scoped_device_id_guard();
1205  request req;
1206  GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall(
1207  MPI_IN_PLACE, recv_count, type_impl<RecvType>::get_type(),
1208  recv_buffer, recv_count, type_impl<RecvType>::get_type(),
1209  this->get(), req.get()));
1210  return req;
1211  }
1212 
1229  template <typename SendType, typename RecvType>
1230  void all_to_all(std::shared_ptr<const Executor> exec,
1231  const SendType* send_buffer, const int send_count,
1232  RecvType* recv_buffer, const int recv_count) const
1233  {
1234  auto guard = exec->get_scoped_device_id_guard();
1235  GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall(
1236  send_buffer, send_count, type_impl<SendType>::get_type(),
1237  recv_buffer, recv_count, type_impl<RecvType>::get_type(),
1238  this->get()));
1239  }
1240 
1259  template <typename SendType, typename RecvType>
1260  request i_all_to_all(std::shared_ptr<const Executor> exec,
1261  const SendType* send_buffer, const int send_count,
1262  RecvType* recv_buffer, const int recv_count) const
1263  {
1264  auto guard = exec->get_scoped_device_id_guard();
1265  request req;
1266  GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall(
1267  send_buffer, send_count, type_impl<SendType>::get_type(),
1268  recv_buffer, recv_count, type_impl<RecvType>::get_type(),
1269  this->get(), req.get()));
1270  return req;
1271  }
1272 
1292  template <typename SendType, typename RecvType>
1293  void all_to_all_v(std::shared_ptr<const Executor> exec,
1294  const SendType* send_buffer, const int* send_counts,
1295  const int* send_offsets, RecvType* recv_buffer,
1296  const int* recv_counts, const int* recv_offsets) const
1297  {
1298  this->all_to_all_v(std::move(exec), send_buffer, send_counts,
1299  send_offsets, type_impl<SendType>::get_type(),
1300  recv_buffer, recv_counts, recv_offsets,
1301  type_impl<RecvType>::get_type());
1302  }
1303 
1319  void all_to_all_v(std::shared_ptr<const Executor> exec,
1320  const void* send_buffer, const int* send_counts,
1321  const int* send_offsets, MPI_Datatype send_type,
1322  void* recv_buffer, const int* recv_counts,
1323  const int* recv_offsets, MPI_Datatype recv_type) const
1324  {
1325  auto guard = exec->get_scoped_device_id_guard();
1326  GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoallv(
1327  send_buffer, send_counts, send_offsets, send_type, recv_buffer,
1328  recv_counts, recv_offsets, recv_type, this->get()));
1329  }
1330 
1350  request i_all_to_all_v(std::shared_ptr<const Executor> exec,
1351  const void* send_buffer, const int* send_counts,
1352  const int* send_offsets, MPI_Datatype send_type,
1353  void* recv_buffer, const int* recv_counts,
1354  const int* recv_offsets,
1355  MPI_Datatype recv_type) const
1356  {
1357  auto guard = exec->get_scoped_device_id_guard();
1358  request req;
1359  GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoallv(
1360  send_buffer, send_counts, send_offsets, send_type, recv_buffer,
1361  recv_counts, recv_offsets, recv_type, this->get(), req.get()));
1362  return req;
1363  }
1364 
1385  template <typename SendType, typename RecvType>
1386  request i_all_to_all_v(std::shared_ptr<const Executor> exec,
1387  const SendType* send_buffer, const int* send_counts,
1388  const int* send_offsets, RecvType* recv_buffer,
1389  const int* recv_counts,
1390  const int* recv_offsets) const
1391  {
1392  return this->i_all_to_all_v(
1393  std::move(exec), send_buffer, send_counts, send_offsets,
1394  type_impl<SendType>::get_type(), recv_buffer, recv_counts,
1395  recv_offsets, type_impl<RecvType>::get_type());
1396  }
1397 
1412  template <typename ScanType>
1413  void scan(std::shared_ptr<const Executor> exec, const ScanType* send_buffer,
1414  ScanType* recv_buffer, int count, MPI_Op operation) const
1415  {
1416  auto guard = exec->get_scoped_device_id_guard();
1417  GKO_ASSERT_NO_MPI_ERRORS(MPI_Scan(send_buffer, recv_buffer, count,
1418  type_impl<ScanType>::get_type(),
1419  operation, this->get()));
1420  }
1421 
1438  template <typename ScanType>
1439  request i_scan(std::shared_ptr<const Executor> exec,
1440  const ScanType* send_buffer, ScanType* recv_buffer,
1441  int count, MPI_Op operation) const
1442  {
1443  auto guard = exec->get_scoped_device_id_guard();
1444  request req;
1445  GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscan(send_buffer, recv_buffer, count,
1446  type_impl<ScanType>::get_type(),
1447  operation, this->get(), req.get()));
1448  return req;
1449  }
1450 
1451 private:
1452  std::shared_ptr<MPI_Comm> comm_;
1453  bool force_host_buffer_;
1454 
1455  int get_my_rank() const
1456  {
1457  int my_rank = 0;
1458  GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_rank(get(), &my_rank));
1459  return my_rank;
1460  }
1461 
1462  int get_node_local_rank() const
1463  {
1464  MPI_Comm local_comm;
1465  int rank;
1466  GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_split_type(
1467  this->get(), MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm));
1468  GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_rank(local_comm, &rank));
1469  MPI_Comm_free(&local_comm);
1470  return rank;
1471  }
1472 
1473  int get_num_ranks() const
1474  {
1475  int size = 1;
1476  GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_size(this->get(), &size));
1477  return size;
1478  }
1479 
1480  bool compare(const MPI_Comm& other) const
1481  {
1482  int flag;
1483  GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_compare(get(), other, &flag));
1484  return flag == MPI_IDENT;
1485  }
1486 };
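A usage sketch for the communicator, assuming an initialized environment (see the environment class above): every rank contributes its rank number and receives the global sum.

    #include <ginkgo/core/base/mpi.hpp>

    void all_reduce_sketch()
    {
        auto exec = gko::ReferenceExecutor::create();
        gko::experimental::mpi::communicator comm(MPI_COMM_WORLD);
        double local = static_cast<double>(comm.rank());
        double global = 0.0;
        comm.all_reduce(exec, &local, &global, 1, MPI_SUM);
        // Every rank now holds 0 + 1 + ... + (comm.size() - 1) in global.
    }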
1487 
1488 
1493 bool requires_host_buffer(const std::shared_ptr<const Executor>& exec,
1494  const communicator& comm);
1495 
1496 
1502 inline double get_walltime() { return MPI_Wtime(); }
1503 
1504 
1513 template <typename ValueType>
1514 class window {
1515 public:
1519  enum class create_type { allocate = 1, create = 2, dynamic_create = 3 };
1520 
1524  enum class lock_type { shared = 1, exclusive = 2 };
1525 
1529  window() : window_(MPI_WIN_NULL) {}
1530 
1531  window(const window& other) = delete;
1532 
1533  window& operator=(const window& other) = delete;
1534 
1541  window(window&& other) : window_{std::exchange(other.window_, MPI_WIN_NULL)}
1542  {}
1543 
1550  window& operator=(window&& other)
1551  {
1552  window_ = std::exchange(other.window_, MPI_WIN_NULL);
1553  }
1554 
1567  window(std::shared_ptr<const Executor> exec, ValueType* base, int num_elems,
1568  const communicator& comm, const int disp_unit = sizeof(ValueType),
1569  MPI_Info input_info = MPI_INFO_NULL,
1570  create_type c_type = create_type::create)
1571  {
1572  auto guard = exec->get_scoped_device_id_guard();
1573  unsigned size = num_elems * sizeof(ValueType);
1574  if (c_type == create_type::create) {
1575  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_create(
1576  base, size, disp_unit, input_info, comm.get(), &this->window_));
1577  } else if (c_type == create_type::dynamic_create) {
1578  GKO_ASSERT_NO_MPI_ERRORS(
1579  MPI_Win_create_dynamic(input_info, comm.get(), &this->window_));
1580  } else if (c_type == create_type::allocate) {
1581  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_allocate(
1582  size, disp_unit, input_info, comm.get(), base, &this->window_));
1583  } else {
1584  GKO_NOT_IMPLEMENTED;
1585  }
1586  }
1587 
1593  MPI_Win get_window() const { return this->window_; }
1594 
1601  void fence(int assert = 0) const
1602  {
1603  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_fence(assert, this->window_));
1604  }
1605 
1614  void lock(int rank, lock_type lock_t = lock_type::shared,
1615  int assert = 0) const
1616  {
1617  if (lock_t == lock_type::shared) {
1618  GKO_ASSERT_NO_MPI_ERRORS(
1619  MPI_Win_lock(MPI_LOCK_SHARED, rank, assert, this->window_));
1620  } else if (lock_t == lock_type::exclusive) {
1621  GKO_ASSERT_NO_MPI_ERRORS(
1622  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, assert, this->window_));
1623  } else {
1624  GKO_NOT_IMPLEMENTED;
1625  }
1626  }
1627 
1634  void unlock(int rank) const
1635  {
1636  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_unlock(rank, this->window_));
1637  }
1638 
1645  void lock_all(int assert = 0) const
1646  {
1647  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_lock_all(assert, this->window_));
1648  }
1649 
1654  void unlock_all() const
1655  {
1656  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_unlock_all(this->window_));
1657  }
1658 
1665  void flush(int rank) const
1666  {
1667  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush(rank, this->window_));
1668  }
1669 
1676  void flush_local(int rank) const
1677  {
1678  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush_local(rank, this->window_));
1679  }
1680 
1685  void flush_all() const
1686  {
1687  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush_all(this->window_));
1688  }
1689 
1694  void flush_all_local() const
1695  {
1696  GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush_local_all(this->window_));
1697  }
1698 
1702  void sync() const { GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_sync(this->window_)); }
1703 
1707  ~window()
1708  {
1709  if (this->window_ && this->window_ != MPI_WIN_NULL) {
1710  MPI_Win_free(&this->window_);
1711  }
1712  }
1713 
1724  template <typename PutType>
1725  void put(std::shared_ptr<const Executor> exec, const PutType* origin_buffer,
1726  const int origin_count, const int target_rank,
1727  const unsigned int target_disp, const int target_count) const
1728  {
1729  auto guard = exec->get_scoped_device_id_guard();
1730  GKO_ASSERT_NO_MPI_ERRORS(
1731  MPI_Put(origin_buffer, origin_count, type_impl<PutType>::get_type(),
1732  target_rank, target_disp, target_count,
1733  type_impl<PutType>::get_type(), this->get_window()));
1734  }
1735 
1748  template <typename PutType>
1749  request r_put(std::shared_ptr<const Executor> exec,
1750  const PutType* origin_buffer, const int origin_count,
1751  const int target_rank, const unsigned int target_disp,
1752  const int target_count) const
1753  {
1754  auto guard = exec->get_scoped_device_id_guard();
1755  request req;
1756  GKO_ASSERT_NO_MPI_ERRORS(MPI_Rput(
1757  origin_buffer, origin_count, type_impl<PutType>::get_type(),
1758  target_rank, target_disp, target_count,
1759  type_impl<PutType>::get_type(), this->get_window(), req.get()));
1760  return req;
1761  }
1762 
1774  template <typename PutType>
1775  void accumulate(std::shared_ptr<const Executor> exec,
1776  const PutType* origin_buffer, const int origin_count,
1777  const int target_rank, const unsigned int target_disp,
1778  const int target_count, MPI_Op operation) const
1779  {
1780  auto guard = exec->get_scoped_device_id_guard();
1781  GKO_ASSERT_NO_MPI_ERRORS(MPI_Accumulate(
1782  origin_buffer, origin_count, type_impl<PutType>::get_type(),
1783  target_rank, target_disp, target_count,
1784  type_impl<PutType>::get_type(), operation, this->get_window()));
1785  }
1786 
1800  template <typename PutType>
1801  request r_accumulate(std::shared_ptr<const Executor> exec,
1802  const PutType* origin_buffer, const int origin_count,
1803  const int target_rank, const unsigned int target_disp,
1804  const int target_count, MPI_Op operation) const
1805  {
1806  auto guard = exec->get_scoped_device_id_guard();
1807  request req;
1808  GKO_ASSERT_NO_MPI_ERRORS(MPI_Raccumulate(
1809  origin_buffer, origin_count, type_impl<PutType>::get_type(),
1810  target_rank, target_disp, target_count,
1811  type_impl<PutType>::get_type(), operation, this->get_window(),
1812  req.get()));
1813  return req;
1814  }
1815 
1826  template <typename GetType>
1827  void get(std::shared_ptr<const Executor> exec, GetType* origin_buffer,
1828  const int origin_count, const int target_rank,
1829  const unsigned int target_disp, const int target_count) const
1830  {
1831  auto guard = exec->get_scoped_device_id_guard();
1832  GKO_ASSERT_NO_MPI_ERRORS(
1833  MPI_Get(origin_buffer, origin_count, type_impl<GetType>::get_type(),
1834  target_rank, target_disp, target_count,
1835  type_impl<GetType>::get_type(), this->get_window()));
1836  }
1837 
1850  template <typename GetType>
1851  request r_get(std::shared_ptr<const Executor> exec, GetType* origin_buffer,
1852  const int origin_count, const int target_rank,
1853  const unsigned int target_disp, const int target_count) const
1854  {
1855  auto guard = exec->get_scoped_device_id_guard();
1856  request req;
1857  GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget(
1858  origin_buffer, origin_count, type_impl<GetType>::get_type(),
1859  target_rank, target_disp, target_count,
1860  type_impl<GetType>::get_type(), this->get_window(), req.get()));
1861  return req;
1862  }
1863 
1877  template <typename GetType>
1878  void get_accumulate(std::shared_ptr<const Executor> exec,
1879  GetType* origin_buffer, const int origin_count,
1880  GetType* result_buffer, const int result_count,
1881  const int target_rank, const unsigned int target_disp,
1882  const int target_count, MPI_Op operation) const
1883  {
1884  auto guard = exec->get_scoped_device_id_guard();
1885  GKO_ASSERT_NO_MPI_ERRORS(MPI_Get_accumulate(
1886  origin_buffer, origin_count, type_impl<GetType>::get_type(),
1887  result_buffer, result_count, type_impl<GetType>::get_type(),
1888  target_rank, target_disp, target_count,
1889  type_impl<GetType>::get_type(), operation, this->get_window()));
1890  }
1891 
1907  template <typename GetType>
1908  request r_get_accumulate(std::shared_ptr<const Executor> exec,
1909  GetType* origin_buffer, const int origin_count,
1910  GetType* result_buffer, const int result_count,
1911  const int target_rank,
1912  const unsigned int target_disp,
1913  const int target_count, MPI_Op operation) const
1914  {
1915  auto guard = exec->get_scoped_device_id_guard();
1916  request req;
1917  GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget_accumulate(
1918  origin_buffer, origin_count, type_impl<GetType>::get_type(),
1919  result_buffer, result_count, type_impl<GetType>::get_type(),
1920  target_rank, target_disp, target_count,
1921  type_impl<GetType>::get_type(), operation, this->get_window(),
1922  req.get()));
1923  return req;
1924  }
1925 
1936  template <typename GetType>
1937  void fetch_and_op(std::shared_ptr<const Executor> exec,
1938  GetType* origin_buffer, GetType* result_buffer,
1939  const int target_rank, const unsigned int target_disp,
1940  MPI_Op operation) const
1941  {
1942  auto guard = exec->get_scoped_device_id_guard();
1943  GKO_ASSERT_NO_MPI_ERRORS(MPI_Fetch_and_op(
1944  origin_buffer, result_buffer, type_impl<GetType>::get_type(),
1945  target_rank, target_disp, operation, this->get_window()));
1946  }
1947 
1948 private:
1949  MPI_Win window_;
1950 };
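A one-sided communication sketch with the window class, assuming an initialized environment and at least two ranks: rank 0 writes into rank 1's exposed buffer inside a fence epoch.

    #include <ginkgo/core/base/mpi.hpp>

    void one_sided_sketch()
    {
        auto exec = gko::ReferenceExecutor::create();
        gko::experimental::mpi::communicator comm(MPI_COMM_WORLD);
        double local_data[4] = {};
        gko::experimental::mpi::window<double> win(exec, local_data, 4, comm);
        win.fence();  // open an active-target epoch on all ranks
        if (comm.rank() == 0) {
            double value = 42.0;
            // Write one double into slot 0 of rank 1's window.
            win.put(exec, &value, 1, /*target_rank=*/1, /*target_disp=*/0,
                    /*target_count=*/1);
        }
        win.fence();  // close the epoch; the put is now visible on rank 1
    }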
1951 
1952 
1953 } // namespace mpi
1954 } // namespace experimental
1955 } // namespace gko
1956 
1957 
1958 #endif // GINKGO_BUILD_MPI
1959 
1960 
1961 #endif // GKO_PUBLIC_CORE_BASE_MPI_HPP_
gko::experimental::mpi::window
This class wraps an MPI_Win window with RAII functionality.
Definition: mpi.hpp:1514
gko::experimental::mpi::environment::get_provided_thread_support
int get_provided_thread_support() const
Return the provided thread support.
Definition: mpi.hpp:227
gko::experimental::mpi::requires_host_buffer
bool requires_host_buffer(const std::shared_ptr< const Executor > &exec, const communicator &comm)
Checks if the combination of Executor and communicator requires passing MPI buffers from the host memory.
gko::experimental::mpi::communicator::i_scan
request i_scan(std::shared_ptr< const Executor > exec, const ScanType *send_buffer, ScanType *recv_buffer, int count, MPI_Op operation) const
(Non-blocking) Does a scan operation with the given operator.
Definition: mpi.hpp:1439
gko::experimental::mpi::contiguous_type::contiguous_type
contiguous_type()
Constructs empty wrapper with MPI_DATATYPE_NULL.
Definition: mpi.hpp:126
gko::experimental::mpi::communicator::scan
void scan(std::shared_ptr< const Executor > exec, const ScanType *send_buffer, ScanType *recv_buffer, int count, MPI_Op operation) const
Does a scan operation with the given operator.
Definition: mpi.hpp:1413
gko::experimental::mpi::window::get_accumulate
void get_accumulate(std::shared_ptr< const Executor > exec, GetType *origin_buffer, const int origin_count, GetType *result_buffer, const int result_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const
Get Accumulate data from the target window.
Definition: mpi.hpp:1878
gko::experimental::mpi::window::lock
void lock(int rank, lock_type lock_t=lock_type::shared, int assert=0) const
Create an epoch using MPI_Win_lock for the window object.
Definition: mpi.hpp:1614
gko::experimental::mpi::communicator::i_broadcast
request i_broadcast(std::shared_ptr< const Executor > exec, BroadcastType *buffer, int count, int root_rank) const
(Non-blocking) Broadcast data from calling process to all ranks in the communicator
Definition: mpi.hpp:668
gko::experimental::mpi::communicator::communicator
communicator(const communicator &comm, int color, int key)
Create a communicator object from an existing MPI_Comm object using color and key.
Definition: mpi.hpp:457
gko::experimental::mpi::window::window
window(std::shared_ptr< const Executor > exec, ValueType *base, int num_elems, const communicator &comm, const int disp_unit=sizeof(ValueType), MPI_Info input_info=MPI_INFO_NULL, create_type c_type=create_type::create)
Create a window object with a given data pointer and type.
Definition: mpi.hpp:1567
gko::experimental::mpi::communicator::i_all_to_all
request i_all_to_all(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count) const
(Non-blocking) Communicate data from all ranks to all other ranks (MPI_Ialltoall).
Definition: mpi.hpp:1260
gko::experimental::mpi::environment::environment
environment(int &argc, char **&argv, const thread_type thread_t=thread_type::serialized)
Call MPI_Init_thread and initialize the MPI environment.
Definition: mpi.hpp:237
gko::experimental::mpi::window::put
void put(std::shared_ptr< const Executor > exec, const PutType *origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count) const
Put data into the target window.
Definition: mpi.hpp:1725
gko::experimental::mpi::window::create_type
create_type
The create type for the window object.
Definition: mpi.hpp:1519
gko::experimental::mpi::window::accumulate
void accumulate(std::shared_ptr< const Executor > exec, const PutType *origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const
Accumulate data into the target window.
Definition: mpi.hpp:1775
gko::experimental::mpi::window::fetch_and_op
void fetch_and_op(std::shared_ptr< const Executor > exec, GetType *origin_buffer, GetType *result_buffer, const int target_rank, const unsigned int target_disp, MPI_Op operation) const
Fetch and operate on data from the target window (an optimized version of MPI_Get_accumulate).
Definition: mpi.hpp:1937
gko::experimental::mpi::environment
Class that sets up and finalizes the MPI environment.
Definition: mpi.hpp:206
gko::experimental::mpi::communicator::all_to_all_v
void all_to_all_v(std::shared_ptr< const Executor > exec, const void *send_buffer, const int *send_counts, const int *send_offsets, MPI_Datatype send_type, void *recv_buffer, const int *recv_counts, const int *recv_offsets, MPI_Datatype recv_type) const
Communicate data from all ranks to all other ranks with offsets (MPI_Alltoallv).
Definition: mpi.hpp:1319
gko::experimental::mpi::communicator::send
void send(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, const int destination_rank, const int send_tag) const
Send (Blocking) data from calling process to destination rank.
Definition: mpi.hpp:535
gko::experimental::mpi::communicator::communicator
communicator(const MPI_Comm &comm, int color, int key)
Create a communicator object from an existing MPI_Comm object using color and key.
Definition: mpi.hpp:442
gko::experimental::mpi::communicator::i_scatter_v
request i_scatter_v(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int *send_counts, const int *displacements, RecvType *recv_buffer, const int recv_count, int root_rank) const
(Non-blocking) Scatter data from root rank to all ranks in the communicator with offsets.
Definition: mpi.hpp:1140
gko::experimental::mpi::communicator::synchronize
void synchronize() const
This function is used to synchronize the ranks in the communicator.
Definition: mpi.hpp:516
gko::experimental::mpi::communicator::all_gather
void all_gather(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count) const
Gather data onto all ranks from all ranks in the communicator.
Definition: mpi.hpp:984
gko::experimental::mpi::communicator::all_reduce
void all_reduce(std::shared_ptr< const Executor > exec, const ReduceType *send_buffer, ReduceType *recv_buffer, int count, MPI_Op operation) const
Reduce data from all calling processes on the same communicator; the result is available on every rank.
Definition: mpi.hpp:799
gko::experimental::mpi::window::get_window
MPI_Win get_window() const
Get the underlying window object of MPI_Win type.
Definition: mpi.hpp:1593
gko::experimental::mpi::window::fence
void fence(int assert=0) const
The active target synchronization using MPI_Win_fence for the window object.
Definition: mpi.hpp:1601
gko::experimental::mpi::communicator::broadcast
void broadcast(std::shared_ptr< const Executor > exec, BroadcastType *buffer, int count, int root_rank) const
Broadcast data from calling process to all ranks in the communicator.
Definition: mpi.hpp:643
gko::experimental::mpi::window::unlock_all
void unlock_all() const
Close the epoch on all ranks using MPI_Win_unlock_all for the window object.
Definition: mpi.hpp:1654
gko::experimental::mpi::communicator::all_reduce
void all_reduce(std::shared_ptr< const Executor > exec, ReduceType *recv_buffer, int count, MPI_Op operation) const
(In-place) Reduce data from all calling processes on the same communicator.
Definition: mpi.hpp:747
gko::experimental::mpi::communicator::i_all_to_all
request i_all_to_all(std::shared_ptr< const Executor > exec, RecvType *recv_buffer, const int recv_count) const
(In-place, Non-blocking) Communicate data from all ranks to all other ranks in place (MPI_Ialltoall).
Definition: mpi.hpp:1201
gko::experimental::mpi::contiguous_type::contiguous_type
contiguous_type(contiguous_type &&other) noexcept
Move constructor, leaves other with MPI_DATATYPE_NULL.
Definition: mpi.hpp:143
gko::experimental::mpi::request
The request class is a light, move-only wrapper around the MPI_Request handle.
Definition: mpi.hpp:327
gko::experimental::mpi::communicator::size
int size() const
Return the size of the communicator (number of ranks).
Definition: mpi.hpp:479
gko::experimental::mpi::status::status
status()
The default constructor.
Definition: mpi.hpp:291
gko::experimental::mpi::environment::~environment
~environment()
Call MPI_Finalize at the end of the scope of this class.
Definition: mpi.hpp:249
gko::experimental::mpi::communicator::i_scatter
request i_scatter(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count, int root_rank) const
(Non-blocking) Scatter data from root rank to all ranks in the communicator.
Definition: mpi.hpp:1074
gko
The Ginkgo namespace.
Definition: abstract_factory.hpp:20
gko::experimental::mpi::communicator::reduce
void reduce(std::shared_ptr< const Executor > exec, const ReduceType *send_buffer, ReduceType *recv_buffer, int count, MPI_Op operation, int root_rank) const
Reduce data into root from all calling processes on the same communicator.
Definition: mpi.hpp:694
gko::experimental::mpi::request::wait
status wait()
Allows a rank to wait on a particular request handle.
Definition: mpi.hpp:372
gko::experimental::mpi::contiguous_type::operator=
contiguous_type & operator=(contiguous_type &&other) noexcept
Move assignment, leaves other with MPI_DATATYPE_NULL.
Definition: mpi.hpp:155
gko::experimental::mpi::window::flush_all
void flush_all() const
Flush all the existing RDMA operations for the calling process for the window object.
Definition: mpi.hpp:1685
gko::experimental::mpi::communicator::all_to_all_v
void all_to_all_v(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int *send_counts, const int *send_offsets, RecvType *recv_buffer, const int *recv_counts, const int *recv_offsets) const
Communicate data from all ranks to all other ranks with offsets (MPI_Alltoallv).
Definition: mpi.hpp:1293
gko::experimental::mpi::communicator::i_gather_v
request i_gather_v(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int *recv_counts, const int *displacements, int root_rank) const
(Non-blocking) Gather data onto the root rank from all ranks in the communicator with offsets.
Definition: mpi.hpp:953
gko::experimental::mpi::contiguous_type::contiguous_type
contiguous_type(int count, MPI_Datatype old_type)
Constructs a wrapper for a contiguous MPI_Datatype.
Definition: mpi.hpp:117
gko::experimental::mpi::window::unlock
void unlock(int rank) const
Close the epoch using MPI_Win_unlock for the window object.
Definition: mpi.hpp:1634
gko::experimental::mpi::window::flush_all_local
void flush_all_local() const
Flush all the local existing RDMA operations on the calling rank for the window object.
Definition: mpi.hpp:1694
gko::experimental::mpi::is_gpu_aware
constexpr bool is_gpu_aware()
Return if GPU aware functionality is available.
Definition: mpi.hpp:42
gko::experimental::mpi::window::lock_all
void lock_all(int assert=0) const
Create the epoch on all ranks using MPI_Win_lock_all for the window object.
Definition: mpi.hpp:1645
gko::experimental::mpi::window::lock_type
lock_type
The lock type for passive target synchronization of the windows.
Definition: mpi.hpp:1524
gko::experimental::mpi::contiguous_type::get
MPI_Datatype get() const
Access the underlying MPI_Datatype.
Definition: mpi.hpp:178
gko::experimental::mpi::communicator::operator==
bool operator==(const communicator &rhs) const
Compare two communicator objects for equality.
Definition: mpi.hpp:500
gko::experimental::mpi::window::r_accumulate
request r_accumulate(std::shared_ptr< const Executor > exec, const PutType *origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const
(Non-blocking) Accumulate data into the target window.
Definition: mpi.hpp:1801
gko::experimental::mpi::communicator
A thin wrapper of MPI_Comm that supports most MPI calls.
Definition: mpi.hpp:416
gko::experimental::mpi::contiguous_type::~contiguous_type
~contiguous_type()
Destructs object by freeing wrapped MPI_Datatype.
Definition: mpi.hpp:166
gko::experimental::mpi::type_impl
A struct that is used to determine the MPI_Datatype of a specified type.
Definition: mpi.hpp:77
gko::experimental::mpi::communicator::communicator
communicator(const MPI_Comm &comm, bool force_host_buffer=false)
Non-owning constructor for an existing communicator of type MPI_Comm.
Definition: mpi.hpp:428
gko::experimental::mpi::communicator::all_to_all
void all_to_all(std::shared_ptr< const Executor > exec, RecvType *recv_buffer, const int recv_count) const
(In-place) Communicate data from all ranks to all other ranks in place (MPI_Alltoall).
Definition: mpi.hpp:1172
gko::experimental::mpi::communicator::i_all_reduce
request i_all_reduce(std::shared_ptr< const Executor > exec, const ReduceType *send_buffer, ReduceType *recv_buffer, int count, MPI_Op operation) const
(Non-blocking) Reduce data from all calling processes on the same communicator.
Definition: mpi.hpp:826
gko::experimental::mpi::request::request
request()
The default constructor.
Definition: mpi.hpp:333
gko::experimental::mpi::communicator::i_send
request i_send(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, const int destination_rank, const int send_tag) const
Send (Non-blocking, Immediate return) data from calling process to destination rank.
Definition: mpi.hpp:562
gko::experimental::mpi::communicator::scatter
void scatter(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count, int root_rank) const
Scatter data from root rank to all ranks in the communicator.
Definition: mpi.hpp:1043
gko::experimental::mpi::window::r_get_accumulate
request r_get_accumulate(std::shared_ptr< const Executor > exec, GetType *origin_buffer, const int origin_count, GetType *result_buffer, const int result_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const
(Non-blocking) Get Accumulate data (with handle) from the target window.
Definition: mpi.hpp:1908
gko::experimental::mpi::window::~window
~window()
The deleter which calls MPI_Win_free when the window leaves its scope.
Definition: mpi.hpp:1707
gko::experimental::mpi::map_rank_to_device_id
int map_rank_to_device_id(MPI_Comm comm, int num_devices)
Maps each MPI rank to a single device id in a round robin manner.
gko::experimental::mpi::window::window
window()
The default constructor.
Definition: mpi.hpp:1529
gko::experimental::mpi::get_walltime
double get_walltime()
Get the current wall time, as reported by MPI_Wtime.
Definition: mpi.hpp:1502
gko::experimental::mpi::status::get_count
int get_count(const T *data) const
Get the count of the number of elements received by the communication call.
Definition: mpi.hpp:311
gko::experimental::mpi::communicator::i_gather
request i_gather(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count, int root_rank) const
(Non-blocking) Gather data onto the root rank from all ranks in the communicator.
Definition: mpi.hpp:887
gko::experimental::mpi::communicator::rank
int rank() const
Return the rank of the calling process in the communicator.
Definition: mpi.hpp:486
gko::experimental::mpi::communicator::i_all_reduce
request i_all_reduce(std::shared_ptr< const Executor > exec, ReduceType *recv_buffer, int count, MPI_Op operation) const
(In-place, non-blocking) Reduce data from all calling processes on the same communicator.
Definition: mpi.hpp:772
gko::experimental::mpi::wait_all
std::vector< status > wait_all(std::vector< request > &req)
Allows a rank to wait on multiple request handles.
Definition: mpi.hpp:392
gko::half
A class providing basic support for half precision floating point types.
Definition: half.hpp:286
gko::experimental::mpi::communicator::gather_v
void gather_v(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int *recv_counts, const int *displacements, int root_rank) const
Gather data onto the root rank from all ranks in the communicator with offsets.
Definition: mpi.hpp:920
gko::experimental::mpi::communicator::scatter_v
void scatter_v(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int *send_counts, const int *displacements, RecvType *recv_buffer, const int recv_count, int root_rank) const
Scatter data from root rank to all ranks in the communicator with offsets.
Definition: mpi.hpp:1107
gko::experimental::mpi::thread_type
thread_type
This enum specifies the threading type to be used when creating an MPI environment.
Definition: mpi.hpp:189
gko::experimental::mpi::communicator::operator!=
bool operator!=(const communicator &rhs) const
Compare two communicator objects for non-equality.
Definition: mpi.hpp:510
gko::experimental::mpi::status::get
MPI_Status * get()
Get a pointer to the underlying MPI_Status object.
Definition: mpi.hpp:298
gko::experimental::mpi::communicator::gather
void gather(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count, int root_rank) const
Gather data onto the root rank from all ranks in the communicator.
Definition: mpi.hpp:855
gko::experimental::mpi::contiguous_type::operator=
contiguous_type & operator=(const contiguous_type &)=delete
Disallow copying of wrapper type.
gko::experimental::mpi::communicator::i_all_gather
request i_all_gather(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count) const
(Non-blocking) Gather data onto all ranks from all ranks in the communicator.
Definition: mpi.hpp:1014
gko::experimental::mpi::window::window
window(window &&other)
The move constructor.
Definition: mpi.hpp:1541
gko::experimental::mpi::status
The status struct is a light wrapper around the MPI_Status struct.
Definition: mpi.hpp:287
gko::experimental::mpi::window::flush_local
void flush_local(int rank) const
Flush the existing RDMA operations on the calling rank from the target rank for the window object.
Definition: mpi.hpp:1676
gko::experimental::mpi::request::get
MPI_Request * get()
Get a pointer to the underlying MPI_Request handle.
Definition: mpi.hpp:364
gko::experimental::mpi::communicator::i_all_to_all_v
request i_all_to_all_v(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int *send_counts, const int *send_offsets, RecvType *recv_buffer, const int *recv_counts, const int *recv_offsets) const
Communicate data from all ranks to all other ranks with offsets (MPI_Ialltoallv).
Definition: mpi.hpp:1386
gko::experimental::mpi::communicator::i_recv
request i_recv(std::shared_ptr< const Executor > exec, RecvType *recv_buffer, const int recv_count, const int source_rank, const int recv_tag) const
Receive (Non-blocking, Immediate return) data from source rank.
Definition: mpi.hpp:618
gko::experimental::mpi::window::sync
void sync() const
Synchronize the public and private buffers for the window object.
Definition: mpi.hpp:1702
gko::experimental::mpi::window::flush
void flush(int rank) const
Flush the existing RDMA operations on the target rank for the calling process for the window object.
Definition: mpi.hpp:1665
gko::experimental::mpi::communicator::node_local_rank
int node_local_rank() const
Return the node local rank of the calling process in the communicator.
Definition: mpi.hpp:493
gko::experimental::mpi::contiguous_type
A move-only wrapper for a contiguous MPI_Datatype.
Definition: mpi.hpp:109
gko::experimental::mpi::communicator::i_all_to_all_v
request i_all_to_all_v(std::shared_ptr< const Executor > exec, const void *send_buffer, const int *send_counts, const int *send_offsets, MPI_Datatype send_type, void *recv_buffer, const int *recv_counts, const int *recv_offsets, MPI_Datatype recv_type) const
Communicate data from all ranks to all other ranks with offsets (MPI_Ialltoallv).
Definition: mpi.hpp:1350
gko::experimental::mpi::communicator::all_to_all
void all_to_all(std::shared_ptr< const Executor > exec, const SendType *send_buffer, const int send_count, RecvType *recv_buffer, const int recv_count) const
Communicate data from all ranks to all other ranks (MPI_Alltoall).
Definition: mpi.hpp:1230
gko::experimental::mpi::window::r_get
request r_get(std::shared_ptr< const Executor > exec, GetType *origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count) const
Get data (with handle) from the target window.
Definition: mpi.hpp:1851
gko::experimental::mpi::communicator::recv
status recv(std::shared_ptr< const Executor > exec, RecvType *recv_buffer, const int recv_count, const int source_rank, const int recv_tag) const
Receive data from source rank.
Definition: mpi.hpp:590
gko::experimental::mpi::communicator::i_reduce
request i_reduce(std::shared_ptr< const Executor > exec, const ReduceType *send_buffer, ReduceType *recv_buffer, int count, MPI_Op operation, int root_rank) const
(Non-blocking) Reduce data into root from all calling processes on the same communicator.
Definition: mpi.hpp:721
gko::experimental::mpi::window::get
void get(std::shared_ptr< const Executor > exec, GetType *origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count) const
Get data from the target window.
Definition: mpi.hpp:1827
gko::experimental::mpi::window::operator=
window & operator=(window &&other)
The move assignment operator.
Definition: mpi.hpp:1550
gko::experimental::mpi::communicator::get
const MPI_Comm & get() const
Return the underlying MPI_Comm object.
Definition: mpi.hpp:470
gko::experimental::mpi::window::r_put
request r_put(std::shared_ptr< const Executor > exec, const PutType *origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count) const
Put data into the target window.
Definition: mpi.hpp:1749