Ginkgo 1.10.0
A numerical linear algebra library targeting many-core architectures
mpi.hpp
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_PUBLIC_CORE_BASE_MPI_HPP_
#define GKO_PUBLIC_CORE_BASE_MPI_HPP_


#include <complex>
#include <memory>
#include <type_traits>
#include <utility>
#include <vector>

#include <ginkgo/config.hpp>
#include <ginkgo/core/base/exception.hpp>
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/base/half.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/base/utils_helper.hpp>


#if GINKGO_BUILD_MPI


#include <mpi.h>


namespace gko {
namespace experimental {
namespace mpi {


/** Return if GPU-aware functionality is available. */
inline constexpr bool is_gpu_aware()
{
#if GINKGO_HAVE_GPU_AWARE_MPI
    return true;
#else
    return false;
#endif
}


/** Maps each MPI rank to a single device id in a round-robin manner. */
int map_rank_to_device_id(MPI_Comm comm, int num_devices);

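/*
 * Usage sketch (illustrative only, not part of this header): selecting a
 * device per rank in a CUDA build. The device count query is just an example;
 * any device enumeration works, and exec/master names are made up here.
 *
 * @code
 * int num_devices = 0;
 * cudaGetDeviceCount(&num_devices);  // hypothetical enumeration step
 * auto device_id = gko::experimental::mpi::map_rank_to_device_id(
 *     MPI_COMM_WORLD, num_devices);
 * auto exec = gko::CudaExecutor::create(device_id,
 *                                       gko::ReferenceExecutor::create());
 * @endcode
 */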

/**
 * Registers the input_type/mpi_type pair by specializing type_impl, so that
 * type_impl<input_type>::get_type() returns mpi_type.
 */
#define GKO_REGISTER_MPI_TYPE(input_type, mpi_type)         \
    template <>                                             \
    struct type_impl<input_type> {                          \
        static MPI_Datatype get_type() { return mpi_type; } \
    }


/** A struct that is used to determine the MPI_Datatype of a specified type. */
template <typename T>
struct type_impl {};


GKO_REGISTER_MPI_TYPE(char, MPI_CHAR);
GKO_REGISTER_MPI_TYPE(unsigned char, MPI_UNSIGNED_CHAR);
GKO_REGISTER_MPI_TYPE(unsigned, MPI_UNSIGNED);
GKO_REGISTER_MPI_TYPE(int, MPI_INT);
GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT);
GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG);
GKO_REGISTER_MPI_TYPE(long, MPI_LONG);
GKO_REGISTER_MPI_TYPE(long long, MPI_LONG_LONG_INT);
GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG);
GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT);
GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE);
GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE);
#if GINKGO_ENABLE_HALF
// OpenMPI 5.0 and MPICH v3.4a1 provide MPIX_C_FLOAT16 for half precision;
// only OpenMPI supports complex float16.
// TODO: use the native type when MPI is configured with half support
GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT);
GKO_REGISTER_MPI_TYPE(std::complex<half>, MPI_FLOAT);
#endif  // GINKGO_ENABLE_HALF
#if GINKGO_ENABLE_BFLOAT16
GKO_REGISTER_MPI_TYPE(bfloat16, MPI_UNSIGNED_SHORT);
GKO_REGISTER_MPI_TYPE(std::complex<bfloat16>, MPI_FLOAT);
#endif  // GINKGO_ENABLE_BFLOAT16
GKO_REGISTER_MPI_TYPE(std::complex<float>, MPI_C_FLOAT_COMPLEX);
GKO_REGISTER_MPI_TYPE(std::complex<double>, MPI_C_DOUBLE_COMPLEX);

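/*
 * Illustration (not part of this header): the specializations above let
 * generic code look up the MPI_Datatype that matches a C++ type, e.g.
 *
 * @code
 * MPI_Datatype t = gko::experimental::mpi::type_impl<double>::get_type();
 * // t == MPI_DOUBLE
 * @endcode
 */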

/** A move-only wrapper for a contiguous MPI_Datatype. */
class contiguous_type {
public:
    /** Constructs a wrapper for a contiguous MPI_Datatype. */
    contiguous_type(int count, MPI_Datatype old_type) : type_(MPI_DATATYPE_NULL)
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_contiguous(count, old_type, &type_));
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_commit(&type_));
    }

    /** Constructs an empty wrapper with MPI_DATATYPE_NULL. */
    contiguous_type() : type_(MPI_DATATYPE_NULL) {}

    /** Disallow copying of wrapper type. */
    contiguous_type(const contiguous_type&) = delete;

    /** Disallow copy-assignment of wrapper type. */
    contiguous_type& operator=(const contiguous_type&) = delete;

    /** Move constructor, leaves other with MPI_DATATYPE_NULL. */
    contiguous_type(contiguous_type&& other) noexcept : type_(MPI_DATATYPE_NULL)
    {
        *this = std::move(other);
    }

    /** Move assignment, leaves other with MPI_DATATYPE_NULL. */
    contiguous_type& operator=(contiguous_type&& other) noexcept
    {
        if (this != &other) {
            this->type_ = std::exchange(other.type_, MPI_DATATYPE_NULL);
        }
        return *this;
    }

    /** Destructs the object by freeing the wrapped MPI_Datatype. */
    ~contiguous_type()
    {
        if (type_ != MPI_DATATYPE_NULL) {
            MPI_Type_free(&type_);
        }
    }

    /** Access the underlying MPI_Datatype. */
    MPI_Datatype get() const { return type_; }

private:
    MPI_Datatype type_;
};

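/*
 * Usage sketch (illustrative only): sending three doubles per element as one
 * derived datatype. The buffer size, dest_rank, tag and comm are made up for
 * the example and assumed to be defined elsewhere.
 *
 * @code
 * gko::experimental::mpi::contiguous_type vec3(3, MPI_DOUBLE);
 * double buffer[12];  // four elements of three doubles each
 * MPI_Send(buffer, 4, vec3.get(), dest_rank, tag, comm);
 * // vec3's destructor calls MPI_Type_free automatically
 * @endcode
 */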

/**
 * This enum specifies the threading type to be used when creating an MPI
 * environment.
 */
enum class thread_type {
    serialized = MPI_THREAD_SERIALIZED,
    funneled = MPI_THREAD_FUNNELED,
    single = MPI_THREAD_SINGLE,
    multiple = MPI_THREAD_MULTIPLE
};


/** Class that sets up and finalizes the MPI environment. */
class environment {
public:
    /** Return whether MPI_Finalize has already been called. */
    static bool is_finalized()
    {
        int flag = 0;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Finalized(&flag));
        return flag;
    }

    /** Return whether MPI_Init has already been called. */
    static bool is_initialized()
    {
        int flag = 0;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Initialized(&flag));
        return flag;
    }

    /** Return the provided thread support. */
    int get_provided_thread_support() const { return provided_thread_support_; }

    /** Call MPI_Init_thread and initialize the MPI environment. */
    environment(int& argc, char**& argv,
                const thread_type thread_t = thread_type::serialized)
    {
        this->required_thread_support_ = static_cast<int>(thread_t);
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Init_thread(&argc, &argv, this->required_thread_support_,
                            &(this->provided_thread_support_)));
    }

    /** Call MPI_Finalize at the end of the scope of this class. */
    ~environment() { MPI_Finalize(); }

    environment(const environment&) = delete;
    environment(environment&&) = delete;
    environment& operator=(const environment&) = delete;
    environment& operator=(environment&&) = delete;

private:
    int required_thread_support_;
    int provided_thread_support_;
};

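/*
 * Usage sketch (illustrative only): RAII initialization of MPI in main().
 * MPI_Finalize runs automatically when env leaves scope.
 *
 * @code
 * int main(int argc, char* argv[])
 * {
 *     gko::experimental::mpi::environment env(argc, argv);
 *     // ... use MPI via gko::experimental::mpi::communicator ...
 * }  // env's destructor calls MPI_Finalize here
 * @endcode
 */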

namespace {


/** Deleter that frees and deletes an owned MPI_Comm. */
class comm_deleter {
public:
    using pointer = MPI_Comm*;
    void operator()(pointer comm) const
    {
        GKO_ASSERT(*comm != MPI_COMM_NULL);
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_free(comm));
        delete comm;
    }
};


}  // namespace


/** The status struct is a light wrapper around the MPI_Status struct. */
struct status {
    /** The default constructor. */
    status() : status_(MPI_Status{}) {}

    /** Get a pointer to the underlying MPI_Status object. */
    MPI_Status* get() { return &this->status_; }

    /**
     * Get the count of the number of elements received by the communication
     * call. The data argument is only used to deduce the element type.
     */
    template <typename T>
    int get_count(const T* data) const
    {
        int count;
        MPI_Get_count(&status_, type_impl<T>::get_type(), &count);
        return count;
    }

private:
    MPI_Status status_;
};


/** The request class is a light, move-only wrapper around MPI_Request. */
class request {
public:
    /** The default constructor, initializing to MPI_REQUEST_NULL. */
    request() : req_(MPI_REQUEST_NULL) {}

    request(const request&) = delete;

    request& operator=(const request&) = delete;

    request(request&& o) noexcept { *this = std::move(o); }

    request& operator=(request&& o) noexcept
    {
        if (this != &o) {
            this->req_ = std::exchange(o.req_, MPI_REQUEST_NULL);
        }
        return *this;
    }

    ~request()
    {
        if (req_ != MPI_REQUEST_NULL) {
            if (MPI_Request_free(&req_) != MPI_SUCCESS) {
                std::terminate();  // since we can't throw in destructors, we
                                   // have to terminate the program
            }
        }
    }

    /** Get a pointer to the underlying MPI_Request handle. */
    MPI_Request* get() { return &this->req_; }

    /** Allows a rank to wait on a particular request handle. */
    status wait()
    {
        status status;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Wait(&req_, status.get()));
        return status;
    }

private:
    MPI_Request req_;
};


/** Allows a rank to wait on multiple request handles. */
inline std::vector<status> wait_all(std::vector<request>& req)
{
    std::vector<status> stat;
    for (std::size_t i = 0; i < req.size(); ++i) {
        stat.emplace_back(req[i].wait());
    }
    return stat;
}

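/*
 * Usage sketch (illustrative only): overlapping a non-blocking send and
 * receive, then waiting on both. comm, exec, the buffers, ranks and tag are
 * assumed to be defined elsewhere.
 *
 * @code
 * std::vector<gko::experimental::mpi::request> reqs;
 * reqs.push_back(comm.i_send(exec, send_data, count, dest, tag));
 * reqs.push_back(comm.i_recv(exec, recv_data, count, src, tag));
 * auto statuses = gko::experimental::mpi::wait_all(reqs);
 * @endcode
 */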

/** A thin wrapper of MPI_Comm that supports most MPI calls. */
class communicator {
public:
    /** Non-owning constructor for an existing communicator of type MPI_Comm. */
    communicator(const MPI_Comm& comm, bool force_host_buffer = false)
        : comm_(), force_host_buffer_(force_host_buffer)
    {
        this->comm_.reset(new MPI_Comm(comm));
    }

    /**
     * Create a communicator object from an existing MPI_Comm object using
     * color and key.
     */
    communicator(const MPI_Comm& comm, int color, int key)
    {
        MPI_Comm comm_out;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_split(comm, color, key, &comm_out));
        this->comm_.reset(new MPI_Comm(comm_out), comm_deleter{});
    }

    /**
     * Create a communicator object from an existing communicator using color
     * and key.
     */
    communicator(const communicator& comm, int color, int key)
    {
        MPI_Comm comm_out;
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Comm_split(comm.get(), color, key, &comm_out));
        this->comm_.reset(new MPI_Comm(comm_out), comm_deleter{});
    }

    /** Creates a new communicator and takes ownership of the MPI_Comm. */
    static communicator create_owning(const MPI_Comm& comm,
                                      bool force_host_buffer = false)
    {
        communicator comm_out(MPI_COMM_NULL, force_host_buffer);
        comm_out.comm_.reset(new MPI_Comm(comm), comm_deleter{});
        return comm_out;
    }

    /** Copy constructor. */
    communicator(const communicator& other) = default;

    /** Move constructor. */
    communicator(communicator&& other) { *this = std::move(other); }

    /** Copy assignment operator. */
    communicator& operator=(const communicator& other) = default;

    /** Move assignment operator, leaves other with MPI_COMM_NULL. */
    communicator& operator=(communicator&& other)
    {
        if (this != &other) {
            comm_ = std::exchange(other.comm_,
                                  std::make_shared<MPI_Comm>(MPI_COMM_NULL));
            force_host_buffer_ = other.force_host_buffer_;
        }
        return *this;
    }

    /** Return the underlying MPI_Comm object. */
    const MPI_Comm& get() const { return *(this->comm_.get()); }

    bool force_host_buffer() const { return force_host_buffer_; }

    /** Return the size of the communicator (number of ranks). */
    int size() const { return get_num_ranks(); }

    /** Return the rank of the calling process in the communicator. */
    int rank() const { return get_my_rank(); }

    /** Return the node-local rank of the calling process in the communicator. */
    int node_local_rank() const { return get_node_local_rank(); }

    /** Compare two communicator objects for equality. */
    bool operator==(const communicator& rhs) const { return is_identical(rhs); }

    /** Compare two communicator objects for non-equality. */
    bool operator!=(const communicator& rhs) const { return !(*this == rhs); }

    /** Checks if the rhs communicator is identical to this communicator. */
    bool is_identical(const communicator& rhs) const
    {
        if (get() == MPI_COMM_NULL || rhs.get() == MPI_COMM_NULL) {
            return get() == rhs.get();
        }
        int flag;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_compare(get(), rhs.get(), &flag));
        return flag == MPI_IDENT;
    }

    /** Checks if the rhs communicator is congruent to this communicator. */
    bool is_congruent(const communicator& rhs) const
    {
        if (get() == MPI_COMM_NULL || rhs.get() == MPI_COMM_NULL) {
            return get() == rhs.get();
        }
        int flag;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_compare(get(), rhs.get(), &flag));
        return flag == MPI_CONGRUENT;
    }

    /** Synchronize the ranks in the communicator (MPI_Barrier). */
    void synchronize() const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Barrier(this->get()));
    }

    /** Send (blocking) data from the calling process to the destination rank. */
    template <typename SendType>
    void send(std::shared_ptr<const Executor> exec, const SendType* send_buffer,
              const int send_count, const int destination_rank,
              const int send_tag) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Send(send_buffer, send_count, type_impl<SendType>::get_type(),
                     destination_rank, send_tag, this->get()));
    }

    /**
     * Send (non-blocking, immediate return) data from the calling process to
     * the destination rank.
     */
    template <typename SendType>
    request i_send(std::shared_ptr<const Executor> exec,
                   const SendType* send_buffer, const int send_count,
                   const int destination_rank, const int send_tag) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Isend(send_buffer, send_count, type_impl<SendType>::get_type(),
                      destination_rank, send_tag, this->get(), req.get()));
        return req;
    }

    /** Receive (blocking) data from the source rank. */
    template <typename RecvType>
    status recv(std::shared_ptr<const Executor> exec, RecvType* recv_buffer,
                const int recv_count, const int source_rank,
                const int recv_tag) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        status st;
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Recv(recv_buffer, recv_count, type_impl<RecvType>::get_type(),
                     source_rank, recv_tag, this->get(), st.get()));
        return st;
    }

    /** Receive (non-blocking, immediate return) data from the source rank. */
    template <typename RecvType>
    request i_recv(std::shared_ptr<const Executor> exec, RecvType* recv_buffer,
                   const int recv_count, const int source_rank,
                   const int recv_tag) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Irecv(recv_buffer, recv_count, type_impl<RecvType>::get_type(),
                      source_rank, recv_tag, this->get(), req.get()));
        return req;
    }

    /**
     * Broadcast data from the calling process to all ranks in the
     * communicator.
     */
    template <typename BroadcastType>
    void broadcast(std::shared_ptr<const Executor> exec, BroadcastType* buffer,
                   int count, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Bcast(buffer, count, type_impl<BroadcastType>::get_type(),
                      root_rank, this->get()));
    }

    /**
     * (Non-blocking) Broadcast data from the calling process to all ranks in
     * the communicator.
     */
    template <typename BroadcastType>
    request i_broadcast(std::shared_ptr<const Executor> exec,
                        BroadcastType* buffer, int count, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Ibcast(buffer, count, type_impl<BroadcastType>::get_type(),
                       root_rank, this->get(), req.get()));
        return req;
    }

    /**
     * Reduce data into root from all calling processes on the same
     * communicator.
     */
    template <typename ReduceType>
    void reduce(std::shared_ptr<const Executor> exec,
                const ReduceType* send_buffer, ReduceType* recv_buffer,
                int count, MPI_Op operation, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Reduce(send_buffer, recv_buffer, count,
                                            type_impl<ReduceType>::get_type(),
                                            operation, root_rank, this->get()));
    }

    /**
     * (Non-blocking) Reduce data into root from all calling processes on the
     * same communicator.
     */
    template <typename ReduceType>
    request i_reduce(std::shared_ptr<const Executor> exec,
                     const ReduceType* send_buffer, ReduceType* recv_buffer,
                     int count, MPI_Op operation, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Ireduce(
            send_buffer, recv_buffer, count, type_impl<ReduceType>::get_type(),
            operation, root_rank, this->get(), req.get()));
        return req;
    }

    /**
     * (In-place) Reduce data from all calling processes on the same
     * communicator.
     */
    template <typename ReduceType>
    void all_reduce(std::shared_ptr<const Executor> exec,
                    ReduceType* recv_buffer, int count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce(
            MPI_IN_PLACE, recv_buffer, count, type_impl<ReduceType>::get_type(),
            operation, this->get()));
    }

    /**
     * (In-place, non-blocking) Reduce data from all calling processes on the
     * same communicator.
     */
    template <typename ReduceType>
    request i_all_reduce(std::shared_ptr<const Executor> exec,
                         ReduceType* recv_buffer, int count,
                         MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce(
            MPI_IN_PLACE, recv_buffer, count, type_impl<ReduceType>::get_type(),
            operation, this->get(), req.get()));
        return req;
    }

    /** Reduce data from all calling processes on the same communicator. */
    template <typename ReduceType>
    void all_reduce(std::shared_ptr<const Executor> exec,
                    const ReduceType* send_buffer, ReduceType* recv_buffer,
                    int count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce(
            send_buffer, recv_buffer, count, type_impl<ReduceType>::get_type(),
            operation, this->get()));
    }

    /**
     * (Non-blocking) Reduce data from all calling processes on the same
     * communicator.
     */
    template <typename ReduceType>
    request i_all_reduce(std::shared_ptr<const Executor> exec,
                         const ReduceType* send_buffer, ReduceType* recv_buffer,
                         int count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce(
            send_buffer, recv_buffer, count, type_impl<ReduceType>::get_type(),
            operation, this->get(), req.get()));
        return req;
    }

    /** Gather data onto the root rank from all ranks in the communicator. */
    template <typename SendType, typename RecvType>
    void gather(std::shared_ptr<const Executor> exec,
                const SendType* send_buffer, const int send_count,
                RecvType* recv_buffer, const int recv_count,
                int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Gather(send_buffer, send_count, type_impl<SendType>::get_type(),
                       recv_buffer, recv_count, type_impl<RecvType>::get_type(),
                       root_rank, this->get()));
    }

    /**
     * (Non-blocking) Gather data onto the root rank from all ranks in the
     * communicator.
     */
    template <typename SendType, typename RecvType>
    request i_gather(std::shared_ptr<const Executor> exec,
                     const SendType* send_buffer, const int send_count,
                     RecvType* recv_buffer, const int recv_count,
                     int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Igather(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(), root_rank,
            this->get(), req.get()));
        return req;
    }

    /**
     * Gather data onto the root rank from all ranks in the communicator with
     * offsets.
     */
    template <typename SendType, typename RecvType>
    void gather_v(std::shared_ptr<const Executor> exec,
                  const SendType* send_buffer, const int send_count,
                  RecvType* recv_buffer, const int* recv_counts,
                  const int* displacements, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Gatherv(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_counts, displacements,
            type_impl<RecvType>::get_type(), root_rank, this->get()));
    }

    /**
     * (Non-blocking) Gather data onto the root rank from all ranks in the
     * communicator with offsets.
     */
    template <typename SendType, typename RecvType>
    request i_gather_v(std::shared_ptr<const Executor> exec,
                       const SendType* send_buffer, const int send_count,
                       RecvType* recv_buffer, const int* recv_counts,
                       const int* displacements, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Igatherv(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_counts, displacements,
            type_impl<RecvType>::get_type(), root_rank, this->get(),
            req.get()));
        return req;
    }

    /** Gather data onto all ranks from all ranks in the communicator. */
    template <typename SendType, typename RecvType>
    void all_gather(std::shared_ptr<const Executor> exec,
                    const SendType* send_buffer, const int send_count,
                    RecvType* recv_buffer, const int recv_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Allgather(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(),
            this->get()));
    }

    /**
     * (Non-blocking) Gather data onto all ranks from all ranks in the
     * communicator.
     */
    template <typename SendType, typename RecvType>
    request i_all_gather(std::shared_ptr<const Executor> exec,
                         const SendType* send_buffer, const int send_count,
                         RecvType* recv_buffer, const int recv_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallgather(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(),
            this->get(), req.get()));
        return req;
    }

    /** Scatter data from the root rank to all ranks in the communicator. */
    template <typename SendType, typename RecvType>
    void scatter(std::shared_ptr<const Executor> exec,
                 const SendType* send_buffer, const int send_count,
                 RecvType* recv_buffer, const int recv_count,
                 int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatter(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(), root_rank,
            this->get()));
    }

    /**
     * (Non-blocking) Scatter data from the root rank to all ranks in the
     * communicator.
     */
    template <typename SendType, typename RecvType>
    request i_scatter(std::shared_ptr<const Executor> exec,
                      const SendType* send_buffer, const int send_count,
                      RecvType* recv_buffer, const int recv_count,
                      int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscatter(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(), root_rank,
            this->get(), req.get()));
        return req;
    }

    /**
     * Scatter data from the root rank to all ranks in the communicator with
     * offsets.
     */
    template <typename SendType, typename RecvType>
    void scatter_v(std::shared_ptr<const Executor> exec,
                   const SendType* send_buffer, const int* send_counts,
                   const int* displacements, RecvType* recv_buffer,
                   const int recv_count, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatterv(
            send_buffer, send_counts, displacements,
            type_impl<SendType>::get_type(), recv_buffer, recv_count,
            type_impl<RecvType>::get_type(), root_rank, this->get()));
    }

    /**
     * (Non-blocking) Scatter data from the root rank to all ranks in the
     * communicator with offsets.
     */
    template <typename SendType, typename RecvType>
    request i_scatter_v(std::shared_ptr<const Executor> exec,
                        const SendType* send_buffer, const int* send_counts,
                        const int* displacements, RecvType* recv_buffer,
                        const int recv_count, int root_rank) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Iscatterv(send_buffer, send_counts, displacements,
                          type_impl<SendType>::get_type(), recv_buffer,
                          recv_count, type_impl<RecvType>::get_type(),
                          root_rank, this->get(), req.get()));
        return req;
    }

    /**
     * (In-place) Communicate data from all ranks to all other ranks in place
     * (MPI_Alltoall).
     */
    template <typename RecvType>
    void all_to_all(std::shared_ptr<const Executor> exec, RecvType* recv_buffer,
                    const int recv_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall(
            MPI_IN_PLACE, recv_count, type_impl<RecvType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(),
            this->get()));
    }

    /**
     * (In-place, non-blocking) Communicate data from all ranks to all other
     * ranks in place (MPI_Ialltoall).
     */
    template <typename RecvType>
    request i_all_to_all(std::shared_ptr<const Executor> exec,
                         RecvType* recv_buffer, const int recv_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall(
            MPI_IN_PLACE, recv_count, type_impl<RecvType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(),
            this->get(), req.get()));
        return req;
    }

    /** Communicate data from all ranks to all other ranks (MPI_Alltoall). */
    template <typename SendType, typename RecvType>
    void all_to_all(std::shared_ptr<const Executor> exec,
                    const SendType* send_buffer, const int send_count,
                    RecvType* recv_buffer, const int recv_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(),
            this->get()));
    }

    /**
     * (Non-blocking) Communicate data from all ranks to all other ranks
     * (MPI_Ialltoall).
     */
    template <typename SendType, typename RecvType>
    request i_all_to_all(std::shared_ptr<const Executor> exec,
                         const SendType* send_buffer, const int send_count,
                         RecvType* recv_buffer, const int recv_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall(
            send_buffer, send_count, type_impl<SendType>::get_type(),
            recv_buffer, recv_count, type_impl<RecvType>::get_type(),
            this->get(), req.get()));
        return req;
    }

    /**
     * Communicate data from all ranks to all other ranks with offsets
     * (MPI_Alltoallv).
     */
    template <typename SendType, typename RecvType>
    void all_to_all_v(std::shared_ptr<const Executor> exec,
                      const SendType* send_buffer, const int* send_counts,
                      const int* send_offsets, RecvType* recv_buffer,
                      const int* recv_counts, const int* recv_offsets) const
    {
        this->all_to_all_v(std::move(exec), send_buffer, send_counts,
                           send_offsets, type_impl<SendType>::get_type(),
                           recv_buffer, recv_counts, recv_offsets,
                           type_impl<RecvType>::get_type());
    }

    /**
     * Communicate data from all ranks to all other ranks with offsets
     * (MPI_Alltoallv).
     */
    void all_to_all_v(std::shared_ptr<const Executor> exec,
                      const void* send_buffer, const int* send_counts,
                      const int* send_offsets, MPI_Datatype send_type,
                      void* recv_buffer, const int* recv_counts,
                      const int* recv_offsets, MPI_Datatype recv_type) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoallv(
            send_buffer, send_counts, send_offsets, send_type, recv_buffer,
            recv_counts, recv_offsets, recv_type, this->get()));
    }

    /**
     * (Non-blocking) Communicate data from all ranks to all other ranks with
     * offsets (MPI_Ialltoallv).
     */
    request i_all_to_all_v(std::shared_ptr<const Executor> exec,
                           const void* send_buffer, const int* send_counts,
                           const int* send_offsets, MPI_Datatype send_type,
                           void* recv_buffer, const int* recv_counts,
                           const int* recv_offsets,
                           MPI_Datatype recv_type) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoallv(
            send_buffer, send_counts, send_offsets, send_type, recv_buffer,
            recv_counts, recv_offsets, recv_type, this->get(), req.get()));
        return req;
    }

    /**
     * (Non-blocking) Communicate data from all ranks to all other ranks with
     * offsets (MPI_Ialltoallv).
     */
    template <typename SendType, typename RecvType>
    request i_all_to_all_v(std::shared_ptr<const Executor> exec,
                           const SendType* send_buffer, const int* send_counts,
                           const int* send_offsets, RecvType* recv_buffer,
                           const int* recv_counts,
                           const int* recv_offsets) const
    {
        return this->i_all_to_all_v(
            std::move(exec), send_buffer, send_counts, send_offsets,
            type_impl<SendType>::get_type(), recv_buffer, recv_counts,
            recv_offsets, type_impl<RecvType>::get_type());
    }

    /** Does a scan operation with the given operator (MPI_Scan). */
    template <typename ScanType>
    void scan(std::shared_ptr<const Executor> exec, const ScanType* send_buffer,
              ScanType* recv_buffer, int count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Scan(send_buffer, recv_buffer, count,
                                          type_impl<ScanType>::get_type(),
                                          operation, this->get()));
    }

    /**
     * (Non-blocking) Does a scan operation with the given operator
     * (MPI_Iscan).
     */
    template <typename ScanType>
    request i_scan(std::shared_ptr<const Executor> exec,
                   const ScanType* send_buffer, ScanType* recv_buffer,
                   int count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscan(send_buffer, recv_buffer, count,
                                           type_impl<ScanType>::get_type(),
                                           operation, this->get(), req.get()));
        return req;
    }

private:
    std::shared_ptr<MPI_Comm> comm_;
    bool force_host_buffer_;

    int get_my_rank() const
    {
        int my_rank = 0;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_rank(get(), &my_rank));
        return my_rank;
    }

    int get_node_local_rank() const
    {
        MPI_Comm local_comm;
        int rank;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_split_type(
            this->get(), MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm));
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_rank(local_comm, &rank));
        MPI_Comm_free(&local_comm);
        return rank;
    }

    int get_num_ranks() const
    {
        int size = 1;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_size(this->get(), &size));
        return size;
    }
};

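/*
 * Usage sketch (illustrative only): a global dot-product style reduction.
 * exec is assumed to be an Executor created elsewhere, and compute_local_part
 * is a hypothetical helper.
 *
 * @code
 * gko::experimental::mpi::communicator comm(MPI_COMM_WORLD);
 * double local_result = compute_local_part();        // hypothetical helper
 * comm.all_reduce(exec, &local_result, 1, MPI_SUM);   // in-place MPI_Allreduce
 * if (comm.rank() == 0) {
 *     std::cout << "global result: " << local_result << "\n";
 * }
 * @endcode
 */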

/**
 * Checks if the combination of Executor and communicator requires passing
 * MPI buffers from host memory.
 */
bool requires_host_buffer(const std::shared_ptr<const Executor>& exec,
                          const communicator& comm);


/** Get the current wall time via MPI_Wtime. */
inline double get_walltime() { return MPI_Wtime(); }

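/*
 * Usage sketch (illustrative only): timing a communication region with the
 * MPI wall clock; comm is assumed to be a communicator defined elsewhere.
 *
 * @code
 * auto start = gko::experimental::mpi::get_walltime();
 * comm.synchronize();
 * auto elapsed = gko::experimental::mpi::get_walltime() - start;
 * @endcode
 */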

/** This class wraps MPI_Win with RAII functionality. */
template <typename ValueType>
class window {
public:
    /** The create type for the window object. */
    enum class create_type { allocate = 1, create = 2, dynamic_create = 3 };

    /** The lock type for passive target synchronization of the windows. */
    enum class lock_type { shared = 1, exclusive = 2 };

    /** The default constructor, creating an MPI_WIN_NULL window. */
    window() : window_(MPI_WIN_NULL) {}

    window(const window& other) = delete;

    window& operator=(const window& other) = delete;

    /** The move constructor, leaves other with MPI_WIN_NULL. */
    window(window&& other) : window_{std::exchange(other.window_, MPI_WIN_NULL)}
    {}

    /** The move assignment operator, leaves other with MPI_WIN_NULL. */
    window& operator=(window&& other)
    {
        window_ = std::exchange(other.window_, MPI_WIN_NULL);
        return *this;
    }

    /** Create a window object with a given data pointer and type. */
    window(std::shared_ptr<const Executor> exec, ValueType* base, int num_elems,
           const communicator& comm, const int disp_unit = sizeof(ValueType),
           MPI_Info input_info = MPI_INFO_NULL,
           create_type c_type = create_type::create)
    {
        auto guard = exec->get_scoped_device_id_guard();
        unsigned size = num_elems * sizeof(ValueType);
        if (c_type == create_type::create) {
            GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_create(
                base, size, disp_unit, input_info, comm.get(), &this->window_));
        } else if (c_type == create_type::dynamic_create) {
            GKO_ASSERT_NO_MPI_ERRORS(
                MPI_Win_create_dynamic(input_info, comm.get(), &this->window_));
        } else if (c_type == create_type::allocate) {
            GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_allocate(
                size, disp_unit, input_info, comm.get(), base, &this->window_));
        } else {
            GKO_NOT_IMPLEMENTED;
        }
    }

    /** Get the underlying window object of MPI_Win type. */
    MPI_Win get_window() const { return this->window_; }

    /** Active target synchronization using MPI_Win_fence for the window. */
    void fence(int assert = 0) const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_fence(assert, this->window_));
    }

    /** Create an epoch using MPI_Win_lock for the window object. */
    void lock(int rank, lock_type lock_t = lock_type::shared,
              int assert = 0) const
    {
        if (lock_t == lock_type::shared) {
            GKO_ASSERT_NO_MPI_ERRORS(
                MPI_Win_lock(MPI_LOCK_SHARED, rank, assert, this->window_));
        } else if (lock_t == lock_type::exclusive) {
            GKO_ASSERT_NO_MPI_ERRORS(
                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, assert, this->window_));
        } else {
            GKO_NOT_IMPLEMENTED;
        }
    }

    /** Close the epoch using MPI_Win_unlock for the window object. */
    void unlock(int rank) const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_unlock(rank, this->window_));
    }

    /** Create the epoch on all ranks using MPI_Win_lock_all for the window. */
    void lock_all(int assert = 0) const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_lock_all(assert, this->window_));
    }

    /** Close the epoch on all ranks using MPI_Win_unlock_all for the window. */
    void unlock_all() const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_unlock_all(this->window_));
    }

    /** Flush the existing RDMA operations on the target rank for the calling
     * process. */
    void flush(int rank) const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush(rank, this->window_));
    }

    /** Flush the existing RDMA operations on the calling rank from the target
     * rank. */
    void flush_local(int rank) const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush_local(rank, this->window_));
    }

    /** Flush all the existing RDMA operations for the calling process. */
    void flush_all() const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush_all(this->window_));
    }

    /** Flush all the local existing RDMA operations on the calling rank. */
    void flush_all_local() const
    {
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_flush_local_all(this->window_));
    }

    /** Synchronize the public and private buffers for the window object. */
    void sync() const { GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_sync(this->window_)); }

    /** The destructor, which calls MPI_Win_free when the window leaves scope. */
    ~window()
    {
        if (this->window_ && this->window_ != MPI_WIN_NULL) {
            MPI_Win_free(&this->window_);
        }
    }

    /** Put data into the target window. */
    template <typename PutType>
    void put(std::shared_ptr<const Executor> exec, const PutType* origin_buffer,
             const int origin_count, const int target_rank,
             const unsigned int target_disp, const int target_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Put(origin_buffer, origin_count, type_impl<PutType>::get_type(),
                    target_rank, target_disp, target_count,
                    type_impl<PutType>::get_type(), this->get_window()));
    }

    /** (Non-blocking) Put data (with handle) into the target window. */
    template <typename PutType>
    request r_put(std::shared_ptr<const Executor> exec,
                  const PutType* origin_buffer, const int origin_count,
                  const int target_rank, const unsigned int target_disp,
                  const int target_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Rput(
            origin_buffer, origin_count, type_impl<PutType>::get_type(),
            target_rank, target_disp, target_count,
            type_impl<PutType>::get_type(), this->get_window(), req.get()));
        return req;
    }

    /** Accumulate data into the target window. */
    template <typename PutType>
    void accumulate(std::shared_ptr<const Executor> exec,
                    const PutType* origin_buffer, const int origin_count,
                    const int target_rank, const unsigned int target_disp,
                    const int target_count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Accumulate(
            origin_buffer, origin_count, type_impl<PutType>::get_type(),
            target_rank, target_disp, target_count,
            type_impl<PutType>::get_type(), operation, this->get_window()));
    }

    /** (Non-blocking) Accumulate data (with handle) into the target window. */
    template <typename PutType>
    request r_accumulate(std::shared_ptr<const Executor> exec,
                         const PutType* origin_buffer, const int origin_count,
                         const int target_rank, const unsigned int target_disp,
                         const int target_count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Raccumulate(
            origin_buffer, origin_count, type_impl<PutType>::get_type(),
            target_rank, target_disp, target_count,
            type_impl<PutType>::get_type(), operation, this->get_window(),
            req.get()));
        return req;
    }

    /** Get data from the target window. */
    template <typename GetType>
    void get(std::shared_ptr<const Executor> exec, GetType* origin_buffer,
             const int origin_count, const int target_rank,
             const unsigned int target_disp, const int target_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(
            MPI_Get(origin_buffer, origin_count, type_impl<GetType>::get_type(),
                    target_rank, target_disp, target_count,
                    type_impl<GetType>::get_type(), this->get_window()));
    }

    /** (Non-blocking) Get data (with handle) from the target window. */
    template <typename GetType>
    request r_get(std::shared_ptr<const Executor> exec, GetType* origin_buffer,
                  const int origin_count, const int target_rank,
                  const unsigned int target_disp, const int target_count) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget(
            origin_buffer, origin_count, type_impl<GetType>::get_type(),
            target_rank, target_disp, target_count,
            type_impl<GetType>::get_type(), this->get_window(), req.get()));
        return req;
    }

    /** Get and accumulate data from the target window. */
    template <typename GetType>
    void get_accumulate(std::shared_ptr<const Executor> exec,
                        GetType* origin_buffer, const int origin_count,
                        GetType* result_buffer, const int result_count,
                        const int target_rank, const unsigned int target_disp,
                        const int target_count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Get_accumulate(
            origin_buffer, origin_count, type_impl<GetType>::get_type(),
            result_buffer, result_count, type_impl<GetType>::get_type(),
            target_rank, target_disp, target_count,
            type_impl<GetType>::get_type(), operation, this->get_window()));
    }

    /** (Non-blocking) Get and accumulate data (with handle) from the target
     * window. */
    template <typename GetType>
    request r_get_accumulate(std::shared_ptr<const Executor> exec,
                             GetType* origin_buffer, const int origin_count,
                             GetType* result_buffer, const int result_count,
                             const int target_rank,
                             const unsigned int target_disp,
                             const int target_count, MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        request req;
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget_accumulate(
            origin_buffer, origin_count, type_impl<GetType>::get_type(),
            result_buffer, result_count, type_impl<GetType>::get_type(),
            target_rank, target_disp, target_count,
            type_impl<GetType>::get_type(), operation, this->get_window(),
            req.get()));
        return req;
    }

    /**
     * Fetch and operate on data from the target window (an optimized version
     * of get_accumulate).
     */
    template <typename GetType>
    void fetch_and_op(std::shared_ptr<const Executor> exec,
                      GetType* origin_buffer, GetType* result_buffer,
                      const int target_rank, const unsigned int target_disp,
                      MPI_Op operation) const
    {
        auto guard = exec->get_scoped_device_id_guard();
        GKO_ASSERT_NO_MPI_ERRORS(MPI_Fetch_and_op(
            origin_buffer, result_buffer, type_impl<GetType>::get_type(),
            target_rank, target_disp, operation, this->get_window()));
    }

private:
    MPI_Win window_;
};

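/*
 * Usage sketch (illustrative only): exposing a local buffer for one-sided
 * access and writing into rank 1's window between two fences. The sizes and
 * ranks are made up; comm and exec are assumed to be defined elsewhere.
 *
 * @code
 * std::vector<double> local(100, 0.0);
 * gko::experimental::mpi::window<double> win(
 *     exec, local.data(), static_cast<int>(local.size()), comm);
 * win.fence();                                    // open access epoch
 * if (comm.rank() == 0) {
 *     win.put(exec, local.data(), 100, 1, 0, 100);  // write into rank 1
 * }
 * win.fence();                                    // close access epoch
 * @endcode
 */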
}  // namespace mpi
}  // namespace experimental
}  // namespace gko


#endif  // GINKGO_BUILD_MPI


#endif  // GKO_PUBLIC_CORE_BASE_MPI_HPP_