The Kokkos assembly example..
This example depends on simple-solver, poisson-solver, three-pt-stencil-solver, .
Introduction
This example solves a 1D Poisson equation:
using a finite difference method on an equidistant grid with K
discretization points (K
can be controlled with a command line parameter).
The resulting CSR matrix is assembled using Kokkos kernels. This example show how to use Ginkgo data with Kokkos kernels.
Notes
If this example is built as part of Ginkgo, it is advised to configure Ginkgo with -DGINKGO_WITH_CCACHE=OFF
to prevent incompabilities with Kokkos' compiler wrapper for nvcc
.
The commented program
#include <iostream>
#include <map>
#include <string>
#include <omp.h>
#include <Kokkos_Core.hpp>
#include <ginkgo/ginkgo.hpp>
Creates a stencil matrix in CSR format for the given number of discretization points.
template <typename ValueType, typename IndexType>
{
const auto discretization_points = matrix->
get_size()[0];
Over-allocate storage for the matrix elements. Each row has 3 entries, except for the first and last one. To handle each row uniformly, we allocate space for 3x the number of rows.
discretization_points * 3);
Create Kokkos views on Ginkgo data.
Kokkos::View<IndexType*> v_row_idxs(md.get_row_idxs(), md.get_num_elems());
Kokkos::View<IndexType*> v_col_idxs(md.get_col_idxs(), md.get_num_elems());
Kokkos::View<ValueType*> v_values(md.get_values(), md.get_num_elems());
Create the matrix entries. This also creates zero entries for the first and second row to handle all rows uniformly.
Kokkos::parallel_for(
"generate_stencil_matrix", md.get_num_elems(), KOKKOS_LAMBDA(int i) {
const ValueType coefs[] = {-1, 2, -1};
auto ofs = static_cast<IndexType>((i % 3) - 1);
auto row = static_cast<IndexType>(i / 3);
auto col = row + ofs;
To prevent branching, a mask is used to set the entry to zero, if the column is out-of-bounds
auto mask =
static_cast<IndexType>(0 <= col && col < discretization_points);
v_row_idxs[i] = mask * row;
v_col_idxs[i] = mask * col;
v_values[i] = mask * coefs[ofs + 1];
});
Add up duplicate (zero) entries.
Build Csr matrix.
matrix->
read(std::move(md));
}
Generates the RHS vector given f
and the boundary conditions.
template <typename Closure, typename ValueType>
void generate_rhs(Closure&& f, ValueType u0, ValueType u1,
{
const auto discretization_points =
rhs->get_size()[0];
auto values =
rhs->get_values();
Kokkos::View<ValueType*> values_view(values, discretization_points);
Kokkos::parallel_for(
"generate_rhs", discretization_points, KOKKOS_LAMBDA(int i) {
const ValueType h = 1.0 / (discretization_points + 1);
const ValueType xi = ValueType(i + 1) * h;
values_view[i] = -f(xi) * h * h;
if (i == 0) {
values_view[i] += u0;
}
if (i == discretization_points - 1) {
values_view[i] += u1;
}
});
}
Computes the 1-norm of the error given the computed u
and the correct solution function correct_u
.
template <typename Closure, typename ValueType>
double calculate_error(int discretization_points,
Closure&& correct_u)
{
discretization_points);
auto error = 0.0;
Kokkos::parallel_reduce(
"calculate_error", discretization_points,
KOKKOS_LAMBDA(int i, double& lsum) {
const auto h = 1.0 / (discretization_points + 1);
const auto xi = (i + 1) * h;
lsum += Kokkos::Experimental::abs(
(v_u(i) - correct_u(xi)) /
Kokkos::Experimental::abs(correct_u(xi)));
},
error);
return error;
}
int main(int argc, char* argv[])
{
Kokkos::ScopeGuard kokkos(argc, argv);
Some shortcuts
using ValueType = double;
using IndexType = int;
Print help message. For details on the kokkos-options see https://kokkos.github.io/kokkos-core-wiki/ProgrammingGuide/Initialization.html#initialization-by-command-line-arguments
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0]
<< " [discretization_points] [kokkos-options]" << std::endl;
std::exit(-1);
}
const unsigned int discretization_points =
argc >= 2 ? std::atoi(argv[1]) : 100u;
chooses the executor that corresponds to the Kokkos DefaultExecutionSpace
auto exec = []() -> std::shared_ptr<gko::Executor> {
#ifdef KOKKOS_ENABLE_SERIAL
if (std::is_same<Kokkos::DefaultExecutionSpace,
Kokkos::Serial>::value) {
return gko::ReferenceExecutor::create();
}
#endif
#ifdef KOKKOS_ENABLE_OPENMP
if (std::is_same<Kokkos::DefaultExecutionSpace,
Kokkos::OpenMP>::value) {
}
#endif
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>::value) {
gko::ReferenceExecutor::create());
}
#endif
#ifdef KOKKOS_ENABLE_HIP
if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::HIP>::value) {
gko::ReferenceExecutor::create());
}
#endif
}();
problem:
auto correct_u = [] KOKKOS_FUNCTION(ValueType x) { return x * x * x; };
auto f = [] KOKKOS_FUNCTION(ValueType x) { return ValueType{6} * x; };
auto u0 = correct_u(0);
auto u1 = correct_u(1);
initialize vectors
generate_rhs(f, u0, u1,
rhs.get());
auto u = vec::create(exec,
gko::dim<2>(discretization_points, 1));
for (int i = 0; i < u->get_size()[0]; ++i) {
u->get_values()[i] = 0.0;
}
initialize the stencil matrix
auto A =
share(mtx::create(
exec,
gko::dim<2>{discretization_points, discretization_points}));
generate_stencil_matrix(A.get());
const RealValueType reduction_factor{1e-7};
Generate solver and solve the system
cg::build()
.with_criteria(
gko::stop::Iteration::build().with_max_iters(discretization_points),
reduction_factor))
.with_preconditioner(bj::build())
.on(exec)
->generate(A)
->apply(rhs, u);
std::cout << "\nSolve complete."
<< "\nThe average relative error is "
<< calculate_error(discretization_points, u.get(), correct_u) /
discretization_points
<< std::endl;
}
Results
Example output:
> ./kokkos-assembly
Solve complete.
The average relative error is 1.05488e-11
The actual error depends on the used hardware.
The plain program
#include <iostream>
#include <map>
#include <string>
#include <omp.h>
#include <Kokkos_Core.hpp>
#include <ginkgo/ginkgo.hpp>
template <typename ValueType, typename IndexType>
{
const auto discretization_points = matrix->
get_size()[0];
discretization_points * 3);
Kokkos::View<IndexType*> v_row_idxs(md.get_row_idxs(), md.get_num_elems());
Kokkos::View<IndexType*> v_col_idxs(md.get_col_idxs(), md.get_num_elems());
Kokkos::View<ValueType*> v_values(md.get_values(), md.get_num_elems());
Kokkos::parallel_for(
"generate_stencil_matrix", md.get_num_elems(), KOKKOS_LAMBDA(int i) {
const ValueType coefs[] = {-1, 2, -1};
auto ofs = static_cast<IndexType>((i % 3) - 1);
auto row = static_cast<IndexType>(i / 3);
auto col = row + ofs;
auto mask =
static_cast<IndexType>(0 <= col && col < discretization_points);
v_row_idxs[i] = mask * row;
v_col_idxs[i] = mask * col;
v_values[i] = mask * coefs[ofs + 1];
});
matrix->
read(std::move(md));
}
template <typename Closure, typename ValueType>
void generate_rhs(Closure&& f, ValueType u0, ValueType u1,
{
const auto discretization_points =
rhs->get_size()[0];
auto values =
rhs->get_values();
Kokkos::View<ValueType*> values_view(values, discretization_points);
Kokkos::parallel_for(
"generate_rhs", discretization_points, KOKKOS_LAMBDA(int i) {
const ValueType h = 1.0 / (discretization_points + 1);
const ValueType xi = ValueType(i + 1) * h;
values_view[i] = -f(xi) * h * h;
if (i == 0) {
values_view[i] += u0;
}
if (i == discretization_points - 1) {
values_view[i] += u1;
}
});
}
template <typename Closure, typename ValueType>
double calculate_error(int discretization_points,
Closure&& correct_u)
{
discretization_points);
auto error = 0.0;
Kokkos::parallel_reduce(
"calculate_error", discretization_points,
KOKKOS_LAMBDA(int i, double& lsum) {
const auto h = 1.0 / (discretization_points + 1);
const auto xi = (i + 1) * h;
lsum += Kokkos::Experimental::abs(
(v_u(i) - correct_u(xi)) /
Kokkos::Experimental::abs(correct_u(xi)));
},
error);
return error;
}
int main(int argc, char* argv[])
{
Kokkos::ScopeGuard kokkos(argc, argv);
using ValueType = double;
using IndexType = int;
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0]
<< " [discretization_points] [kokkos-options]" << std::endl;
std::exit(-1);
}
const unsigned int discretization_points =
argc >= 2 ? std::atoi(argv[1]) : 100u;
auto exec = []() -> std::shared_ptr<gko::Executor> {
#ifdef KOKKOS_ENABLE_SERIAL
if (std::is_same<Kokkos::DefaultExecutionSpace,
Kokkos::Serial>::value) {
return gko::ReferenceExecutor::create();
}
#endif
#ifdef KOKKOS_ENABLE_OPENMP
if (std::is_same<Kokkos::DefaultExecutionSpace,
Kokkos::OpenMP>::value) {
}
#endif
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>::value) {
gko::ReferenceExecutor::create());
}
#endif
#ifdef KOKKOS_ENABLE_HIP
if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::HIP>::value) {
gko::ReferenceExecutor::create());
}
#endif
}();
auto correct_u = [] KOKKOS_FUNCTION(ValueType x) { return x * x * x; };
auto f = [] KOKKOS_FUNCTION(ValueType x) { return ValueType{6} * x; };
auto u0 = correct_u(0);
auto u1 = correct_u(1);
generate_rhs(f, u0, u1,
rhs.get());
auto u = vec::create(exec,
gko::dim<2>(discretization_points, 1));
for (int i = 0; i < u->get_size()[0]; ++i) {
u->get_values()[i] = 0.0;
}
auto A =
share(mtx::create(
exec,
gko::dim<2>{discretization_points, discretization_points}));
generate_stencil_matrix(A.get());
const RealValueType reduction_factor{1e-7};
cg::build()
.with_criteria(
gko::stop::Iteration::build().with_max_iters(discretization_points),
reduction_factor))
.with_preconditioner(bj::build())
.on(exec)
->generate(A)
->apply(rhs, u);
std::cout << "\nSolve complete."
<< "\nThe average relative error is "
<< calculate_error(discretization_points, u.get(), correct_u) /
discretization_points
<< std::endl;
}