Ginkgo  Generated from pipelines/1589998975 branch based on develop. Ginkgo version 1.10.0
A numerical linear algebra library targeting many-core architectures
The mixed-precision-ir program

The Mixed Precision Iterative Refinement (MPIR) solver example..

This example depends on iterative-refinement.

Table of contents
  1. Introduction
  2. The commented program
  1. Results
  2. The plain program

This example manually implements a Mixed Precision Iterative Refinement (MPIR) solver.

In this example, we first read in a matrix from file, then generate a right-hand side and an initial guess. An inaccurate CG solver in single precision is used as the inner solver to an iterative refinement (IR) in double precision method which solves a linear system.

The commented program

Print version information

std::cout << gko::version_info::get() << std::endl;

Figure out where to run the code

if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"omp", [] { return gko::OmpExecutor::create(); }},
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
0, gko::ReferenceExecutor::create());
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};

executor where Ginkgo will perform the computation

const auto exec = exec_map.at(executor_string)(); // throws if not valid

Read data

auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));

Create RHS and initial guess as 1

gko::size_type size = A->get_size()[0];
auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1));
for (auto i = 0; i < size; i++) {
host_x->at(i, 0) = 1.;
}
auto x = gko::clone(exec, host_x);
auto b = gko::clone(exec, host_x);

Calculate initial residual by overwriting b

auto one = gko::initialize<vec>({1.0}, exec);
auto neg_one = gko::initialize<vec>({-1.0}, exec);
auto initres_vec = gko::initialize<real_vec>({0.0}, exec);
A->apply(one, x, neg_one, b);
b->compute_norm2(initres_vec);

Build lower-precision system matrix and residual

auto solver_A = solver_mtx::create(exec);
auto inner_residual = solver_vec::create(exec);
auto outer_residual = vec::create(exec);
A->convert_to(solver_A);
b->convert_to(outer_residual);

restore b

b->copy_from(host_x);

Create inner solver

auto inner_solver =
cg::build()
.with_criteria(
.with_reduction_factor(inner_reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_inner_iters))
.on(exec)
->generate(give(solver_A));

Solve system

exec->synchronize();
std::chrono::nanoseconds time(0);
auto res_vec = gko::initialize<real_vec>({0.0}, exec);
auto initres = exec->copy_val_to_host(initres_vec->get_const_values());
auto inner_solution = solver_vec::create(exec);
auto outer_delta = vec::create(exec);
auto tic = std::chrono::steady_clock::now();
int iter = -1;
while (true) {
++iter;

convert residual to inner precision

outer_residual->convert_to(inner_residual);
outer_residual->compute_norm2(res_vec);
auto res = exec->copy_val_to_host(res_vec->get_const_values());

break if we exceed the number of iterations or have converged

if (iter > max_outer_iters || res / initres < outer_reduction_factor) {
break;
}

Use the inner solver to solve A * inner_solution = inner_residual with residual as initial guess.

inner_solution->copy_from(inner_residual);
inner_solver->apply(inner_residual, inner_solution);

convert inner solution to outer precision

inner_solution->convert_to(outer_delta);

x = x + inner_solution

x->add_scaled(one, outer_delta);

residual = b - A * x

outer_residual->copy_from(b);
A->apply(neg_one, x, one, outer_residual);
}
auto toc = std::chrono::steady_clock::now();
time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);

Calculate residual

A->apply(one, x, neg_one, b);
b->compute_norm2(res_vec);
std::cout << "Initial residual norm sqrt(r^T r):\n";
write(std::cout, initres_vec);
std::cout << "Final residual norm sqrt(r^T r):\n";
write(std::cout, res_vec);

Print solver statistics

std::cout << "MPIR iteration count: " << iter << std::endl;
std::cout << "MPIR execution time [ms]: "
<< static_cast<double>(time.count()) / 1000000.0 << std::endl;
}

Results

This is the expected output:

Initial residual norm sqrt(r^T r):
%%MatrixMarket matrix array real general
1 1
194.679
Final residual norm sqrt(r^T r):
%%MatrixMarket matrix array real general
1 1
1.22728e-10
MPIR iteration count: 25
MPIR execution time [ms]: 0.846559

Comments about programming and debugging

The plain program

#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
#include <ginkgo/ginkgo.hpp>
int main(int argc, char* argv[])
{
using ValueType = double;
using RealValueType = gko::remove_complex<ValueType>;
using SolverType = float;
using RealSolverType = gko::remove_complex<SolverType>;
using IndexType = int;
using solver_vec = gko::matrix::Dense<SolverType>;
gko::size_type max_outer_iters = 100u;
gko::size_type max_inner_iters = 100u;
RealValueType outer_reduction_factor{1e-12};
RealSolverType inner_reduction_factor{1e-2};
std::cout << gko::version_info::get() << std::endl;
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"omp", [] { return gko::OmpExecutor::create(); }},
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
0, gko::ReferenceExecutor::create());
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
const auto exec = exec_map.at(executor_string)(); // throws if not valid
auto A = share(gko::read<mtx>(std::ifstream("data/A.mtx"), exec));
gko::size_type size = A->get_size()[0];
auto host_x = vec::create(exec->get_master(), gko::dim<2>(size, 1));
for (auto i = 0; i < size; i++) {
host_x->at(i, 0) = 1.;
}
auto x = gko::clone(exec, host_x);
auto b = gko::clone(exec, host_x);
auto one = gko::initialize<vec>({1.0}, exec);
auto neg_one = gko::initialize<vec>({-1.0}, exec);
auto initres_vec = gko::initialize<real_vec>({0.0}, exec);
A->apply(one, x, neg_one, b);
b->compute_norm2(initres_vec);
auto solver_A = solver_mtx::create(exec);
auto inner_residual = solver_vec::create(exec);
auto outer_residual = vec::create(exec);
A->convert_to(solver_A);
b->convert_to(outer_residual);
b->copy_from(host_x);
auto inner_solver =
cg::build()
.with_criteria(
.with_reduction_factor(inner_reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_inner_iters))
.on(exec)
->generate(give(solver_A));
exec->synchronize();
std::chrono::nanoseconds time(0);
auto res_vec = gko::initialize<real_vec>({0.0}, exec);
auto initres = exec->copy_val_to_host(initres_vec->get_const_values());
auto inner_solution = solver_vec::create(exec);
auto outer_delta = vec::create(exec);
auto tic = std::chrono::steady_clock::now();
int iter = -1;
while (true) {
++iter;
outer_residual->convert_to(inner_residual);
outer_residual->compute_norm2(res_vec);
auto res = exec->copy_val_to_host(res_vec->get_const_values());
if (iter > max_outer_iters || res / initres < outer_reduction_factor) {
break;
}
inner_solution->copy_from(inner_residual);
inner_solver->apply(inner_residual, inner_solution);
inner_solution->convert_to(outer_delta);
x->add_scaled(one, outer_delta);
outer_residual->copy_from(b);
A->apply(neg_one, x, one, outer_residual);
}
auto toc = std::chrono::steady_clock::now();
time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
A->apply(one, x, neg_one, b);
b->compute_norm2(res_vec);
std::cout << "Initial residual norm sqrt(r^T r):\n";
write(std::cout, initres_vec);
std::cout << "Final residual norm sqrt(r^T r):\n";
write(std::cout, res_vec);
std::cout << "MPIR iteration count: " << iter << std::endl;
std::cout << "MPIR execution time [ms]: "
<< static_cast<double>(time.count()) / 1000000.0 << std::endl;
}
gko::matrix::Csr
CSR is a matrix format which stores only the nonzero coefficients by compressing each row of the matr...
Definition: matrix.hpp:28
gko::layout_type::array
The matrix should be written as dense matrix in column-major order.
gko::give
std::remove_reference< OwningPointer >::type && give(OwningPointer &&p)
Marks that the object pointed to by p can be given to the callee.
Definition: utils_helper.hpp:247
gko::matrix::Dense
Dense is a matrix format which explicitly stores all values of the matrix.
Definition: dense_cache.hpp:19
gko::size_type
std::size_t size_type
Integral type used for allocation quantities.
Definition: types.hpp:89
gko::clone
detail::cloned_type< Pointer > clone(const Pointer &p)
Creates a unique clone of the object pointed to by p.
Definition: utils_helper.hpp:173
gko::HipExecutor::create
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
gko::version_info::get
static const version_info & get()
Returns an instance of version_info.
Definition: version.hpp:139
gko::stop::ResidualNorm
The ResidualNorm class is a stopping criterion which stops the iteration process when the actual resi...
Definition: residual_norm.hpp:113
gko::dim< 2 >
gko::solver::Cg
CG or the conjugate gradient method is an iterative type Krylov subspace method which is suitable for...
Definition: cg.hpp:48
gko::write
void write(StreamType &&os, MatrixPtrType &&matrix, layout_type layout=detail::mtx_io_traits< std::remove_cv_t< detail::pointee< MatrixPtrType >>>::default_layout)
Writes a matrix into an output stream in matrix market format.
Definition: mtx_io.hpp:295
gko::share
detail::shared_type< OwningPointer > share(OwningPointer &&p)
Marks the object pointed to by p as shared.
Definition: utils_helper.hpp:224
gko::CudaExecutor::create
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
gko::OmpExecutor::create
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition: executor.hpp:1396
gko::remove_complex
typename detail::remove_complex_s< T >::type remove_complex
Obtain the type which removed the complex of complex/scalar type or the template parameter of class b...
Definition: math.hpp:260
gko::DpcppExecutor::create
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
gko::real
constexpr auto real(const T &x)
Returns the real part of the object.
Definition: math.hpp:869
gko::one
constexpr T one()
Returns the multiplicative identity for T.
Definition: math.hpp:630