The IR-ILU preconditioned solver example..
This example depends on ilu-preconditioned-solver, iterative-refinement.
Introduction
About the example
This example shows how to combine iterative refinement with the adaptive precision block-Jacobi preconditioner in order to approximately solve the triangular systems occurring in ILU preconditioning. Using an adaptive precision block-Jacobi preconditioner matrix as inner solver for the iterative refinement method is equivalent to doing adaptive precision block-Jacobi relaxation in the triangular solves. This example roughly approximates the triangular solves with five adaptive precision block-Jacobi sweeps with a maximum block size of 16.
This example is motivated by "Multiprecision block-Jacobi for Iterative
Triangular Solves" (Göbel, Anzt, Cojean, Flegar, Quintana-Ortí, Euro-Par 2020). The theory and a detailed analysis can be found there.
The commented program
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
const unsigned int sweeps = argc == 3 ? std::atoi(argv[2]) : 5u;
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
executor where Ginkgo will perform the computation
const auto exec = exec_map.at(executor_string)();
Read data
auto A =
gko::share(gko::read<mtx>(std::ifstream(
"data/A.mtx"), exec));
Create RHS and initial guess as 1
auto host_x = vec::create(exec->get_master(),
gko::dim<2>(num_rows, 1));
host_x->at(i, 0) = 1.;
}
Generate incomplete factors using ParILU
Generate concrete factorization for input matrix
auto par_ilu =
gko::share(par_ilu_fact->generate(A));
Generate an iterative refinement factory to be used as a triangular solver in the preconditioner application. The generated method is equivalent to doing five block-Jacobi sweeps with a maximum block size of 16.
bj::build()
.with_max_block_size(16u)
.on(exec));
auto trisolve_factory =
ir::build()
.with_solver(bj_factory)
.with_criteria(gko::stop::Iteration::build().with_max_iters(sweeps))
.on(exec);
Generate an ILU preconditioner factory by setting lower and upper triangular solver - in this case the previously defined iterative refinement method.
Use incomplete factors to generate ILU preconditioner
auto ilu_preconditioner =
gko::share(ilu_pre_factory->generate(par_ilu));
Create stopping criteria for Gmres
const RealValueType reduction_factor{1e-12};
gko::stop::Iteration::build().with_max_iters(1000u).on(exec));
.with_reduction_factor(reduction_factor)
.on(exec));
Use preconditioner inside GMRES solver factory Generating a solver factory tied to a specific preconditioner makes sense if there are several very similar systems to solve, and the same solver+preconditioner combination is expected to be effective.
auto ilu_gmres_factory =
gmres::build()
.with_criteria(iter_stop, tol_stop)
.with_generated_preconditioner(ilu_preconditioner)
.on(exec);
Generate preconditioned solver for a specific target system
auto ilu_gmres = ilu_gmres_factory->generate(A);
Add logger
std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
ilu_gmres->add_logger(logger);
Warmup run
Solve system 100 times and take the average time.
std::chrono::nanoseconds time(0);
for (int i = 0; i < 100; i++) {
x->copy_from(clone_x);
auto tic = std::chrono::high_resolution_clock::now();
ilu_gmres->apply(b, x);
auto toc = std::chrono::high_resolution_clock::now();
time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
}
std::cout << "Using " << sweeps << " block-Jacobi sweeps.\n";
Print solution
std::cout << "Solution (x):\n";
Get residual
auto res = gko::as<vec>(logger->get_residual_norm());
std::cout << "GMRES iteration count: " << logger->get_num_iterations()
<< "\n";
std::cout << "GMRES execution time [ms]: "
<< static_cast<double>(time.count()) / 100000000.0 << "\n";
std::cout << "Residual norm sqrt(r^T r):\n";
}
Results
This is the expected output:
Using 5 block-Jacobi sweeps.
Solution (x):
19 1
0.252218
0.108645
0.0662811
0.0630433
0.0384088
0.0396536
0.0402648
0.0338935
0.0193098
0.0234653
0.0211499
0.0196413
0.0199151
0.0181674
0.0162722
0.0150714
0.0107016
0.0121141
0.0123025
GMRES iteration count: 8
GMRES execution time [ms]: 0.377673
Residual norm sqrt(r^T r):
1 1
1.65303e-12
Comments about programming and debugging
The plain program
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <ginkgo/ginkgo.hpp>
int main(int argc, char* argv[])
{
using ValueType = double;
using IndexType = int;
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
const unsigned int sweeps = argc == 3 ? std::atoi(argv[2]) : 5u;
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
const auto exec = exec_map.at(executor_string)();
auto A =
gko::share(gko::read<mtx>(std::ifstream(
"data/A.mtx"), exec));
auto host_x = vec::create(exec->get_master(),
gko::dim<2>(num_rows, 1));
host_x->at(i, 0) = 1.;
}
auto par_ilu_fact =
auto par_ilu =
gko::share(par_ilu_fact->generate(A));
bj::build()
.with_max_block_size(16u)
.on(exec));
auto trisolve_factory =
ir::build()
.with_solver(bj_factory)
.with_criteria(gko::stop::Iteration::build().with_max_iters(sweeps))
.on(exec);
.on(exec);
auto ilu_preconditioner =
gko::share(ilu_pre_factory->generate(par_ilu));
const RealValueType reduction_factor{1e-12};
gko::stop::Iteration::build().with_max_iters(1000u).on(exec));
.with_reduction_factor(reduction_factor)
.on(exec));
auto ilu_gmres_factory =
gmres::build()
.with_criteria(iter_stop, tol_stop)
.with_generated_preconditioner(ilu_preconditioner)
.on(exec);
auto ilu_gmres = ilu_gmres_factory->generate(A);
std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
ilu_gmres->add_logger(logger);
ilu_gmres->apply(b, x);
std::chrono::nanoseconds time(0);
for (int i = 0; i < 100; i++) {
x->copy_from(clone_x);
auto tic = std::chrono::high_resolution_clock::now();
ilu_gmres->apply(b, x);
auto toc = std::chrono::high_resolution_clock::now();
time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
}
std::cout << "Using " << sweeps << " block-Jacobi sweeps.\n";
std::cout << "Solution (x):\n";
auto res = gko::as<vec>(logger->get_residual_norm());
std::cout << "GMRES iteration count: " << logger->get_num_iterations()
<< "\n";
std::cout << "GMRES execution time [ms]: "
<< static_cast<double>(time.count()) / 100000000.0 << "\n";
std::cout << "Residual norm sqrt(r^T r):\n";
}