The simple solver with performance debugging example..
This example depends on simple-solver-logging, minimal-cuda-solver.
Introduction
About the example
This example runs a solver on a test problem and shows how to use loggers to debug performance and convergence rate.
The commented program
creates a zero vector
template <typename ValueType>
std::unique_ptr<vec<ValueType>> create_vector(
ValueType value)
{
auto res = vec<ValueType>::create(exec);
return res;
}
utilities for computing norms and residuals
template <typename ValueType>
ValueType get_first_element(const vec<ValueType>* norm)
{
return norm->get_executor()->copy_val_to_host(norm->get_const_values());
}
template <typename ValueType>
{
auto exec = b->get_executor();
auto b_norm = gko::initialize<real_vec<ValueType>>({0.0}, exec);
b->compute_norm2(b_norm);
return get_first_element(b_norm.get());
}
template <typename ValueType>
const gko::LinOp* system_matrix,
const vec<ValueType>* b,
const vec<ValueType>* x)
{
auto one = gko::initialize<vec<ValueType>>({1.0}, exec);
auto neg_one = gko::initialize<vec<ValueType>>({-1.0}, exec);
system_matrix->
apply(one, x, neg_one, res);
return compute_norm(res.get());
}
}
namespace loggers {
A logger that accumulates the time of all operations. For each operation type (allocations, free, copy, internal operations i.e. kernels), the timing is taken before and after. This can create significant overhead since to ensure proper timings, calls to synchronize
are required.
{
this->start_operation(exec, "allocate");
}
{
this->end_operation(exec, "allocate");
}
{
this->start_operation(exec, "free");
}
{
this->end_operation(exec, "free");
}
{
this->start_operation(to, "copy");
}
{
this->end_operation(to, "copy");
}
{
this->start_operation(exec, op->
get_name());
}
{
this->end_operation(exec, op->
get_name());
}
void write_data(std::ostream& ostream)
{
for (const auto& entry : total) {
ostream << "\t" << entry.first.c_str() << ": "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(
entry.second)
.count()
<< std::endl;
}
}
private:
Helper which synchronizes and starts the time before every operation.
const std::string& name) const
{
nested.emplace_back(0);
start[name] = std::chrono::steady_clock::now();
}
Helper to compute the end time and store the operation's time at its end. Also time nested operations.
void end_operation(
const gko::Executor* exec,
const std::string& name)
const
{
const auto end = std::chrono::steady_clock::now();
const auto diff = end - start[name];
make sure timings for nested operations are not counted twice
total[name] += diff - nested.back();
nested.pop_back();
if (nested.size() > 0) {
nested.back() += diff;
}
}
mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
mutable std::map<std::string, std::chrono::steady_clock::duration> total;
the position i of this vector holds the total time spend on child operations on nesting level i
mutable std::vector<std::chrono::steady_clock::duration> nested;
};
This logger tracks the persistently allocated data
Store amount of bytes allocated on every allocation
{
storage[location] = num_bytes;
}
Reset the amount of bytes on every free
{
storage[location] = 0;
}
Write the data after summing the total from all allocations
void write_data(std::ostream& ostream)
{
for (const auto& e : storage) {
total += e.second;
}
ostream << "Storage: " << total << std::endl;
}
private:
mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
};
Logs true and recurrent residuals of the solver
template <typename ValueType>
Depending on the available information, store the norm or compute it from the residual. If the true residual norm could not be computed, store the value -1.0
.
{
if (residual_norm) {
rec_res_norms.push_back(utils::get_first_element(
gko::as<real_vec<ValueType>>(residual_norm)));
} else {
rec_res_norms.push_back(
utils::compute_norm(
gko::as<vec<ValueType>>(residual)));
}
if (solution) {
true_res_norms.push_back(utils::compute_residual_norm(
matrix, b,
gko::as<vec<ValueType>>(solution)));
} else {
true_res_norms.push_back(-1.0);
}
}
ResidualLogger(
const gko::LinOp* matrix,
const vec<ValueType>* b)
:
gko::log::Logger(
gko::log::Logger::iteration_complete_mask),
matrix{matrix},
b{b}
{}
void write_data(std::ostream& ostream)
{
ostream << "Recurrent Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : rec_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
ostream << "True Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : true_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
}
private:
const vec<ValueType>* b;
mutable std::vector<gko::remove_complex<ValueType>> rec_res_norms;
mutable std::vector<gko::remove_complex<ValueType>> true_res_norms;
};
}
namespace {
Print usage help
void print_usage(const char* filename)
{
std::cerr << "Usage: " << filename << " [executor] [matrix file]"
<< std::endl;
std::cerr << "matrix file should be a file in matrix market format. "
"The file data/A.mtx is provided as an example."
<< std::endl;
std::exit(-1);
}
template <typename ValueType>
{
std::cout << "[" << std::endl;
for (int i = 0; i < elements_to_print; ++i) {
std::cout <<
"\t" << vec->
at(i) << std::endl;
}
std::cout << "];" << std::endl;
}
}
int main(int argc, char* argv[])
{
Parametrize the benchmark here Pick a value type
using ValueType = double;
using IndexType = int;
Pick a matrix format
Pick a solver
Pick a preconditioner type
Pick a residual norm reduction value
Pick an output file name
const auto of_name = "log.txt";
Simple shortcut
Print version information
Figure out where to run the code
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
Figure out where to run the code
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
executor where Ginkgo will perform the computation
const auto exec = exec_map.at(executor_string)();
Read the input matrix file directory
std::string input_mtx = "data/A.mtx";
if (argc == 3) {
input_mtx = std::string(argv[2]);
}
Read data: A is read from disk Create a StorageLogger to track the size of A
auto storage_logger = std::make_shared<loggers::StorageLogger>();
Add the logger to the executor
Read the matrix A from file
auto A =
gko::share(gko::read<mtx>(std::ifstream(input_mtx), exec));
Remove the storage logger
Pick a maximum iteration count
const auto max_iters = A->get_size()[0];
Generate b and x vectors
auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);
Declare the solver factory. The preconditioner's arguments should be adapted if needed.
auto solver_factory =
solver::build()
.with_criteria(
.with_reduction_factor(reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_iters))
.with_preconditioner(preconditioner::create(exec))
.on(exec);
Declare the output file for all our loggers
std::ofstream output_file(of_name);
Do a warmup run
Clone x to not overwrite the original one
Generate and call apply on a solver
solver_factory->generate(A)->apply(b, x_clone);
}
Do a timed run
Clone x to not overwrite the original one
Synchronize ensures no operation are ongoing
Time before generate
auto g_tic = std::chrono::steady_clock::now();
Generate a solver
auto generated_solver = solver_factory->generate(A);
Time after generate
auto g_tac = std::chrono::steady_clock::now();
Compute the generation time
auto generate_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(g_tac - g_tic);
Write the generate time to the output file
output_file << "Generate time (ns): " << generate_time.count()
<< std::endl;
Similarly time the apply
auto a_tic = std::chrono::steady_clock::now();
generated_solver->apply(b, x_clone);
auto a_tac = std::chrono::steady_clock::now();
auto apply_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(a_tac - a_tic);
output_file << "Apply time (ns): " << apply_time.count() << std::endl;
Compute the residual norm
auto residual =
utils::compute_residual_norm(A.get(), b.get(), x_clone.get());
output_file << "Residual_norm: " << residual << std::endl;
}
Log the internal operations using the OperationLogger without timing
Create an OperationLogger to analyze the generate step
auto gen_logger = std::make_shared<loggers::OperationLogger>();
Add the generate logger to the executor
Generate a solver
auto generated_solver = solver_factory->generate(A);
Remove the generate logger from the executor
Write the data to the output file
output_file << "Generate operations times (ns):" << std::endl;
gen_logger->write_data(output_file);
Create an OperationLogger to analyze the apply step
auto apply_logger = std::make_shared<loggers::OperationLogger>();
Create a ResidualLogger to log the recurent residual
auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
A.get(), b.get());
generated_solver->add_logger(res_logger);
Solve the system
generated_solver->apply(b, x);
Write the data to the output file
output_file << "Apply operations times (ns):" << std::endl;
apply_logger->write_data(output_file);
res_logger->write_data(output_file);
}
Print solution
std::cout << "Solution, first ten entries: \n";
print_vector(x.get());
Print output file location
std::cout << "The performance and residual data can be found in " << of_name
<< std::endl;
}
Results
This is the expected standard output:
Solution, first ten entries:
[
0.252218
0.108645
0.0662811
0.0630433
0.0384088
0.0396536
0.0402648
0.0338935
0.0193098
0.0234653
];
The performance and residual data can be found in log.txt
Here is a sample output in the file log.txt:
Generate time (ns): 861
Apply time (ns): 108144
Residual_norm: 2.10788e-15
Generate operations times (ns):
Apply operations times (ns):
allocate: 14991
cg::step_1#5: 7683
cg::step_2#7: 7756
copy: 7751
csr::advanced_spmv#5: 21819
csr::spmv#3: 20429
dense::compute_dot#3: 18043
dense::compute_norm2#2: 16726
free: 8857
residual_norm::residual_norm#9: 3614
Recurrent Residual Norms:
[
4.3589
2.30455
1.46771
0.984875
0.741833
0.513623
0.384165
0.316439
0.227709
0.170312
0.0973722
0.0616831
0.0454123
0.031953
0.0161606
0.00657015
0.00264367
0.000858809
0.000286461
1.64195e-15
];
True Residual Norms:
[
4.3589
2.30455
1.46771
0.984875
0.741833
0.513623
0.384165
0.316439
0.227709
0.170312
0.0973722
0.0616831
0.0454123
0.031953
0.0161606
0.00657015
0.00264367
0.000858809
0.000286461
2.10788e-15
];
Comments about programming and debugging
The plain program
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <ostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <ginkgo/ginkgo.hpp>
template <typename ValueType>
template <typename ValueType>
namespace utils {
template <typename ValueType>
std::unique_ptr<vec<ValueType>> create_vector(
ValueType value)
{
auto res = vec<ValueType>::create(exec);
return res;
}
template <typename ValueType>
ValueType get_first_element(const vec<ValueType>* norm)
{
return norm->get_executor()->copy_val_to_host(norm->get_const_values());
}
template <typename ValueType>
{
auto exec = b->get_executor();
auto b_norm = gko::initialize<real_vec<ValueType>>({0.0}, exec);
b->compute_norm2(b_norm);
return get_first_element(b_norm.get());
}
template <typename ValueType>
const gko::LinOp* system_matrix,
const vec<ValueType>* b,
const vec<ValueType>* x)
{
auto one = gko::initialize<vec<ValueType>>({1.0}, exec);
auto neg_one = gko::initialize<vec<ValueType>>({-1.0}, exec);
system_matrix->
apply(one, x, neg_one, res);
return compute_norm(res.get());
}
}
namespace loggers {
{
this->start_operation(exec, "allocate");
}
{
this->end_operation(exec, "allocate");
}
{
this->start_operation(exec, "free");
}
{
this->end_operation(exec, "free");
}
{
this->start_operation(to, "copy");
}
{
this->end_operation(to, "copy");
}
{
this->start_operation(exec, op->
get_name());
}
{
this->end_operation(exec, op->
get_name());
}
void write_data(std::ostream& ostream)
{
for (const auto& entry : total) {
ostream << "\t" << entry.first.c_str() << ": "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(
entry.second)
.count()
<< std::endl;
}
}
private:
const std::string& name) const
{
nested.emplace_back(0);
start[name] = std::chrono::steady_clock::now();
}
void end_operation(
const gko::Executor* exec,
const std::string& name)
const
{
const auto end = std::chrono::steady_clock::now();
const auto diff = end - start[name];
total[name] += diff - nested.back();
nested.pop_back();
if (nested.size() > 0) {
nested.back() += diff;
}
}
mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
mutable std::map<std::string, std::chrono::steady_clock::duration> total;
mutable std::vector<std::chrono::steady_clock::duration> nested;
};
{
storage[location] = num_bytes;
}
{
storage[location] = 0;
}
void write_data(std::ostream& ostream)
{
for (const auto& e : storage) {
total += e.second;
}
ostream << "Storage: " << total << std::endl;
}
private:
mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
};
template <typename ValueType>
{
if (residual_norm) {
rec_res_norms.push_back(utils::get_first_element(
gko::as<real_vec<ValueType>>(residual_norm)));
} else {
rec_res_norms.push_back(
utils::compute_norm(
gko::as<vec<ValueType>>(residual)));
}
if (solution) {
true_res_norms.push_back(utils::compute_residual_norm(
matrix, b,
gko::as<vec<ValueType>>(solution)));
} else {
true_res_norms.push_back(-1.0);
}
}
ResidualLogger(
const gko::LinOp* matrix,
const vec<ValueType>* b)
:
gko::log::Logger(
gko::log::Logger::iteration_complete_mask),
matrix{matrix},
b{b}
{}
void write_data(std::ostream& ostream)
{
ostream << "Recurrent Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : rec_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
ostream << "True Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : true_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
}
private:
const vec<ValueType>* b;
mutable std::vector<gko::remove_complex<ValueType>> rec_res_norms;
mutable std::vector<gko::remove_complex<ValueType>> true_res_norms;
};
}
namespace {
void print_usage(const char* filename)
{
std::cerr << "Usage: " << filename << " [executor] [matrix file]"
<< std::endl;
std::cerr << "matrix file should be a file in matrix market format. "
"The file data/A.mtx is provided as an example."
<< std::endl;
std::exit(-1);
}
template <typename ValueType>
{
std::cout << "[" << std::endl;
for (int i = 0; i < elements_to_print; ++i) {
std::cout <<
"\t" << vec->
at(i) << std::endl;
}
std::cout << "];" << std::endl;
}
}
int main(int argc, char* argv[])
{
using ValueType = double;
using IndexType = int;
const auto of_name = "log.txt";
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
const auto exec = exec_map.at(executor_string)();
std::string input_mtx = "data/A.mtx";
if (argc == 3) {
input_mtx = std::string(argv[2]);
}
auto storage_logger = std::make_shared<loggers::StorageLogger>();
auto A =
gko::share(gko::read<mtx>(std::ifstream(input_mtx), exec));
const auto max_iters = A->get_size()[0];
auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);
auto solver_factory =
solver::build()
.with_criteria(
.with_reduction_factor(reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_iters))
.with_preconditioner(preconditioner::create(exec))
.on(exec);
std::ofstream output_file(of_name);
{
solver_factory->generate(A)->apply(b, x_clone);
}
{
auto g_tic = std::chrono::steady_clock::now();
auto generated_solver = solver_factory->generate(A);
auto g_tac = std::chrono::steady_clock::now();
auto generate_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(g_tac - g_tic);
output_file << "Generate time (ns): " << generate_time.count()
<< std::endl;
auto a_tic = std::chrono::steady_clock::now();
generated_solver->apply(b, x_clone);
auto a_tac = std::chrono::steady_clock::now();
auto apply_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(a_tac - a_tic);
output_file << "Apply time (ns): " << apply_time.count() << std::endl;
auto residual =
utils::compute_residual_norm(A.get(), b.get(), x_clone.get());
output_file << "Residual_norm: " << residual << std::endl;
}
{
auto gen_logger = std::make_shared<loggers::OperationLogger>();
auto generated_solver = solver_factory->generate(A);
output_file << "Generate operations times (ns):" << std::endl;
gen_logger->write_data(output_file);
auto apply_logger = std::make_shared<loggers::OperationLogger>();
auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
A.get(), b.get());
generated_solver->add_logger(res_logger);
generated_solver->apply(b, x);
output_file << "Apply operations times (ns):" << std::endl;
apply_logger->write_data(output_file);
res_logger->write_data(output_file);
}
std::cout << "Solution, first ten entries: \n";
print_vector(x.get());
std::cout << "The performance and residual data can be found in " << of_name
<< std::endl;
}