// Copyright 2020, 2021 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include #include #include #include
// NOTE: needed for the spdlog stopwatch.
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include

#if defined(HEYOKA_HAVE_REAL128)

#include

#endif

#include #include #include #include #include #include #include #include #include #include #include #include

#if defined(_MSC_VER) && !defined(__clang__)

// NOTE: MSVC has issues with the other "using"
// statement form.
using namespace fmt::literals;

#else

using fmt::literals::operator""_format;

#endif

namespace heyoka
{

namespace detail
{

namespace
{

std::string taylor_c_diff_mangle(const variable &)
{
    return "var";
}

std::string taylor_c_diff_mangle(const number &)
{
    return "num";
}

std::string taylor_c_diff_mangle(const param &)
{
    return "par";
}

} // namespace

// NOTE: precondition on name: must be conforming to LLVM requirements for
// function names, and must not contain "." (as we use it as a separator in
// the mangling scheme).
std::pair<std::string, std::vector<llvm::Type *>>
taylor_c_diff_func_name_args_impl(llvm::LLVMContext &context, const std::string &name, llvm::Type *val_t,
                                  std::uint32_t n_uvars, const std::vector<std::variant<variable, number, param>> &args,
                                  std::uint32_t n_hidden_deps)
{
    assert(val_t != nullptr);
    assert(n_uvars > 0u);

    // Init the name.
    auto fname = "heyoka.taylor_c_diff.{}."_format(name);

    // Init the vector of arguments:
    // - diff order,
    // - idx of the u variable whose diff is being computed,
    // - diff array (pointer to val_t),
    // - par ptr (pointer to scalar),
    // - time ptr (pointer to scalar).
    std::vector<llvm::Type *> fargs{llvm::Type::getInt32Ty(context),
                                    llvm::Type::getInt32Ty(context),
                                    llvm::PointerType::getUnqual(val_t),
                                    llvm::PointerType::getUnqual(val_t->getScalarType()),
                                    llvm::PointerType::getUnqual(val_t->getScalarType())};

    // Add the mangling and LLVM arg types for the argument types. Also, detect if
    // we have variables in the arguments.
    bool with_var = false;
    for (decltype(args.size()) i = 0; i < args.size(); ++i) {
        // Detect variable.
        if (std::holds_alternative<variable>(args[i])) {
            with_var = true;
        }

        // Name mangling.
        fname += std::visit([](const auto &v) { return taylor_c_diff_mangle(v); }, args[i]);

        // Add the arguments separator, if we are not at the
        // last argument.
        if (i != args.size() - 1u) {
            fname += '_';
        }

        // Add the LLVM function argument type.
        fargs.push_back(std::visit(
            [&](const auto &v) -> llvm::Type * {
                using type = detail::uncvref_t<decltype(v)>;

                if constexpr (std::is_same_v<type, number>) {
                    // For numbers, the argument is passed as a scalar
                    // floating-point value.
                    return val_t->getScalarType();
                } else {
                    // For vars and params, the argument is an index
                    // in an array.
                    return llvm::Type::getInt32Ty(context);
                }
            },
            args[i]));
    }

    // Close the argument list with a ".".
    // NOTE: this will result in a ".." in the name
    // if the function has zero arguments.
    fname += '.';

    // If we have variables in the arguments, add mangling
    // for n_uvars.
    if (with_var) {
        fname += "n_uvars_{}."_format(n_uvars);
    }

    // Finally, add the mangling for the floating-point type.
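    // NOTE: illustrative example only of the scheme assembled above (the exact
    // floating-point suffix depends on what llvm_mangle_type() returns): a
    // hypothetical two-argument "pow" implementation invoked with a variable and
    // a number, with n_uvars == 5, would be mangled as
    // "heyoka.taylor_c_diff.pow.var_num.n_uvars_5.<fp type mangling>".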
    fname += llvm_mangle_type(val_t);

    // Fill in the hidden dependency arguments. These are all indices.
    fargs.insert(fargs.end(), boost::numeric_cast<decltype(fargs.size())>(n_hidden_deps),
                 llvm::Type::getInt32Ty(context));

    return std::make_pair(std::move(fname), std::move(fargs));
}

namespace
{

template <typename T>
llvm::Value *taylor_codegen_numparam_num(llvm_state &s, const number &num, std::uint32_t batch_size)
{
    return vector_splat(s.builder(), codegen<T>(s, num), batch_size);
}

llvm::Value *taylor_codegen_numparam_par(llvm_state &s, const param &p, llvm::Value *par_ptr, std::uint32_t batch_size)
{
    assert(batch_size > 0u);

    auto &builder = s.builder();

    // Determine the index into the parameter array.
    // LCOV_EXCL_START
    if (p.idx() > std::numeric_limits<std::uint32_t>::max() / batch_size) {
        throw std::overflow_error("Overflow detected in the computation of the index into a parameter array");
    }
    // LCOV_EXCL_STOP
    const auto arr_idx = static_cast<std::uint32_t>(p.idx() * batch_size);

    // Compute the pointer to load from.
    auto *ptr = builder.CreateInBoundsGEP(par_ptr, {builder.getInt32(arr_idx)});

    // Load.
    return load_vector_from_memory(builder, ptr, batch_size);
}

} // namespace

llvm::Value *taylor_codegen_numparam_dbl(llvm_state &s, const number &num, llvm::Value *, std::uint32_t batch_size)
{
    return taylor_codegen_numparam_num<double>(s, num, batch_size);
}

llvm::Value *taylor_codegen_numparam_ldbl(llvm_state &s, const number &num, llvm::Value *, std::uint32_t batch_size)
{
    return taylor_codegen_numparam_num<long double>(s, num, batch_size);
}

#if defined(HEYOKA_HAVE_REAL128)

llvm::Value *taylor_codegen_numparam_f128(llvm_state &s, const number &num, llvm::Value *, std::uint32_t batch_size)
{
    return taylor_codegen_numparam_num<mppp::real128>(s, num, batch_size);
}

#endif

llvm::Value *taylor_codegen_numparam_dbl(llvm_state &s, const param &p, llvm::Value *par_ptr, std::uint32_t batch_size)
{
    return taylor_codegen_numparam_par(s, p, par_ptr, batch_size);
}

llvm::Value *taylor_codegen_numparam_ldbl(llvm_state &s, const param &p, llvm::Value *par_ptr, std::uint32_t batch_size)
{
    return taylor_codegen_numparam_par(s, p, par_ptr, batch_size);
}

#if defined(HEYOKA_HAVE_REAL128)

llvm::Value *taylor_codegen_numparam_f128(llvm_state &s, const param &p, llvm::Value *par_ptr, std::uint32_t batch_size)
{
    return taylor_codegen_numparam_par(s, p, par_ptr, batch_size);
}

#endif

// Codegen helpers for number/param for use in the generic c_diff implementations.
llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &s, const number &, llvm::Value *n, llvm::Value *,
                                            std::uint32_t batch_size)
{
    return vector_splat(s.builder(), n, batch_size);
}

llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &s, const param &, llvm::Value *p, llvm::Value *par_ptr,
                                            std::uint32_t batch_size)
{
    auto &builder = s.builder();

    // Fetch the pointer into par_ptr.
    // NOTE: the overflow check is done in taylor_compute_jet().
    auto *ptr = builder.CreateInBoundsGEP(par_ptr, {builder.CreateMul(p, builder.getInt32(batch_size))});

    return load_vector_from_memory(builder, ptr, batch_size);
}

// Helper to fetch the derivative of order 'order' of the u variable at index u_idx from the
// derivative array 'arr'. The total number of u variables is n_uvars.
llvm::Value *taylor_fetch_diff(const std::vector<llvm::Value *> &arr, std::uint32_t u_idx, std::uint32_t order,
                               std::uint32_t n_uvars)
{
    // Sanity check.
    assert(u_idx < n_uvars);

    // Compute the index.
    const auto idx = static_cast<decltype(arr.size())>(order) * n_uvars + u_idx;
    assert(idx < arr.size());

    return arr[idx];
}

// Load the derivative of order 'order' of the u variable u_idx from the array of Taylor derivatives diff_arr.
// n_uvars is the total number of u variables.
llvm::Value *taylor_c_load_diff(llvm_state &s, llvm::Value *diff_arr, std::uint32_t n_uvars, llvm::Value *order,
                                llvm::Value *u_idx)
{
    auto &builder = s.builder();

    // NOTE: overflow check has already been done to ensure that the
    // total size of diff_arr fits in a 32-bit unsigned integer.
    auto *ptr = builder.CreateInBoundsGEP(
        diff_arr, {builder.CreateAdd(builder.CreateMul(order, builder.getInt32(n_uvars)), u_idx)});

    return builder.CreateLoad(ptr);
}

namespace
{

// Simplify a Taylor decomposition by removing
// common subexpressions.
// NOTE: the hidden deps are not considered for CSE
// purposes, only the actual subexpressions.
taylor_dc_t taylor_decompose_cse(taylor_dc_t &v_ex, std::vector<std::uint32_t> &sv_funcs_dc,
                                 taylor_dc_t::size_type n_eq)
{
    using idx_t = taylor_dc_t::size_type;

    // Log runtime in trace mode.
    spdlog::stopwatch sw;

    // Cache the original size for logging later.
    const auto orig_size = v_ex.size();

    // A Taylor decomposition is supposed
    // to have n_eq variables at the beginning,
    // n_eq variables at the end and possibly
    // extra variables in the middle.
    assert(v_ex.size() >= n_eq * 2u);

    // Init the return value.
    taylor_dc_t retval;

    // expression -> idx map. This will end up containing
    // all the unique expressions from v_ex, and it will
    // map them to their indices in retval (which will
    // in general differ from their indices in v_ex).
    std::unordered_map<expression, taylor_dc_t::size_type> ex_map;

    // Map for the renaming of u variables
    // in the expressions.
    std::unordered_map<std::string, std::string> uvars_rename;

    // The first n_eq definitions are just renaming
    // of the state variables into u variables.
    for (idx_t i = 0; i < n_eq; ++i) {
        assert(std::holds_alternative<variable>(v_ex[i].first.value()));

        // NOTE: no hidden deps allowed here.
        assert(v_ex[i].second.empty());

        retval.push_back(std::move(v_ex[i]));

        // NOTE: the u vars that correspond to state
        // variables are never simplified,
        // thus map them onto themselves.
        [[maybe_unused]] const auto res = uvars_rename.emplace("u_{}"_format(i), "u_{}"_format(i));
        assert(res.second);
    }

    // Handle the u variables which do not correspond to state variables.
    for (auto i = n_eq; i < v_ex.size() - n_eq; ++i) {
        auto &[ex, deps] = v_ex[i];

        // Rename the u variables in ex.
        rename_variables(ex, uvars_rename);

        if (auto it = ex_map.find(ex); it == ex_map.end()) {
            // This is the first occurrence of ex in the
            // decomposition. Add it to retval.
            retval.emplace_back(ex, std::move(deps));

            // Add ex to ex_map, mapping it to
            // the index it corresponds to in retval
            // (let's call it j).
            ex_map.emplace(std::move(ex), retval.size() - 1u);

            // Update uvars_rename. This will ensure that
            // occurrences of the variable 'u_i' in the next
            // elements of v_ex will be renamed to 'u_j'.
            [[maybe_unused]] const auto res
                = uvars_rename.emplace("u_{}"_format(i), "u_{}"_format(retval.size() - 1u));
            assert(res.second);
        } else {
            // ex is redundant. This means
            // that it already appears in retval at index
            // it->second. Don't add anything to retval,
            // and remap the variable name 'u_i' to
            // 'u_{it->second}'.
            [[maybe_unused]] const auto res = uvars_rename.emplace("u_{}"_format(i), "u_{}"_format(it->second));
            assert(res.second);
        }
    }

    // Handle the derivatives of the state variables at the
    // end of the decomposition. We just need to ensure that
    // the u variables in their definitions are renamed with
    // the new indices.
    for (auto i = v_ex.size() - n_eq; i < v_ex.size(); ++i) {
        auto &[ex, deps] = v_ex[i];

        // NOTE: here we expect only vars, numbers or params,
        // and no hidden dependencies.
        assert(std::holds_alternative<variable>(ex.value()) || std::holds_alternative<number>(ex.value())
               || std::holds_alternative<param>(ex.value()));
        assert(deps.empty());

        rename_variables(ex, uvars_rename);

        retval.emplace_back(std::move(ex), std::move(deps));
    }

    // Re-adjust all indices in the hidden dependencies in order to account
    // for the renaming of the uvars.
    for (auto &[_, deps] : retval) {
        for (auto &idx : deps) {
            auto it = uvars_rename.find("u_{}"_format(idx));
            assert(it != uvars_rename.end());
            idx = uname_to_index(it->second);
        }
    }

    // Same for the indices in sv_funcs_dc.
    for (auto &idx : sv_funcs_dc) {
        auto it = uvars_rename.find("u_{}"_format(idx));
        assert(it != uvars_rename.end());
        idx = uname_to_index(it->second);
    }

    get_logger()->debug("Taylor CSE reduced decomposition size from {} to {}", orig_size, retval.size());
    get_logger()->trace("Taylor CSE runtime: {}", sw);

    return retval;
}

// Perform a topological sort on a graph representation
// of the Taylor decomposition. This can improve performance
// by grouping together operations that can be performed in parallel,
// and it also makes compact mode much more effective by creating
// clusters of subexpressions whose derivatives can be computed in
// parallel.
// NOTE: the original decomposition dc is already topologically sorted,
// in the sense that the definitions of the u variables are already
// ordered according to dependency. However, because the original decomposition
// comes from a depth-first search, it has the tendency to group together
// expressions which are dependent on each other. By doing another topological
// sort, this time based on breadth-first search, we determine another valid
// sorting in which independent operations tend to be clustered together.
auto taylor_sort_dc(taylor_dc_t &dc, std::vector<std::uint32_t> &sv_funcs_dc, taylor_dc_t::size_type n_eq)
{
    // A Taylor decomposition is supposed
    // to have n_eq variables at the beginning,
    // n_eq variables at the end and possibly
    // extra variables in the middle
    assert(dc.size() >= n_eq * 2u);

    // Log runtime in trace mode.
    spdlog::stopwatch sw;

    // The graph type that we will use for the topological sorting.
    using graph_t = boost::adjacency_list<boost::vecS, boost::vecS, boost::bidirectionalS>;

    graph_t g;

    // Add the root node.
    const auto root_v = boost::add_vertex(g);

    // Add the nodes corresponding to the state variables.
    for (decltype(n_eq) i = 0; i < n_eq; ++i) {
        auto v = boost::add_vertex(g);

        // Add a dependency on the root node.
        boost::add_edge(root_v, v, g);
    }

    // Add the rest of the u variables.
    for (decltype(n_eq) i = n_eq; i < dc.size() - n_eq; ++i) {
        auto v = boost::add_vertex(g);

        // Fetch the list of variables in the current expression.
        const auto vars = get_variables(dc[i].first);

        if (vars.empty()) {
            // The current expression does not contain
            // any variable: make it depend on the root
            // node. This means that in the topological
            // sort below, the current u var will appear
            // immediately after the state variables.
            boost::add_edge(root_v, v, g);
        } else {
            // Mark the current u variable as depending on all the
            // variables in the current expression.
            for (const auto &var : vars) {
                // Extract the index.
                const auto idx = uname_to_index(var);

                // Add the dependency.
                // NOTE: add +1 because the i-th vertex
                // corresponds to the (i-1)-th u variable
                // due to the presence of the root node.
                boost::add_edge(boost::vertex(idx + 1u, g), v, g);
            }
        }
    }

    assert(boost::num_vertices(g) - 1u == dc.size() - n_eq);

    // Run the BF topological sort on the graph. This is Kahn's algorithm:
    // https://en.wikipedia.org/wiki/Topological_sorting

    // The result of the sort.
    std::vector<decltype(dc.size())> v_idx;

    // Temp variable used to sort a list of edges in the loop below.
    std::vector<boost::graph_traits<graph_t>::edge_descriptor> tmp_edges;

    // The set of all nodes with no incoming edge.
    std::deque<boost::graph_traits<graph_t>::vertex_descriptor> tmp;

    // The root node has no incoming edge.
    tmp.push_back(0);

    // Main loop.
    while (!tmp.empty()) {
        // Pop the first element from tmp
        // and append it to the result.
        const auto v = tmp.front();
        tmp.pop_front();
        v_idx.push_back(v);

        // Fetch all the out edges of v and sort them according
        // to the target vertex.
        // NOTE: the sorting is important to ensure that all the state
        // variables are inserted into v_idx in the correct order.
        const auto e_range = boost::out_edges(v, g);
        tmp_edges.assign(e_range.first, e_range.second);
        std::sort(tmp_edges.begin(), tmp_edges.end(),
                  [&g](const auto &e1, const auto &e2) { return boost::target(e1, g) < boost::target(e2, g); });

        // For each out edge of v:
        // - eliminate it;
        // - check if the target vertex of the edge
        //   has other incoming edges;
        // - if it does not, insert it into tmp.
        for (auto &e : tmp_edges) {
            // Fetch the target of the edge.
            const auto t = boost::target(e, g);

            // Remove the edge.
            boost::remove_edge(e, g);

            // Get the range of vertices connecting to t.
            const auto iav = boost::inv_adjacent_vertices(t, g);
            if (iav.first == iav.second) {
                // t does not have any incoming edges, add it to tmp.
                tmp.push_back(t);
            }
        }
    }

    assert(v_idx.size() == boost::num_vertices(g));
    assert(boost::num_edges(g) == 0u);

    // Adjust v_idx: remove the index of the root node,
    // decrease by one all other indices, insert the final
    // n_eq indices.
    for (decltype(v_idx.size()) i = 0; i < v_idx.size() - 1u; ++i) {
        v_idx[i] = v_idx[i + 1u] - 1u;
    }
    v_idx.resize(boost::numeric_cast<decltype(v_idx.size())>(dc.size()));
    std::iota(v_idx.data() + dc.size() - n_eq, v_idx.data() + dc.size(), dc.size() - n_eq);

    // Create the remapping dictionary.
    std::unordered_map<std::string, std::string> remap;
    // NOTE: the u vars that correspond to state
    // variables were inserted into v_idx in the original
    // order, thus they are not re-sorted and they do not
    // need renaming.
    for (decltype(v_idx.size()) i = 0; i < n_eq; ++i) {
        [[maybe_unused]] const auto res = remap.emplace("u_{}"_format(i), "u_{}"_format(i));
        assert(res.second);
    }
    // Establish the remapping for the u variables that are not
    // state variables.
    for (decltype(v_idx.size()) i = n_eq; i < v_idx.size() - n_eq; ++i) {
        [[maybe_unused]] const auto res = remap.emplace("u_{}"_format(v_idx[i]), "u_{}"_format(i));
        assert(res.second);
    }

    // Do the remap for the definitions of the u variables, the
    // derivatives and the hidden deps.
    for (auto *it = dc.data() + n_eq; it != dc.data() + dc.size(); ++it) {
        // Remap the expression.
        rename_variables(it->first, remap);

        // Remap the hidden dependencies.
        for (auto &idx : it->second) {
            auto it_remap = remap.find("u_{}"_format(idx));
            assert(it_remap != remap.end());
            idx = uname_to_index(it_remap->second);
        }
    }

    // Do the remap for sv_funcs.
    for (auto &idx : sv_funcs_dc) {
        auto it_remap = remap.find("u_{}"_format(idx));
        assert(it_remap != remap.end());
        idx = uname_to_index(it_remap->second);
    }

    // Reorder the decomposition.
    taylor_dc_t retval;
    retval.reserve(v_idx.size());
    for (auto idx : v_idx) {
        retval.push_back(std::move(dc[idx]));
    }

    get_logger()->trace("Taylor topological sort runtime: {}", sw);

    return retval;
}

// LCOV_EXCL_START

#if !defined(NDEBUG)

// Helper to verify a Taylor decomposition.
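// NOTE: an illustrative sketch of the layout being verified below (example
// only, not taken from an actual run): for the system
//   x' = x + y*y, y' = x,
// a possible decomposition is
//   u_0 = x, u_1 = y,              (the state variables)
//   u_2 = u_1*u_1, u_3 = u_0 + u_2 (the elementary subexpressions)
//   u_3, u_0                       (the rhs, rewritten in terms of u variables)
// that is, n_eq entries at the beginning, n_eq entries at the end, and the
// decomposed subexpressions in the middle.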
void verify_taylor_dec(const std::vector &orig, const taylor_dc_t &dc) { using idx_t = taylor_dc_t::size_type; const auto n_eq = orig.size(); assert(dc.size() >= n_eq * 2u); // The first n_eq expressions of u variables // must be just variables. No hidden dependencies // are allowed. for (idx_t i = 0; i < n_eq; ++i) { assert(std::holds_alternative(dc[i].first.value())); assert(dc[i].second.empty()); } // From n_eq to dc.size() - n_eq, the expressions // must be functions whose arguments // are either variables in the u_n form, // where n < i, or numbers/params. // The hidden dependencies must contain indices // only in the [n_eq, dc.size() - n_eq) range. for (auto i = n_eq; i < dc.size() - n_eq; ++i) { std::visit( [i](const auto &v) { using type = detail::uncvref_t; if constexpr (std::is_same_v) { auto check_arg = [i](const auto &arg) { if (auto p_var = std::get_if(&arg.value())) { assert(p_var->name().rfind("u_", 0) == 0); assert(uname_to_index(p_var->name()) < i); } else if (std::get_if(&arg.value()) == nullptr && std::get_if(&arg.value()) == nullptr) { assert(false); } }; for (const auto &arg : v.args()) { check_arg(arg); } } else { assert(false); } }, dc[i].first.value()); for (auto idx : dc[i].second) { assert(idx >= n_eq); assert(idx < dc.size() - n_eq); // Hidden dep onto itself does not make any sense. assert(idx != i); } } // From dc.size() - n_eq to dc.size(), the expressions // must be either variables in the u_n form, where n < i, // or numbers/params. for (auto i = dc.size() - n_eq; i < dc.size(); ++i) { std::visit( [i](const auto &v) { using type = detail::uncvref_t; if constexpr (std::is_same_v) { assert(v.name().rfind("u_", 0) == 0); assert(uname_to_index(v.name()) < i); } else if constexpr (!std::is_same_v && !std::is_same_v) { assert(false); } }, dc[i].first.value()); // No hidden dependencies. assert(dc[i].second.empty()); } std::unordered_map subs_map; // For each u variable, expand its definition // in terms of state variables or other u variables, // and store it in subs_map. for (idx_t i = 0; i < dc.size() - n_eq; ++i) { subs_map.emplace("u_{}"_format(i), subs(dc[i].first, subs_map)); } // Reconstruct the right-hand sides of the system // and compare them to the original ones. for (auto i = dc.size() - n_eq; i < dc.size(); ++i) { assert(subs(dc[i].first, subs_map) == orig[i - (dc.size() - n_eq)]); } } // Helper to verify the decomposition of the sv funcs. void verify_taylor_dec_sv_funcs(const std::vector &sv_funcs_dc, const std::vector &sv_funcs, const taylor_dc_t &dc, std::vector::size_type n_eq) { assert(sv_funcs.size() == sv_funcs_dc.size()); std::unordered_map subs_map; // For each u variable, expand its definition // in terms of state variables or other u variables, // and store it in subs_map. for (decltype(dc.size()) i = 0; i < dc.size() - n_eq; ++i) { subs_map.emplace("u_{}"_format(i), subs(dc[i].first, subs_map)); } // Reconstruct the sv functions and compare them to the // original ones. for (decltype(sv_funcs.size()) i = 0; i < sv_funcs.size(); ++i) { assert(sv_funcs_dc[i] < dc.size()); auto sv_func = subs(dc[sv_funcs_dc[i]].first, subs_map); assert(sv_func == sv_funcs[i]); } } #endif // LCOV_EXCL_STOP // A couple of helpers for deep-copying containers of expressions. 
auto copy(const std::vector &v_ex) { std::vector ret; ret.reserve(v_ex.size()); std::transform(v_ex.begin(), v_ex.end(), std::back_inserter(ret), [](const expression &e) { return copy(e); }); return ret; } auto copy(const std::vector> &v) { std::vector> ret; ret.reserve(v.size()); std::transform(v.begin(), v.end(), std::back_inserter(ret), [](const auto &p) { return std::pair{copy(p.first), copy(p.second)}; }); return ret; } } // namespace } // namespace detail // Taylor decomposition with automatic deduction // of variables. // NOTE: when dealing with functions with hidden deps, // we should consider avoiding adding hidden deps if the // function argument(s) is a number/param: the hidden deps // won't be used for the computation of the derivatives // and thus they can be optimised out. Note that a straightforward // implementation of this idea this will only work when the argument // is a number/param, not when, e.g., the argument is par[0] + par[1] - in // order to simplify this out, it should be recognized that the definition // of a u variable depends only on numbers/params. std::pair> taylor_decompose(const std::vector &v_ex_, const std::vector &sv_funcs_) { // Need to operate on copies due to in-place mutation // via rename_variables(). auto v_ex = detail::copy(v_ex_); auto sv_funcs = detail::copy(sv_funcs_); if (v_ex.empty()) { throw std::invalid_argument("Cannot decompose a system of zero equations"); } // Determine the variables in the system of equations. std::set vars; for (const auto &ex : v_ex) { for (const auto &var : get_variables(ex)) { vars.emplace(var); } } if (vars.size() != v_ex.size()) { throw std::invalid_argument( "The number of deduced variables for a Taylor decomposition ({}) differs from the number of equations ({})"_format( vars.size(), v_ex.size())); } // Check that the expressions in sv_funcs contain only // state variables. for (const auto &ex : sv_funcs) { for (const auto &var : get_variables(ex)) { if (vars.find(var) == vars.end()) { throw std::invalid_argument("The extra functions in a Taylor decomposition contain the variable '{}', " "which is not a state variable"_format(var)); } } } // Cache the number of equations/variables // for later use. const auto n_eq = v_ex.size(); // Create the map for renaming the variables to u_i. // The renaming will be done in alphabetical order. std::unordered_map repl_map; { decltype(vars.size()) var_idx = 0; for (const auto &var : vars) { [[maybe_unused]] const auto eres = repl_map.emplace(var, "u_{}"_format(var_idx++)); assert(eres.second); } } #if !defined(NDEBUG) // Store a copy of the original system and // sv_funcs for checking later. auto orig_v_ex = detail::copy(v_ex); auto orig_sv_funcs = detail::copy(sv_funcs); #endif // Rename the variables in the original equations. for (auto &ex : v_ex) { rename_variables(ex, repl_map); } // Rename the variables in sv_funcs. for (auto &ex : sv_funcs) { rename_variables(ex, repl_map); } // Init the decomposition. It begins with a list // of the original variables of the system. taylor_dc_t u_vars_defs; u_vars_defs.reserve(vars.size()); for (const auto &var : vars) { u_vars_defs.emplace_back(variable{var}, std::vector{}); } // Log the construction runtime in trace mode. spdlog::stopwatch sw; // Run the decomposition on each equation. for (auto &ex : v_ex) { // Decompose the current equation. 
if (const auto dres = taylor_decompose(ex, u_vars_defs)) { // NOTE: if the equation was decomposed // (that is, it is not constant or a single variable), // we have to update the original definition // of the equation in v_ex // so that it points to the u variable // that now represents it. // NOTE: all functions are forced to return dres != 0 // in the func API, so the only entities that // can return dres == 0 are const/params or // variables. ex = expression{"u_{}"_format(dres)}; } } // Decompose sv_funcs, and write into sv_funcs_dc the index // of the u variable which each sv_func corresponds to. std::vector sv_funcs_dc; for (auto &sv_ex : sv_funcs) { if (const auto *var_ptr = std::get_if(&sv_ex.value())) { // The current sv_func is a variable, add its index to sv_funcs_dc. sv_funcs_dc.push_back(detail::uname_to_index(var_ptr->name())); } else if (const auto dres = taylor_decompose(sv_ex, u_vars_defs)) { // The sv_func was decomposed, add to sv_funcs_dc // the index of the u variable which represents // the result of the decomposition. sv_funcs_dc.push_back(boost::numeric_cast(dres)); } else { // The sv_func was not decomposed, meaning it's a const/param. throw std::invalid_argument( "The extra functions in a Taylor decomposition cannot be constants or parameters"); } } assert(sv_funcs_dc.size() == sv_funcs.size()); // Append the (possibly updated) definitions of the diff equations // in terms of u variables. for (auto &ex : v_ex) { u_vars_defs.emplace_back(std::move(ex), std::vector{}); } detail::get_logger()->trace("Taylor decomposition construction runtime: {}", sw); #if !defined(NDEBUG) // Verify the decomposition. detail::verify_taylor_dec(orig_v_ex, u_vars_defs); detail::verify_taylor_dec_sv_funcs(sv_funcs_dc, orig_sv_funcs, u_vars_defs, n_eq); #endif // Simplify the decomposition. u_vars_defs = detail::taylor_decompose_cse(u_vars_defs, sv_funcs_dc, n_eq); #if !defined(NDEBUG) // Verify the simplified decomposition. detail::verify_taylor_dec(orig_v_ex, u_vars_defs); detail::verify_taylor_dec_sv_funcs(sv_funcs_dc, orig_sv_funcs, u_vars_defs, n_eq); #endif // Run the breadth-first topological sort on the decomposition. u_vars_defs = detail::taylor_sort_dc(u_vars_defs, sv_funcs_dc, n_eq); #if !defined(NDEBUG) // Verify the reordered decomposition. detail::verify_taylor_dec(orig_v_ex, u_vars_defs); detail::verify_taylor_dec_sv_funcs(sv_funcs_dc, orig_sv_funcs, u_vars_defs, n_eq); #endif return std::make_pair(std::move(u_vars_defs), std::move(sv_funcs_dc)); } // Taylor decomposition from lhs and rhs // of a system of equations. std::pair> taylor_decompose(const std::vector> &sys_, const std::vector &sv_funcs_) { // Need to operate on copies due to in-place mutation // via rename_variables(). auto sys = detail::copy(sys_); auto sv_funcs = detail::copy(sv_funcs_); if (sys.empty()) { throw std::invalid_argument("Cannot decompose a system of zero equations"); } // Determine the variables in the system of equations // from the lhs of the equations. We need to ensure that: // - all the lhs expressions are variables // and there are no duplicates, // - all the variables in the rhs expressions // appear in the lhs expressions. // Note that not all variables in the lhs // need to appear in the rhs. // This will eventually contain the list // of all variables in the system. std::vector lhs_vars; // Maintain a set as well to check for duplicates. std::unordered_set lhs_vars_set; // The set of variables in the rhs. 
std::unordered_set rhs_vars_set; for (const auto &p : sys) { const auto &lhs = p.first; const auto &rhs = p.second; // Infer the variable from the current lhs. std::visit( [&lhs, &lhs_vars, &lhs_vars_set](const auto &v) { if constexpr (std::is_same_v, variable>) { // Check if this is a duplicate variable. if (auto res = lhs_vars_set.emplace(v.name()); res.second) { // Not a duplicate, add it to lhs_vars. lhs_vars.emplace_back(v.name()); } else { // Duplicate, error out. throw std::invalid_argument( "Error in the Taylor decomposition of a system of equations: the variable '{}' " "appears in the left-hand side twice"_format(v.name())); } } else { throw std::invalid_argument( "Error in the Taylor decomposition of a system of equations: the " "left-hand side contains the expression '{}', which is not a variable"_format(lhs)); } }, lhs.value()); // Update the global list of variables // for the rhs. for (auto &var : get_variables(rhs)) { rhs_vars_set.emplace(std::move(var)); } } // Check that all variables in the rhs appear in the lhs. for (const auto &var : rhs_vars_set) { if (lhs_vars_set.find(var) == lhs_vars_set.end()) { throw std::invalid_argument("Error in the Taylor decomposition of a system of equations: the variable '{}' " "appears in the right-hand side but not in the left-hand side"_format(var)); } } // Check that the expressions in sv_funcs contain only // state variables. for (const auto &ex : sv_funcs) { for (const auto &var : get_variables(ex)) { if (lhs_vars_set.find(var) == lhs_vars_set.end()) { throw std::invalid_argument("The extra functions in a Taylor decomposition contain the variable '{}', " "which is not a state variable"_format(var)); } } } // Cache the number of equations/variables. const auto n_eq = sys.size(); assert(n_eq == lhs_vars.size()); // Create the map for renaming the variables to u_i. // The renaming will be done following the order of the lhs // variables. std::unordered_map repl_map; for (decltype(lhs_vars.size()) i = 0; i < lhs_vars.size(); ++i) { [[maybe_unused]] const auto eres = repl_map.emplace(lhs_vars[i], "u_{}"_format(i)); assert(eres.second); } #if !defined(NDEBUG) // Store a copy of the original rhs and sv_funcs // for checking later. std::vector orig_rhs; orig_rhs.reserve(sys.size()); for (const auto &[_, rhs_ex] : sys) { orig_rhs.push_back(copy(rhs_ex)); } auto orig_sv_funcs = detail::copy(sv_funcs); #endif // Rename the variables in the original equations. for (auto &[_, rhs_ex] : sys) { rename_variables(rhs_ex, repl_map); } // Rename the variables in sv_funcs. for (auto &ex : sv_funcs) { rename_variables(ex, repl_map); } // Log the construction runtime in trace mode. spdlog::stopwatch sw; // Init the decomposition. It begins with a list // of the original lhs variables of the system. taylor_dc_t u_vars_defs; u_vars_defs.reserve(lhs_vars.size()); for (const auto &var : lhs_vars) { u_vars_defs.emplace_back(variable{var}, std::vector{}); } // Run the decomposition on each equation. for (auto &[_, ex] : sys) { // Decompose the current equation. if (const auto dres = taylor_decompose(ex, u_vars_defs)) { // NOTE: if the equation was decomposed // (that is, it is not constant or a single variable), // we have to update the original definition // of the equation in sys // so that it points to the u variable // that now represents it. // NOTE: all functions are forced to return dres != 0 // in the func API, so the only entities that // can return dres == 0 are const/params or // variables. 
ex = expression{"u_{}"_format(dres)}; } } // Decompose sv_funcs, and write into sv_funcs_dc the index // of the u variable which each sv_func corresponds to. std::vector sv_funcs_dc; for (auto &sv_ex : sv_funcs) { if (auto *const var_ptr = std::get_if(&sv_ex.value())) { // The current sv_func is a variable, add its index to sv_funcs_dc. sv_funcs_dc.push_back(detail::uname_to_index(var_ptr->name())); } else if (const auto dres = taylor_decompose(sv_ex, u_vars_defs)) { // The sv_func was decomposed, add to sv_funcs_dc // the index of the u variable which represents // the result of the decomposition. sv_funcs_dc.push_back(boost::numeric_cast(dres)); } else { // The sv_func was not decomposed, meaning it's a const/param. throw std::invalid_argument( "The extra functions in a Taylor decomposition cannot be constants or parameters"); } } assert(sv_funcs_dc.size() == sv_funcs.size()); // Append the (possibly updated) definitions of the diff equations // in terms of u variables. for (auto &[_, rhs] : sys) { u_vars_defs.emplace_back(std::move(rhs), std::vector{}); } detail::get_logger()->trace("Taylor decomposition construction runtime: {}", sw); #if !defined(NDEBUG) // Verify the decomposition. detail::verify_taylor_dec(orig_rhs, u_vars_defs); detail::verify_taylor_dec_sv_funcs(sv_funcs_dc, orig_sv_funcs, u_vars_defs, n_eq); #endif // Simplify the decomposition. u_vars_defs = detail::taylor_decompose_cse(u_vars_defs, sv_funcs_dc, n_eq); #if !defined(NDEBUG) // Verify the simplified decomposition. detail::verify_taylor_dec(orig_rhs, u_vars_defs); detail::verify_taylor_dec_sv_funcs(sv_funcs_dc, orig_sv_funcs, u_vars_defs, n_eq); #endif // Run the breadth-first topological sort on the decomposition. u_vars_defs = detail::taylor_sort_dc(u_vars_defs, sv_funcs_dc, n_eq); #if !defined(NDEBUG) // Verify the reordered decomposition. detail::verify_taylor_dec(orig_rhs, u_vars_defs); detail::verify_taylor_dec_sv_funcs(sv_funcs_dc, orig_sv_funcs, u_vars_defs, n_eq); #endif return std::make_pair(std::move(u_vars_defs), std::move(sv_funcs_dc)); } namespace detail { namespace { // Implementation of the streaming operator for the scalar integrators. 
template std::ostream &taylor_adaptive_stream_impl(std::ostream &os, const taylor_adaptive_impl &ta) { std::ostringstream oss; oss.exceptions(std::ios_base::failbit | std::ios_base::badbit); oss.imbue(std::locale::classic()); oss << std::showpoint; oss.precision(std::numeric_limits::max_digits10); oss << std::boolalpha; oss << "Tolerance : " << ta.get_tol() << '\n'; oss << "High accuracy : " << ta.get_high_accuracy() << '\n'; oss << "Compact mode : " << ta.get_compact_mode() << '\n'; oss << "Taylor order : " << ta.get_order() << '\n'; oss << "Dimension : " << ta.get_dim() << '\n'; oss << "Time : " << ta.get_time() << '\n'; oss << "State : ["; for (decltype(ta.get_state().size()) i = 0; i < ta.get_state().size(); ++i) { oss << ta.get_state()[i]; if (i != ta.get_state().size() - 1u) { oss << ", "; } } oss << "]\n"; if (!ta.get_pars().empty()) { oss << "Parameters : ["; for (decltype(ta.get_pars().size()) i = 0; i < ta.get_pars().size(); ++i) { oss << ta.get_pars()[i]; if (i != ta.get_pars().size() - 1u) { oss << ", "; } } oss << "]\n"; } if (ta.with_events()) { if (!ta.get_t_events().empty()) { oss << "N of terminal events : " << ta.get_t_events().size() << '\n'; } if (!ta.get_nt_events().empty()) { oss << "N of non-terminal events: " << ta.get_nt_events().size() << '\n'; } } return os << oss.str(); } // Implementation of the streaming operator for the batch integrators. template std::ostream &taylor_adaptive_batch_stream_impl(std::ostream &os, const taylor_adaptive_batch_impl &ta) { std::ostringstream oss; oss.exceptions(std::ios_base::failbit | std::ios_base::badbit); oss.imbue(std::locale::classic()); oss << std::showpoint; oss.precision(std::numeric_limits::max_digits10); oss << std::boolalpha; oss << "Tolerance : " << ta.get_tol() << '\n'; oss << "High accuracy : " << ta.get_high_accuracy() << '\n'; oss << "Compact mode : " << ta.get_compact_mode() << '\n'; oss << "Taylor order : " << ta.get_order() << '\n'; oss << "Dimension : " << ta.get_dim() << '\n'; oss << "Batch size : " << ta.get_batch_size() << '\n'; oss << "Time : ["; for (decltype(ta.get_time().size()) i = 0; i < ta.get_time().size(); ++i) { oss << ta.get_time()[i]; if (i != ta.get_time().size() - 1u) { oss << ", "; } } oss << "]\n"; oss << "State : ["; for (decltype(ta.get_state().size()) i = 0; i < ta.get_state().size(); ++i) { oss << ta.get_state()[i]; if (i != ta.get_state().size() - 1u) { oss << ", "; } } oss << "]\n"; if (!ta.get_pars().empty()) { oss << "Parameters : ["; for (decltype(ta.get_pars().size()) i = 0; i < ta.get_pars().size(); ++i) { oss << ta.get_pars()[i]; if (i != ta.get_pars().size() - 1u) { oss << ", "; } } oss << "]\n"; } if (ta.with_events()) { if (!ta.get_t_events().empty()) { oss << "N of terminal events : " << ta.get_t_events().size() << '\n'; } if (!ta.get_nt_events().empty()) { oss << "N of non-terminal events: " << ta.get_nt_events().size() << '\n'; } } return os << oss.str(); } } // namespace template <> std::ostream &operator<<(std::ostream &os, const taylor_adaptive_impl &ta) { return taylor_adaptive_stream_impl(os, ta); } template <> std::ostream &operator<<(std::ostream &os, const taylor_adaptive_impl &ta) { return taylor_adaptive_stream_impl(os, ta); } #if defined(HEYOKA_HAVE_REAL128) template <> std::ostream &operator<<(std::ostream &os, const taylor_adaptive_impl &ta) { return taylor_adaptive_stream_impl(os, ta); } #endif template <> std::ostream &operator<<(std::ostream &os, const taylor_adaptive_batch_impl &ta) { return taylor_adaptive_batch_stream_impl(os, ta); } template <> 
std::ostream &operator<<(std::ostream &os, const taylor_adaptive_batch_impl &ta) { return taylor_adaptive_batch_stream_impl(os, ta); } #if defined(HEYOKA_HAVE_REAL128) template <> std::ostream &operator<<(std::ostream &os, const taylor_adaptive_batch_impl &ta) { return taylor_adaptive_batch_stream_impl(os, ta); } #endif } // namespace detail #define HEYOKA_TAYLOR_ENUM_STREAM_CASE(val) \ case val: \ os << #val; \ break std::ostream &operator<<(std::ostream &os, taylor_outcome oc) { switch (oc) { HEYOKA_TAYLOR_ENUM_STREAM_CASE(taylor_outcome::success); HEYOKA_TAYLOR_ENUM_STREAM_CASE(taylor_outcome::step_limit); HEYOKA_TAYLOR_ENUM_STREAM_CASE(taylor_outcome::time_limit); HEYOKA_TAYLOR_ENUM_STREAM_CASE(taylor_outcome::err_nf_state); HEYOKA_TAYLOR_ENUM_STREAM_CASE(taylor_outcome::cb_stop); default: if (oc >= taylor_outcome{0}) { // Continuing terminal event. os << "taylor_outcome::terminal_event_{} (continuing)"_format(static_cast(oc)); } else if (oc > taylor_outcome::success) { // Stopping terminal event. os << "taylor_outcome::terminal_event_{} (stopping)"_format(-static_cast(oc) - 1); } else { // Unknown value. os << "taylor_outcome::??"; } } return os; } std::ostream &operator<<(std::ostream &os, event_direction dir) { switch (dir) { HEYOKA_TAYLOR_ENUM_STREAM_CASE(event_direction::any); HEYOKA_TAYLOR_ENUM_STREAM_CASE(event_direction::positive); HEYOKA_TAYLOR_ENUM_STREAM_CASE(event_direction::negative); default: // Unknown value. os << "event_direction::??"; } return os; } #undef HEYOKA_TAYLOR_OUTCOME_STREAM_CASE namespace detail { namespace { // Helper to create the callback used in the default // constructor of a non-terminal event. template auto nt_event_def_cb() { if constexpr (B) { return [](taylor_adaptive_batch_impl &, T, int, std::uint32_t) {}; } else { return [](taylor_adaptive_impl &, T, int) {}; } } } // namespace template nt_event_impl::nt_event_impl() : nt_event_impl(expression{}, nt_event_def_cb()) { } template void nt_event_impl::finalise_ctor(event_direction d) { if (!callback) { throw std::invalid_argument("Cannot construct a non-terminal event with an empty callback"); } if (d < event_direction::negative || d > event_direction::positive) { throw std::invalid_argument("Invalid value selected for the direction of a non-terminal event"); } dir = d; } template nt_event_impl::nt_event_impl(const nt_event_impl &o) : eq(copy(o.eq)), callback(o.callback), dir(o.dir) { } template nt_event_impl::nt_event_impl(nt_event_impl &&) noexcept = default; template nt_event_impl &nt_event_impl::operator=(const nt_event_impl &o) { if (this != &o) { *this = nt_event_impl(o); } return *this; } template nt_event_impl &nt_event_impl::operator=(nt_event_impl &&) noexcept = default; template nt_event_impl::~nt_event_impl() = default; template const expression &nt_event_impl::get_expression() const { return eq; } template const typename nt_event_impl::callback_t &nt_event_impl::get_callback() const { return callback; } template event_direction nt_event_impl::get_direction() const { return dir; } template t_event_impl::t_event_impl() : t_event_impl(expression{}) { } template void t_event_impl::finalise_ctor(callback_t cb, T cd, event_direction d) { using std::isfinite; callback = std::move(cb); if (!isfinite(cd)) { throw std::invalid_argument("Cannot set a non-finite cooldown value for a terminal event"); } cooldown = cd; if (d < event_direction::negative || d > event_direction::positive) { throw std::invalid_argument("Invalid value selected for the direction of a terminal event"); } dir = d; } 
template t_event_impl::t_event_impl(const t_event_impl &o) : eq(copy(o.eq)), callback(o.callback), cooldown(o.cooldown), dir(o.dir) { } template t_event_impl::t_event_impl(t_event_impl &&) noexcept = default; template t_event_impl &t_event_impl::operator=(const t_event_impl &o) { if (this != &o) { *this = t_event_impl(o); } return *this; } template t_event_impl &t_event_impl::operator=(t_event_impl &&) noexcept = default; template t_event_impl::~t_event_impl() = default; template const expression &t_event_impl::get_expression() const { return eq; } template const typename t_event_impl::callback_t &t_event_impl::get_callback() const { return callback; } template event_direction t_event_impl::get_direction() const { return dir; } template T t_event_impl::get_cooldown() const { return cooldown; } namespace { // Implementation of stream insertion for the non-terminal event class. std::ostream &nt_event_impl_stream_impl(std::ostream &os, const expression &eq, event_direction dir) { os << "Event type : non-terminal\n"; os << "Event equation : " << eq << '\n'; os << "Event direction: " << dir << '\n'; return os; } // Implementation of stream insertion for the terminal event class. template std::ostream &t_event_impl_stream_impl(std::ostream &os, const expression &eq, event_direction dir, const C &callback, const T &cooldown) { os << "Event type : terminal\n"; os << "Event equation : " << eq << '\n'; os << "Event direction: " << dir << '\n'; os << "With callback : " << (callback ? "yes" : "no") << '\n'; os << "Cooldown : " << (cooldown < 0 ? "auto" : "{}"_format(cooldown)) << '\n'; return os; } } // namespace template <> std::ostream &operator<<(std::ostream &os, const nt_event_impl &e) { return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction()); } template <> std::ostream &operator<<(std::ostream &os, const nt_event_impl &e) { return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction()); } template <> std::ostream &operator<<(std::ostream &os, const nt_event_impl &e) { return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction()); } template <> std::ostream &operator<<(std::ostream &os, const nt_event_impl &e) { return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction()); } #if defined(HEYOKA_HAVE_REAL128) template <> std::ostream &operator<<(std::ostream &os, const nt_event_impl &e) { return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction()); } template <> std::ostream &operator<<(std::ostream &os, const nt_event_impl &e) { return nt_event_impl_stream_impl(os, e.get_expression(), e.get_direction()); } #endif template <> std::ostream &operator<<(std::ostream &os, const t_event_impl &e) { return t_event_impl_stream_impl(os, e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown()); } template <> std::ostream &operator<<(std::ostream &os, const t_event_impl &e) { return t_event_impl_stream_impl(os, e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown()); } template <> std::ostream &operator<<(std::ostream &os, const t_event_impl &e) { return t_event_impl_stream_impl(os, e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown()); } template <> std::ostream &operator<<(std::ostream &os, const t_event_impl &e) { return t_event_impl_stream_impl(os, e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown()); } #if defined(HEYOKA_HAVE_REAL128) template <> std::ostream &operator<<(std::ostream &os, const t_event_impl &e) { return t_event_impl_stream_impl(os, 
e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown()); } template <> std::ostream &operator<<(std::ostream &os, const t_event_impl &e) { return t_event_impl_stream_impl(os, e.get_expression(), e.get_direction(), e.get_callback(), e.get_cooldown()); } #endif // Explicit instantiation of the implementation classes/functions. template class nt_event_impl; template class t_event_impl; template class nt_event_impl; template class t_event_impl; template class nt_event_impl; template class t_event_impl; template class nt_event_impl; template class t_event_impl; #if defined(HEYOKA_HAVE_REAL128) template class nt_event_impl; template class t_event_impl; template class nt_event_impl; template class t_event_impl; #endif // Add a function for computing the dense output // via polynomial evaluation. template void taylor_add_d_out_function(llvm_state &s, std::uint32_t n_eq, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool external_linkage, bool optimise) { assert(n_eq > 0u); assert(order > 0u); assert(batch_size > 0u); auto &builder = s.builder(); auto &context = s.context(); // The function arguments: // - the output pointer (read/write, used also for accumulation), // - the pointer to the Taylor coefficients (read-only), // - the pointer to the h values (read-only). // No overlap is allowed. std::vector fargs(3, llvm::PointerType::getUnqual(to_llvm_type(context))); // The function does not return anything. auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false); assert(ft != nullptr); // Now create the function. auto *f = llvm::Function::Create( ft, external_linkage ? llvm::Function::ExternalLinkage : llvm::Function::InternalLinkage, "d_out_f", &s.module()); // LCOV_EXCL_START if (f == nullptr) { throw std::invalid_argument( "Unable to create a function for the dense output in an adaptive Taylor integrator"); } // LCOV_EXCL_STOP // Set the names/attributes of the function arguments. auto *out_ptr = f->args().begin(); out_ptr->setName("out_ptr"); out_ptr->addAttr(llvm::Attribute::NoCapture); out_ptr->addAttr(llvm::Attribute::NoAlias); auto *tc_ptr = f->args().begin() + 1; tc_ptr->setName("tc_ptr"); tc_ptr->addAttr(llvm::Attribute::NoCapture); tc_ptr->addAttr(llvm::Attribute::NoAlias); tc_ptr->addAttr(llvm::Attribute::ReadOnly); auto *h_ptr = f->args().begin() + 2; h_ptr->setName("h_ptr"); h_ptr->addAttr(llvm::Attribute::NoCapture); h_ptr->addAttr(llvm::Attribute::NoAlias); h_ptr->addAttr(llvm::Attribute::ReadOnly); // Create a new basic block to start insertion into. auto *bb = llvm::BasicBlock::Create(context, "entry", f); assert(bb != nullptr); builder.SetInsertPoint(bb); // Load the value of h. auto *h = load_vector_from_memory(builder, h_ptr, batch_size); if (high_accuracy) { // Create the array for storing the running compensations. auto array_type = llvm::ArrayType::get(make_vector_type(to_llvm_type(context), batch_size), n_eq); auto comp_arr = builder.CreateInBoundsGEP(builder.CreateAlloca(array_type), {builder.getInt32(0), builder.getInt32(0)}); // Start by writing into out_ptr the zero-order coefficients // and by filling with zeroes the running compensations. llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) { // Load the coefficient from tc_ptr. The index is: // batch_size * (order + 1u) * cur_var_idx. 
auto *tc_idx = builder.CreateMul(builder.getInt32(batch_size * (order + 1u)), cur_var_idx); auto *tc = load_vector_from_memory(builder, builder.CreateInBoundsGEP(tc_ptr, {tc_idx}), batch_size); // Store it in out_ptr. The index is: // batch_size * cur_var_idx. auto *out_idx = builder.CreateMul(builder.getInt32(batch_size), cur_var_idx); store_vector_to_memory(builder, builder.CreateInBoundsGEP(out_ptr, {out_idx}), tc); // Zero-init the element in comp_arr. builder.CreateStore(vector_splat(builder, codegen(s, number{0.}), batch_size), builder.CreateInBoundsGEP(comp_arr, {cur_var_idx})); }); // Init the running updater for the powers of h. auto *cur_h = builder.CreateAlloca(h->getType()); builder.CreateStore(h, cur_h); // Run the evaluation. llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(order + 1u), [&](llvm::Value *cur_order) { // Load the current power of h. auto *cur_h_val = builder.CreateLoad(cur_h); llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) { // Load the coefficient from tc_ptr. The index is: // batch_size * (order + 1u) * cur_var_idx + batch_size * cur_order. auto *tc_idx = builder.CreateAdd(builder.CreateMul(builder.getInt32(batch_size * (order + 1u)), cur_var_idx), builder.CreateMul(builder.getInt32(batch_size), cur_order)); auto *cf = load_vector_from_memory(builder, builder.CreateInBoundsGEP(tc_ptr, {tc_idx}), batch_size); auto *tmp = builder.CreateFMul(cf, cur_h_val); // Compute the quantities for the compensation. auto *comp_ptr = builder.CreateInBoundsGEP(comp_arr, {cur_var_idx}); auto *out_idx = builder.CreateMul(builder.getInt32(batch_size), cur_var_idx); auto *res_ptr = builder.CreateInBoundsGEP(out_ptr, {out_idx}); auto *y = builder.CreateFSub(tmp, builder.CreateLoad(comp_ptr)); auto *cur_res = load_vector_from_memory(builder, res_ptr, batch_size); auto *t = builder.CreateFAdd(cur_res, y); // Update the compensation and the return value. builder.CreateStore(builder.CreateFSub(builder.CreateFSub(t, cur_res), y), comp_ptr); store_vector_to_memory(builder, res_ptr, t); }); // Update the value of h. builder.CreateStore(builder.CreateFMul(cur_h_val, h), cur_h); }); } else { // Start by writing into out_ptr the coefficients of the highest-degree // monomial in each polynomial. llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) { // Load the coefficient from tc_ptr. The index is: // batch_size * (order + 1u) * cur_var_idx + batch_size * order. auto *tc_idx = builder.CreateAdd(builder.CreateMul(builder.getInt32(batch_size * (order + 1u)), cur_var_idx), builder.getInt32(batch_size * order)); auto *tc = load_vector_from_memory(builder, builder.CreateInBoundsGEP(tc_ptr, {tc_idx}), batch_size); // Store it in out_ptr. The index is: // batch_size * cur_var_idx. auto *out_idx = builder.CreateMul(builder.getInt32(batch_size), cur_var_idx); store_vector_to_memory(builder, builder.CreateInBoundsGEP(out_ptr, {out_idx}), tc); }); // Now let's run the Horner scheme. llvm_loop_u32( s, builder.getInt32(1), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)), [&](llvm::Value *cur_order) { llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) { // Load the current Taylor coefficient from tc_ptr. // NOTE: we are loading the coefficients backwards wrt the order, hence // we specify order - cur_order. // NOTE: the index is: // batch_size * (order + 1u) * cur_var_idx + batch_size * (order - cur_order). 
auto *tc_idx = builder.CreateAdd(builder.CreateMul(builder.getInt32(batch_size * (order + 1u)), cur_var_idx), builder.CreateMul(builder.getInt32(batch_size), builder.CreateSub(builder.getInt32(order), cur_order))); auto *tc = load_vector_from_memory(builder, builder.CreateInBoundsGEP(tc_ptr, {tc_idx}), batch_size); // Accumulate in out_ptr. The index is: // batch_size * cur_var_idx. auto *out_idx = builder.CreateMul(builder.getInt32(batch_size), cur_var_idx); auto *out_p = builder.CreateInBoundsGEP(out_ptr, {out_idx}); auto *cur_out = load_vector_from_memory(builder, out_p, batch_size); store_vector_to_memory(builder, out_p, builder.CreateFAdd(tc, builder.CreateFMul(cur_out, h))); }); }); } // Create the return value. builder.CreateRetVoid(); // Verify the function. s.verify_function(f); // Run the optimisation pass, if requested. if (optimise) { s.optimise(); } } template void taylor_add_d_out_function(llvm_state &, std::uint32_t, std::uint32_t, std::uint32_t, bool, bool, bool); template void taylor_add_d_out_function(llvm_state &, std::uint32_t, std::uint32_t, std::uint32_t, bool, bool, bool); #if defined(HEYOKA_HAVE_REAL128) template void taylor_add_d_out_function(llvm_state &, std::uint32_t, std::uint32_t, std::uint32_t, bool, bool, bool); #endif } // namespace detail template void continuous_output::add_c_out_function(std::uint32_t order, std::uint32_t dim, bool high_accuracy) { // Overflow check: we want to be able to index into the arrays of // times and Taylor coefficients using 32-bit ints. // LCOV_EXCL_START if (m_tcs.size() > std::numeric_limits::max() || m_times_hi.size() > std::numeric_limits::max()) { throw std::overflow_error("Overflow detected while adding continuous output to a Taylor integrator"); } // LCOV_EXCL_STOP auto &md = m_llvm_state.module(); auto &builder = m_llvm_state.builder(); auto &context = m_llvm_state.context(); // Fetch the current insertion block. auto orig_bb = builder.GetInsertBlock(); // Add the function for the computation of the dense output. detail::taylor_add_d_out_function(m_llvm_state, dim, order, 1, high_accuracy, false, false); // Fetch it. auto d_out_f = md.getFunction("d_out_f"); assert(d_out_f != nullptr); // LCOV_EXCL_LINE // Restore the original insertion block. builder.SetInsertPoint(orig_bb); // Establish the time direction. const detail::dfloat df_t_start(m_times_hi[0], m_times_lo[0]), df_t_end(m_times_hi.back(), m_times_lo.back()); const auto dir = df_t_start < df_t_end; // The function arguments: // - the output pointer (read/write, used also for accumulation), // - the time value, // - the pointer to the Taylor coefficients (read-only), // - the pointer to the hi times (read-only), // - the pointer to the lo times (read-only). // No overlap is allowed. auto fp_t = detail::to_llvm_type(context); auto ptr_t = llvm::PointerType::getUnqual(fp_t); std::vector fargs{ptr_t, fp_t, ptr_t, ptr_t, ptr_t}; // The function does not return anything. auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false); assert(ft != nullptr); // LCOV_EXCL_LINE // Now create the function. auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "c_out", &md); // LCOV_EXCL_START if (f == nullptr) { throw std::invalid_argument("Unable to create a function for continuous output in a Taylor integrator"); } // LCOV_EXCL_STOP // Set the names/attributes of the function arguments. 
auto out_ptr = f->args().begin(); out_ptr->setName("out_ptr"); out_ptr->addAttr(llvm::Attribute::NoCapture); out_ptr->addAttr(llvm::Attribute::NoAlias); auto tm = f->args().begin() + 1; tm->setName("tm"); auto tc_ptr = f->args().begin() + 2; tc_ptr->setName("tc_ptr"); tc_ptr->addAttr(llvm::Attribute::NoCapture); tc_ptr->addAttr(llvm::Attribute::NoAlias); tc_ptr->addAttr(llvm::Attribute::ReadOnly); auto times_ptr_hi = f->args().begin() + 3; times_ptr_hi->setName("times_ptr_hi"); times_ptr_hi->addAttr(llvm::Attribute::NoCapture); times_ptr_hi->addAttr(llvm::Attribute::NoAlias); times_ptr_hi->addAttr(llvm::Attribute::ReadOnly); auto times_ptr_lo = f->args().begin() + 4; times_ptr_lo->setName("times_ptr_lo"); times_ptr_lo->addAttr(llvm::Attribute::NoCapture); times_ptr_lo->addAttr(llvm::Attribute::NoAlias); times_ptr_lo->addAttr(llvm::Attribute::ReadOnly); // Create a new basic block to start insertion into. auto *bb = llvm::BasicBlock::Create(context, "entry", f); assert(bb != nullptr); // LCOV_EXCL_LINE builder.SetInsertPoint(bb); // Create the variable in which we will store the timestep size. // This is necessary because the d_out_f function requires a pointer // to the timestep. auto h_ptr = builder.CreateAlloca(fp_t); // Look for the index in the times vector corresponding to // a time greater than tm (less than tm in backwards integration). // This is essentially an implementation of std::upper_bound: // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto tidx = builder.CreateAlloca(builder.getInt32Ty()); auto count = builder.CreateAlloca(builder.getInt32Ty()); auto step = builder.CreateAlloca(builder.getInt32Ty()); auto first = builder.CreateAlloca(builder.getInt32Ty()); // count is inited with the size of the range. builder.CreateStore(builder.getInt32(static_cast(m_times_hi.size())), count); // first is inited to zero. builder.CreateStore(builder.getInt32(0), first); detail::llvm_while_loop( m_llvm_state, [&]() { return builder.CreateICmpNE(builder.CreateLoad(count), builder.getInt32(0)); }, [&]() { // tidx = first. builder.CreateStore(builder.CreateLoad(first), tidx); // step = count / 2. builder.CreateStore(builder.CreateUDiv(builder.CreateLoad(count), builder.getInt32(2)), step); // tidx = tidx + step. builder.CreateStore(builder.CreateAdd(builder.CreateLoad(tidx), builder.CreateLoad(step)), tidx); // Logical condition: // - !(tm < *tidx), if integrating forward, // - !(tm > *tidx), if integrating backward. auto tidx_val_hi = builder.CreateLoad(builder.CreateInBoundsGEP(times_ptr_hi, {builder.CreateLoad(tidx)})); auto tidx_val_lo = builder.CreateLoad(builder.CreateInBoundsGEP(times_ptr_lo, {builder.CreateLoad(tidx)})); auto zero_val = llvm::ConstantFP::get(fp_t, 0.); auto cond = dir ? detail::llvm_dl_lt(m_llvm_state, tm, zero_val, tidx_val_hi, tidx_val_lo) : detail::llvm_dl_gt(m_llvm_state, tm, zero_val, tidx_val_hi, tidx_val_lo); cond = builder.CreateNot(cond); detail::llvm_if_then_else( m_llvm_state, cond, [&]() { // ++tidx. builder.CreateStore(builder.CreateAdd(builder.CreateLoad(tidx), builder.getInt32(1)), tidx); // first = tidx. builder.CreateStore(builder.CreateLoad(tidx), first); // count = count - (step + 1). builder.CreateStore( builder.CreateSub(builder.CreateLoad(count), builder.CreateAdd(builder.CreateLoad(step), builder.getInt32(1))), count); }, [&]() { // count = step. builder.CreateStore(builder.CreateLoad(step), count); }); }); // NOTE: the output of the std::upper_bound algorithm // is in the 'first' variable. 
    llvm::Value *tc_idx = builder.CreateLoad(first);

    // Normally, the TC index should be first - 1. The exceptions are:
    // - first == 0, in which case TC index is also 0,
    // - first == range size, in which case TC index is first - 2.
    // These two exceptions arise when tm is outside the range of validity
    // for the continuous output. In such cases, we will use either the first
    // or the last possible set of TCs.
    detail::llvm_if_then_else(
        m_llvm_state, builder.CreateICmpEQ(tc_idx, builder.getInt32(0)),
        [&]() {
            // first == 0, do nothing.
        },
        [&]() {
            detail::llvm_if_then_else(
                m_llvm_state,
                builder.CreateICmpEQ(tc_idx, builder.getInt32(static_cast<std::uint32_t>(m_times_hi.size()))),
                [&]() {
                    // first == range size.
                    builder.CreateStore(builder.CreateSub(tc_idx, builder.getInt32(2)), first);
                },
                [&]() {
                    // The normal path.
                    builder.CreateStore(builder.CreateSub(tc_idx, builder.getInt32(1)), first);
                });
        });

    // Reload tc_idx.
    tc_idx = builder.CreateLoad(first);

    // Load the time corresponding to tc_idx.
    auto start_tm_hi = builder.CreateLoad(builder.CreateInBoundsGEP(times_ptr_hi, {tc_idx}));
    auto start_tm_lo = builder.CreateLoad(builder.CreateInBoundsGEP(times_ptr_lo, {tc_idx}));

    // Compute and store the value of h = tm - start_tm.
    auto [h_hi, h_lo] = detail::llvm_dl_add(m_llvm_state, tm, llvm::ConstantFP::get(fp_t, 0.),
                                            builder.CreateFNeg(start_tm_hi), builder.CreateFNeg(start_tm_lo));
    builder.CreateStore(h_hi, h_ptr);

    // Compute the index into the Taylor coefficients array.
    tc_idx = builder.CreateMul(tc_idx, builder.getInt32(dim * (order + 1u)));

    // Invoke the d_out function.
    builder.CreateCall(d_out_f, {out_ptr, builder.CreateInBoundsGEP(tc_ptr, {tc_idx}), h_ptr});

    // Create the return value.
    builder.CreateRetVoid();

    // Verify the function.
    m_llvm_state.verify_function(f);

    // Run the optimisation pass.
    m_llvm_state.optimise();

    // Compile.
    m_llvm_state.compile();

    // Fetch the function pointer and assign it.
    m_f_ptr = reinterpret_cast<fptr_t>(m_llvm_state.jit_lookup("c_out"));
}

template <typename T>
continuous_output<T>::continuous_output() = default;

template <typename T>
continuous_output<T>::continuous_output(llvm_state &&s) : m_llvm_state(std::move(s))
{
}

template <typename T>
continuous_output<T>::continuous_output(const continuous_output &o)
    : m_llvm_state(o.m_llvm_state), m_tcs(o.m_tcs), m_times_hi(o.m_times_hi), m_times_lo(o.m_times_lo),
      m_output(o.m_output)
{
    // If o is valid, fetch the function pointer from the copied state.
    // Otherwise, m_f_ptr will remain null.
    if (o.m_f_ptr != nullptr) {
        m_f_ptr = reinterpret_cast<fptr_t>(m_llvm_state.jit_lookup("c_out"));
    }
}

template <typename T>
continuous_output<T>::continuous_output(continuous_output &&) noexcept = default;

template <typename T>
continuous_output<T>::~continuous_output() = default;

template <typename T>
continuous_output<T> &continuous_output<T>::operator=(const continuous_output &o)
{
    if (this != &o) {
        *this = continuous_output(o);
    }

    return *this;
}

template <typename T>
continuous_output<T> &continuous_output<T>::operator=(continuous_output &&) noexcept = default;

template <typename T>
void continuous_output<T>::call_impl(T t)
{
    using std::isfinite;

    if (m_f_ptr == nullptr) {
        throw std::invalid_argument("Cannot use a default-constructed continuous_output object");
    }

    // NOTE: run the assertions only after ensuring this
    // is a valid object.
    // LCOV_EXCL_START
#if !defined(NDEBUG)
    // m_output must not be empty.
    assert(!m_output.empty());
    // Need at least 2 time points.
    assert(m_times_hi.size() >= 2u);
    // hi/lo parts of times must have the same sizes.
    assert(m_times_hi.size() == m_times_lo.size());
#endif
    // LCOV_EXCL_STOP

    if (!isfinite(t)) {
        throw std::invalid_argument("Cannot compute the continuous output at the non-finite time {}"_format(t));
    }

    m_f_ptr(m_output.data(), t, m_tcs.data(), m_times_hi.data(), m_times_lo.data());
}

template <typename T>
const llvm_state &continuous_output<T>::get_llvm_state() const
{
    return m_llvm_state;
}

template <typename T>
void continuous_output<T>::save(boost::archive::binary_oarchive &ar, unsigned) const
{
    ar << m_llvm_state;
    ar << m_tcs;
    ar << m_times_hi;
    ar << m_times_lo;
    ar << m_output;
}

template <typename T>
void continuous_output<T>::load(boost::archive::binary_iarchive &ar, unsigned)
{
    ar >> m_llvm_state;
    ar >> m_tcs;
    ar >> m_times_hi;
    ar >> m_times_lo;
    ar >> m_output;

    // NOTE: if m_output is not empty, it means the archived
    // object had been initialised.
    if (m_output.empty()) {
        m_f_ptr = nullptr;
    } else {
        m_f_ptr = reinterpret_cast<fptr_t>(m_llvm_state.jit_lookup("c_out"));
    }
}

template <typename T>
std::pair<T, T> continuous_output<T>::get_bounds() const
{
    if (m_f_ptr == nullptr) {
        throw std::invalid_argument("Cannot use a default-constructed continuous_output object");
    }

    return {m_times_hi[0], m_times_hi.back()};
}

template <typename T>
std::size_t continuous_output<T>::get_n_steps() const
{
    if (m_f_ptr == nullptr) {
        throw std::invalid_argument("Cannot use a default-constructed continuous_output object");
    }

    return boost::numeric_cast<std::size_t>(m_times_hi.size() - 1u);
}

// Explicit instantiations.
template class continuous_output<double>;
template class continuous_output<long double>;

#if defined(HEYOKA_HAVE_REAL128)

template class continuous_output<mppp::real128>;

#endif

namespace detail
{

template <typename T>
std::ostream &c_out_stream_impl(std::ostream &os, const continuous_output<T> &co)
{
    std::ostringstream oss;
    oss.exceptions(std::ios_base::failbit | std::ios_base::badbit);
    oss.imbue(std::locale::classic());
    oss << std::showpoint;
    oss.precision(std::numeric_limits<T>::max_digits10);

    if (co.get_output().empty()) {
        oss << "Default-constructed continuous_output";
    } else {
        const detail::dfloat<T> df_t_start(co.m_times_hi[0], co.m_times_lo[0]),
            df_t_end(co.m_times_hi.back(), co.m_times_lo.back());
        const auto dir = df_t_start < df_t_end;

        oss << "Direction : " << (dir ? "forward" : "backward") << '\n';
        oss << "Time range: "
            << (dir ? "[{}, {})"_format(co.m_times_hi[0], co.m_times_hi.back())
                    : "({}, {}]"_format(co.m_times_hi.back(), co.m_times_hi[0]))
            << '\n';
        oss << "N of steps: " << (co.m_times_hi.size() - 1u) << '\n';
    }

    return os << oss.str();
}

} // namespace detail

template <>
std::ostream &operator<<(std::ostream &os, const continuous_output<double> &co)
{
    return detail::c_out_stream_impl(os, co);
}

template <>
std::ostream &operator<<(std::ostream &os, const continuous_output<long double> &co)
{
    return detail::c_out_stream_impl(os, co);
}

#if defined(HEYOKA_HAVE_REAL128)

template <>
std::ostream &operator<<(std::ostream &os, const continuous_output<mppp::real128> &co)
{
    return detail::c_out_stream_impl(os, co);
}

#endif

#if !defined(NDEBUG)

extern "C" {

// Function to check, in debug mode, the indexing of the Taylor coefficients
// in the batch mode continuous output implementation.
HEYOKA_DLL_PUBLIC void heyoka_continuous_output_batch_tc_idx_debug(const std::uint32_t *tc_idx,
                                                                   std::uint32_t times_size,
                                                                   std::uint32_t batch_size) noexcept
{
    // LCOV_EXCL_START
    assert(batch_size != 0u);
    assert(times_size % batch_size == 0u);
    assert(times_size / batch_size >= 3u);
    // LCOV_EXCL_STOP

    for (std::uint32_t i = 0; i < batch_size; ++i) {
        assert(tc_idx[i] < times_size / batch_size - 2u); // LCOV_EXCL_LINE
    }
}
}

#endif
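// NOTE: as a concrete example of the invariant checked above (an illustrative
// reading of the assertions, not additional logic): with 4 steps and
// batch_size == 2, the padded time arrays hold (4 + 2) * 2 == 12 values, so
// times_size / batch_size - 2u == 4 and each per-batch TC index must lie in [0, 4).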
// Continuous output for the batch integrator.
template <typename T>
void continuous_output_batch<T>::add_c_out_function(std::uint32_t order, std::uint32_t dim, bool high_accuracy)
{
    // Overflow check: we want to be able to index into the arrays of
    // times and Taylor coefficients using 32-bit ints.
    // LCOV_EXCL_START
    if (m_tcs.size() > std::numeric_limits<std::uint32_t>::max()
        || m_times_hi.size() > std::numeric_limits<std::uint32_t>::max()) {
        throw std::overflow_error(
            "Overflow detected while adding continuous output to a Taylor integrator in batch mode");
    }
    // LCOV_EXCL_STOP

    auto &md = m_llvm_state.module();
    auto &builder = m_llvm_state.builder();
    auto &context = m_llvm_state.context();

    // The function arguments:
    // - the output pointer (read/write, used also for accumulation),
    // - the pointer to the target time values (read-only),
    // - the pointer to the Taylor coefficients (read-only),
    // - the pointer to the hi times (read-only),
    // - the pointer to the lo times (read-only).
    // No overlap is allowed.
    auto fp_t = detail::to_llvm_type<T>(context);
    auto fp_vec_t = detail::make_vector_type(fp_t, m_batch_size);
    auto ptr_t = llvm::PointerType::getUnqual(fp_t);
    std::vector<llvm::Type *> fargs{ptr_t, ptr_t, ptr_t, ptr_t, ptr_t};
    // The function does not return anything.
    auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
    assert(ft != nullptr); // LCOV_EXCL_LINE
    // Now create the function.
    auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "c_out", &md);
    // LCOV_EXCL_START
    if (f == nullptr) {
        throw std::invalid_argument("Unable to create a function for continuous output in a Taylor integrator");
    }
    // LCOV_EXCL_STOP

    // Set the names/attributes of the function arguments.
    auto out_ptr = f->args().begin();
    out_ptr->setName("out_ptr");
    out_ptr->addAttr(llvm::Attribute::NoCapture);
    out_ptr->addAttr(llvm::Attribute::NoAlias);

    auto tm_ptr = f->args().begin() + 1;
    tm_ptr->setName("tm_ptr");
    tm_ptr->addAttr(llvm::Attribute::NoCapture);
    tm_ptr->addAttr(llvm::Attribute::NoAlias);
    tm_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto tc_ptr = f->args().begin() + 2;
    tc_ptr->setName("tc_ptr");
    tc_ptr->addAttr(llvm::Attribute::NoCapture);
    tc_ptr->addAttr(llvm::Attribute::NoAlias);
    tc_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto times_ptr_hi = f->args().begin() + 3;
    times_ptr_hi->setName("times_ptr_hi");
    times_ptr_hi->addAttr(llvm::Attribute::NoCapture);
    times_ptr_hi->addAttr(llvm::Attribute::NoAlias);
    times_ptr_hi->addAttr(llvm::Attribute::ReadOnly);

    auto times_ptr_lo = f->args().begin() + 4;
    times_ptr_lo->setName("times_ptr_lo");
    times_ptr_lo->addAttr(llvm::Attribute::NoCapture);
    times_ptr_lo->addAttr(llvm::Attribute::NoAlias);
    times_ptr_lo->addAttr(llvm::Attribute::ReadOnly);

    // Create a new basic block to start insertion into.
    auto *bb = llvm::BasicBlock::Create(context, "entry", f);
    assert(bb != nullptr); // LCOV_EXCL_LINE
    builder.SetInsertPoint(bb);

    // Establish the time directions.
    auto bool_vector_t = detail::make_vector_type(builder.getInt1Ty(), m_batch_size);
    assert(bool_vector_t != nullptr); // LCOV_EXCL_LINE
    llvm::Value *dir_vec{};
    if (m_batch_size == 1u) {
        // In scalar mode, the direction is a single value.
        const detail::dfloat<T> df_t_start(m_times_hi[0], m_times_lo[0]),
            // NOTE: we load from the padding values here.
            df_t_end(m_times_hi.back(), m_times_lo.back());
        const auto dir = df_t_start < df_t_end;

        dir_vec = builder.getInt1(dir);
    } else {
        dir_vec = llvm::UndefValue::get(bool_vector_t);
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            const detail::dfloat<T> df_t_start(m_times_hi[i], m_times_lo[i]),
                // NOTE: we load from the padding values here.
                df_t_end(m_times_hi[m_times_hi.size() - m_batch_size + i],
                         m_times_lo[m_times_lo.size() - m_batch_size + i]);
            const auto dir = df_t_start < df_t_end;

            dir_vec = builder.CreateInsertElement(dir_vec, builder.getInt1(dir), i);
        }
    }

    // Look for the index in the times vector corresponding to
    // a time greater than tm (less than tm in backwards integration).
    // This is essentially an implementation of std::upper_bound:
    // https://en.cppreference.com/w/cpp/algorithm/upper_bound
    auto int32_vec_t = detail::make_vector_type(builder.getInt32Ty(), m_batch_size);
    auto tidx = builder.CreateAlloca(int32_vec_t);
    auto count = builder.CreateAlloca(int32_vec_t);
    auto step = builder.CreateAlloca(int32_vec_t);
    auto first = builder.CreateAlloca(int32_vec_t);

    // count is inited with the size of the range.
    // NOTE: count includes the padding.
    builder.CreateStore(
        detail::vector_splat(builder,
                             builder.getInt32(static_cast<std::uint32_t>(m_times_hi.size()) / m_batch_size),
                             m_batch_size),
        count);
    // first is inited to zero.
    auto zero_vec_i32 = detail::vector_splat(builder, builder.getInt32(0), m_batch_size);
    builder.CreateStore(zero_vec_i32, first);

    // Load the time value from tm_ptr.
    auto tm = detail::load_vector_from_memory(builder, tm_ptr, m_batch_size);

    // This is the vector [0, 1, 2, ..., (batch_size - 1)].
    llvm::Value *batch_offset{};
    if (m_batch_size == 1u) {
        // In scalar mode, use a single value.
        batch_offset = builder.getInt32(0);
    } else {
        batch_offset = llvm::UndefValue::get(int32_vec_t);
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            batch_offset = builder.CreateInsertElement(batch_offset, builder.getInt32(i), i);
        }
    }

    // Splatted version of the batch size.
    auto batch_splat = detail::vector_splat(builder, builder.getInt32(m_batch_size), m_batch_size);

    // Splatted versions of the base pointers for the time data.
    auto times_ptr_hi_vec = detail::vector_splat(builder, times_ptr_hi, m_batch_size);
    auto times_ptr_lo_vec = detail::vector_splat(builder, times_ptr_lo, m_batch_size);

    // fp vector of zeroes.
    auto zero_vec_fp = detail::vector_splat(builder, llvm::ConstantFP::get(fp_t, 0.), m_batch_size);

    // Vector of i32 ones.
    auto one_vec_i32 = detail::vector_splat(builder, builder.getInt32(1), m_batch_size);

    detail::llvm_while_loop(
        m_llvm_state,
        [&]() -> llvm::Value * {
            // NOTE: the condition here is that any value in count is not zero.
            auto cmp = builder.CreateICmpNE(builder.CreateLoad(count), zero_vec_i32);
            // NOTE: in scalar mode, no reduction is needed.
            return (m_batch_size == 1u) ? cmp : builder.CreateOrReduce(cmp);
        },
        [&]() {
            // tidx = first.
            builder.CreateStore(builder.CreateLoad(first), tidx);
            // step = count / 2.
            auto two_vec_i32 = detail::vector_splat(builder, builder.getInt32(2), m_batch_size);
            builder.CreateStore(builder.CreateUDiv(builder.CreateLoad(count), two_vec_i32), step);
            // tidx = tidx + step.
            builder.CreateStore(builder.CreateAdd(builder.CreateLoad(tidx), builder.CreateLoad(step)), tidx);

            // Compute the indices for loading the times from the pointers.
            auto tl_idx = builder.CreateAdd(builder.CreateMul(builder.CreateLoad(tidx), batch_splat), batch_offset);

            // Compute the pointers for loading the time data.
            auto tptr_hi = builder.CreateInBoundsGEP(times_ptr_hi_vec, {tl_idx});
            auto tptr_lo = builder.CreateInBoundsGEP(times_ptr_lo_vec, {tl_idx});

            // Gather the hi/lo values.
            auto tidx_val_hi = detail::gather_vector_from_memory(builder, fp_vec_t, tptr_hi, alignof(T));
            auto tidx_val_lo = detail::gather_vector_from_memory(builder, fp_vec_t, tptr_lo, alignof(T));

            // Compute the two conditions !(tm < *tidx) and !(tm > *tidx).
            auto cmp_lt
                = builder.CreateNot(detail::llvm_dl_lt(m_llvm_state, tm, zero_vec_fp, tidx_val_hi, tidx_val_lo));
            auto cmp_gt
                = builder.CreateNot(detail::llvm_dl_gt(m_llvm_state, tm, zero_vec_fp, tidx_val_hi, tidx_val_lo));

            // Select cmp_lt if integrating forward, cmp_gt when integrating backward.
            auto cond = builder.CreateSelect(dir_vec, cmp_lt, cmp_gt);

            // tidx += (1 or 0).
            builder.CreateStore(
                builder.CreateAdd(builder.CreateLoad(tidx), builder.CreateSelect(cond, one_vec_i32, zero_vec_i32)),
                tidx);

            // first = (tidx or first).
            builder.CreateStore(builder.CreateSelect(cond, builder.CreateLoad(tidx), builder.CreateLoad(first)),
                                first);

            // count = count - (step or count).
            auto old_count = builder.CreateLoad(count);
            auto new_count
                = builder.CreateSub(old_count, builder.CreateSelect(cond, builder.CreateLoad(step), old_count));

            // count = count + (-1 or step).
            new_count = builder.CreateAdd(
                new_count, builder.CreateSelect(cond, builder.CreateNeg(one_vec_i32), builder.CreateLoad(step)));
            builder.CreateStore(new_count, count);
        });

    // NOTE: the output of the std::upper_bound algorithm
    // is in the 'first' variable.
    llvm::Value *tc_idx = builder.CreateLoad(first);

    // Normally, the TC index should be first - 1. The exceptions are:
    // - first == 0, in which case TC index is also 0,
    // - first == (range size - 1), in which case TC index is first - 2.
    // These two exceptions arise when tm is outside the range of validity
    // for the continuous output. In such cases, we will use either the first
    // or the last possible set of TCs.
    // NOTE: the second check is on range size - 1 (rather than just range size
    // like in the scalar case) due to padding.
    // In order to vectorise the check, we compute:
    // tc_idx = tc_idx - (tc_idx != 0) - (tc_idx == range size - 1).
    auto tc_idx_cmp1 = builder.CreateZExt(builder.CreateICmpNE(tc_idx, zero_vec_i32), int32_vec_t);
    auto tc_idx_cmp2 = builder.CreateZExt(
        builder.CreateICmpEQ(
            tc_idx,
            detail::vector_splat(builder,
                                 builder.getInt32(static_cast<std::uint32_t>(m_times_hi.size() / m_batch_size - 1u)),
                                 m_batch_size)),
        int32_vec_t);
    tc_idx = builder.CreateSub(tc_idx, tc_idx_cmp1);
    tc_idx = builder.CreateSub(tc_idx, tc_idx_cmp2);

#if !defined(NDEBUG)
    // In debug mode, invoke the index checking function.
    auto tc_idx_debug_ptr
        = builder.CreateInBoundsGEP(builder.CreateAlloca(llvm::ArrayType::get(builder.getInt32Ty(), m_batch_size)),
                                    {builder.getInt32(0), builder.getInt32(0)});
    detail::store_vector_to_memory(builder, tc_idx_debug_ptr, tc_idx);
    detail::llvm_invoke_external(m_llvm_state, "heyoka_continuous_output_batch_tc_idx_debug", builder.getVoidTy(),
                                 {tc_idx_debug_ptr, builder.getInt32(static_cast<std::uint32_t>(m_times_hi.size())),
                                  builder.getInt32(m_batch_size)});
#endif

    // Convert tc_idx into an index for loading from the time vectors.
    auto tc_l_idx = builder.CreateAdd(builder.CreateMul(tc_idx, batch_splat), batch_offset);

    // Load the times corresponding to tc_idx.
    auto start_tm_hi = detail::gather_vector_from_memory(
        builder, fp_vec_t, builder.CreateInBoundsGEP(times_ptr_hi_vec, {tc_l_idx}), alignof(T));
    auto start_tm_lo = detail::gather_vector_from_memory(
        builder, fp_vec_t, builder.CreateInBoundsGEP(times_ptr_lo_vec, {tc_l_idx}), alignof(T));
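    // NOTE: a quick worked example of the branchless clamping above (illustration only).
    // With n_steps == 4 the padded range size is 6, so the check value is 6 - 1 == 5:
    // - first == 0 gives tc_idx = 0 - 0 - 0 = 0,
    // - first == 3 gives tc_idx = 3 - 1 - 0 = 2 (the usual first - 1),
    // - first == 5 gives tc_idx = 5 - 1 - 1 = 3 == n_steps - 1 (the last valid TC block).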
    // Compute the value of h = tm - start_tm.
    auto h = detail::llvm_dl_add(m_llvm_state, tm, zero_vec_fp, builder.CreateFNeg(start_tm_hi),
                                 builder.CreateFNeg(start_tm_lo))
                 .first;

    // Compute the base pointers in the array of TC for the computation
    // of Horner's scheme.
    tc_idx = builder.CreateAdd(
        builder.CreateMul(tc_idx, detail::vector_splat(builder, builder.getInt32(dim * (order + 1u) * m_batch_size),
                                                       m_batch_size)),
        batch_offset);
    // NOTE: each pointer in tc_ptrs points to the Taylor coefficient of
    // order 0 for the first state variable in the timestep data block selected
    // for that batch index.
    auto tc_ptrs = builder.CreateInBoundsGEP(tc_ptr, {tc_idx});

    // Run the Horner scheme.
    if (high_accuracy) {
        // Create the array for storing the running compensations.
        auto array_type = llvm::ArrayType::get(fp_vec_t, dim);
        auto comp_arr
            = builder.CreateInBoundsGEP(builder.CreateAlloca(array_type), {builder.getInt32(0), builder.getInt32(0)});

        // Start by writing into out_ptr the zero-order coefficients
        // and by filling with zeroes the running compensations.
        detail::llvm_loop_u32(m_llvm_state, builder.getInt32(0), builder.getInt32(dim), [&](llvm::Value *cur_var_idx) {
            // Load the coefficient from tc_ptrs. The index is:
            // m_batch_size * (order + 1u) * cur_var_idx.
            auto *load_idx = builder.CreateMul(builder.getInt32(m_batch_size * (order + 1u)), cur_var_idx);
            auto *tcs = detail::gather_vector_from_memory(builder, fp_vec_t,
                                                          builder.CreateInBoundsGEP(tc_ptrs, {load_idx}), alignof(T));

            // Store it in out_ptr. The index is:
            // m_batch_size * cur_var_idx.
            auto *out_idx = builder.CreateMul(builder.getInt32(m_batch_size), cur_var_idx);
            detail::store_vector_to_memory(builder, builder.CreateInBoundsGEP(out_ptr, {out_idx}), tcs);

            // Zero-init the element in comp_arr.
            builder.CreateStore(zero_vec_fp, builder.CreateInBoundsGEP(comp_arr, {cur_var_idx}));
        });

        // Init the running updater for the powers of h.
        auto *cur_h = builder.CreateAlloca(h->getType());
        builder.CreateStore(h, cur_h);

        // Run the evaluation.
        detail::llvm_loop_u32(
            m_llvm_state, builder.getInt32(1), builder.getInt32(order + 1u), [&](llvm::Value *cur_order) {
                // Load the current power of h.
                auto *cur_h_val = builder.CreateLoad(cur_h);

                detail::llvm_loop_u32(
                    m_llvm_state, builder.getInt32(0), builder.getInt32(dim), [&](llvm::Value *cur_var_idx) {
                        // Load the coefficient from tc_ptrs. The index is:
                        // m_batch_size * (order + 1u) * cur_var_idx + m_batch_size * cur_order.
                        auto *load_idx = builder.CreateAdd(
                            builder.CreateMul(builder.getInt32(m_batch_size * (order + 1u)), cur_var_idx),
                            builder.CreateMul(builder.getInt32(m_batch_size), cur_order));
                        auto *cf = detail::gather_vector_from_memory(
                            builder, fp_vec_t, builder.CreateInBoundsGEP(tc_ptrs, {load_idx}), alignof(T));
                        auto *tmp = builder.CreateFMul(cf, cur_h_val);

                        // Compute the quantities for the compensation.
                        auto *comp_ptr = builder.CreateInBoundsGEP(comp_arr, {cur_var_idx});
                        auto *out_idx = builder.CreateMul(builder.getInt32(m_batch_size), cur_var_idx);
                        auto *res_ptr = builder.CreateInBoundsGEP(out_ptr, {out_idx});
                        auto *y = builder.CreateFSub(tmp, builder.CreateLoad(comp_ptr));
                        auto *cur_res = detail::load_vector_from_memory(builder, res_ptr, m_batch_size);
                        auto *t = builder.CreateFAdd(cur_res, y);

                        // Update the compensation and the return value.
                        builder.CreateStore(builder.CreateFSub(builder.CreateFSub(t, cur_res), y), comp_ptr);
                        detail::store_vector_to_memory(builder, res_ptr, t);
                    });

                // Update the value of h.
                builder.CreateStore(builder.CreateFMul(cur_h_val, h), cur_h);
            });
    } else {
        // Start by writing into out_ptr the coefficients of the highest-degree
        // monomial in each polynomial.
        detail::llvm_loop_u32(m_llvm_state, builder.getInt32(0), builder.getInt32(dim), [&](llvm::Value *cur_var_idx) {
            // Load the coefficient from tc_ptrs. The index is:
            // m_batch_size * (order + 1u) * cur_var_idx + m_batch_size * order.
            auto *load_idx
                = builder.CreateAdd(builder.CreateMul(builder.getInt32(m_batch_size * (order + 1u)), cur_var_idx),
                                    builder.getInt32(m_batch_size * order));
            auto *tcs = detail::gather_vector_from_memory(builder, fp_vec_t,
                                                          builder.CreateInBoundsGEP(tc_ptrs, {load_idx}), alignof(T));

            // Store it in out_ptr. The index is:
            // m_batch_size * cur_var_idx.
            auto *out_idx = builder.CreateMul(builder.getInt32(m_batch_size), cur_var_idx);
            detail::store_vector_to_memory(builder, builder.CreateInBoundsGEP(out_ptr, {out_idx}), tcs);
        });

        // Now let's run the Horner scheme.
        detail::llvm_loop_u32(
            m_llvm_state, builder.getInt32(1), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
            [&](llvm::Value *cur_order) {
                detail::llvm_loop_u32(
                    m_llvm_state, builder.getInt32(0), builder.getInt32(dim), [&](llvm::Value *cur_var_idx) {
                        // Load the current Taylor coefficients from tc_ptrs.
                        // NOTE: we are loading the coefficients backwards wrt the order, hence
                        // we specify order - cur_order.
                        // NOTE: the index is:
                        // m_batch_size * (order + 1u) * cur_var_idx + m_batch_size * (order - cur_order).
                        auto *load_idx = builder.CreateAdd(
                            builder.CreateMul(builder.getInt32(m_batch_size * (order + 1u)), cur_var_idx),
                            builder.CreateMul(builder.getInt32(m_batch_size),
                                              builder.CreateSub(builder.getInt32(order), cur_order)));
                        auto *tcs = detail::gather_vector_from_memory(
                            builder, fp_vec_t, builder.CreateInBoundsGEP(tc_ptrs, {load_idx}), alignof(T));

                        // Accumulate in out_ptr. The index is:
                        // m_batch_size * cur_var_idx.
                        auto *out_idx = builder.CreateMul(builder.getInt32(m_batch_size), cur_var_idx);
                        auto *out_p = builder.CreateInBoundsGEP(out_ptr, {out_idx});
                        auto *cur_out = detail::load_vector_from_memory(builder, out_p, m_batch_size);
                        detail::store_vector_to_memory(builder, out_p,
                                                       builder.CreateFAdd(tcs, builder.CreateFMul(cur_out, h)));
                    });
            });
    }

    // Create the return value.
    builder.CreateRetVoid();

    // Verify the function.
    m_llvm_state.verify_function(f);

    // Run the optimisation pass.
    m_llvm_state.optimise();

    // Compile.
    m_llvm_state.compile();

    // Fetch the function pointer and assign it.
    m_f_ptr = reinterpret_cast<fptr_t>(m_llvm_state.jit_lookup("c_out"));
}

template <typename T>
continuous_output_batch<T>::continuous_output_batch() = default;

template <typename T>
continuous_output_batch<T>::continuous_output_batch(llvm_state &&s) : m_llvm_state(std::move(s))
{
}

template <typename T>
continuous_output_batch<T>::continuous_output_batch(const continuous_output_batch &o)
    : m_batch_size(o.m_batch_size), m_llvm_state(o.m_llvm_state), m_tcs(o.m_tcs), m_times_hi(o.m_times_hi),
      m_times_lo(o.m_times_lo), m_output(o.m_output), m_tmp_tm(o.m_tmp_tm)
{
    // If o is valid, fetch the function pointer from the copied state.
    // Otherwise, m_f_ptr will remain null.
    if (o.m_f_ptr != nullptr) {
        m_f_ptr = reinterpret_cast<fptr_t>(m_llvm_state.jit_lookup("c_out"));
    }
}

template <typename T>
continuous_output_batch<T>::continuous_output_batch(continuous_output_batch &&) noexcept = default;

template <typename T>
continuous_output_batch<T>::~continuous_output_batch() = default;

template <typename T>
continuous_output_batch<T> &continuous_output_batch<T>::operator=(const continuous_output_batch &o)
{
    if (this != &o) {
        *this = continuous_output_batch(o);
    }

    return *this;
}

template <typename T>
continuous_output_batch<T> &continuous_output_batch<T>::operator=(continuous_output_batch &&) noexcept = default;

template <typename T>
void continuous_output_batch<T>::call_impl(const T *t)
{
    using std::isfinite;

    if (m_f_ptr == nullptr) {
        throw std::invalid_argument("Cannot use a default-constructed continuous_output_batch object");
    }

    // NOTE: run the assertions only after ensuring this
    // is a valid object.
    // LCOV_EXCL_START
#if !defined(NDEBUG)
    // The batch size must not be zero.
    assert(m_batch_size != 0u);

    // m_batch_size must divide m_output exactly.
    assert(m_output.size() % m_batch_size == 0u);

    // m_tmp_tm must be of size m_batch_size.
    assert(m_tmp_tm.size() == m_batch_size);

    // m_batch_size must divide the time and tcs vectors exactly.
    assert(m_times_hi.size() % m_batch_size == 0u);
    assert(m_tcs.size() % m_batch_size == 0u);

    // Need at least 3 time points (2 + 1 for padding).
    assert(m_times_hi.size() / m_batch_size >= 3u);

    // hi/lo parts of times must have the same sizes.
    assert(m_times_hi.size() == m_times_lo.size());
#endif
    // LCOV_EXCL_STOP

    // Copy over the times to the temp buffer and check that they are finite.
    // NOTE: this copy ensures we avoid aliasing issues with the
    // other data members.
    for (std::uint32_t i = 0; i < m_batch_size; ++i) {
        if (!isfinite(t[i])) {
            throw std::invalid_argument("Cannot compute the continuous output in batch mode "
                                        "for the batch index {} at the non-finite time {}"_format(i, t[i]));
        }

        m_tmp_tm[i] = t[i];
    }

    m_f_ptr(m_output.data(), m_tmp_tm.data(), m_tcs.data(), m_times_hi.data(), m_times_lo.data());
}

template <typename T>
const std::vector<T> &continuous_output_batch<T>::operator()(const std::vector<T> &tm)
{
    if (m_f_ptr == nullptr) {
        throw std::invalid_argument("Cannot use a default-constructed continuous_output_batch object");
    }

    if (tm.size() != m_batch_size) {
        throw std::invalid_argument(
            "An invalid time vector was passed to the call operator of continuous_output_batch: the "
            "vector size is {}, but a size of {} was expected instead"_format(tm.size(), m_batch_size));
    }

    return (*this)(tm.data());
}

template <typename T>
const llvm_state &continuous_output_batch<T>::get_llvm_state() const
{
    return m_llvm_state;
}

template <typename T>
std::uint32_t continuous_output_batch<T>::get_batch_size() const
{
    return m_batch_size;
}

template <typename T>
void continuous_output_batch<T>::save(boost::archive::binary_oarchive &ar, unsigned) const
{
    ar << m_batch_size;
    ar << m_llvm_state;
    ar << m_tcs;
    ar << m_times_hi;
    ar << m_times_lo;
    ar << m_output;
    ar << m_tmp_tm;
}

template <typename T>
void continuous_output_batch<T>::load(boost::archive::binary_iarchive &ar, unsigned)
{
    ar >> m_batch_size;
    ar >> m_llvm_state;
    ar >> m_tcs;
    ar >> m_times_hi;
    ar >> m_times_lo;
    ar >> m_output;
    ar >> m_tmp_tm;

    // NOTE: if m_output is not empty, it means the archived
    // object had been initialised.
    if (m_output.empty()) {
        m_f_ptr = nullptr;
    } else {
        m_f_ptr = reinterpret_cast<fptr_t>(m_llvm_state.jit_lookup("c_out"));
    }
}

template <typename T>
std::pair<std::vector<T>, std::vector<T>> continuous_output_batch<T>::get_bounds() const
{
    if (m_f_ptr == nullptr) {
        throw std::invalid_argument("Cannot use a default-constructed continuous_output_batch object");
    }

    std::vector<T> lb, ub;
    lb.resize(boost::numeric_cast<decltype(lb.size())>(m_batch_size));
    ub.resize(boost::numeric_cast<decltype(ub.size())>(m_batch_size));

    for (std::uint32_t i = 0; i < m_batch_size; ++i) {
        lb[i] = m_times_hi[i];
        // NOTE: take into account the padding.
        ub[i] = m_times_hi[m_times_hi.size() - 2u * m_batch_size + i];
    }

    return std::make_pair(std::move(lb), std::move(ub));
}

template <typename T>
std::size_t continuous_output_batch<T>::get_n_steps() const
{
    if (m_f_ptr == nullptr) {
        throw std::invalid_argument("Cannot use a default-constructed continuous_output_batch object");
    }

    // NOTE: account for padding.
    return boost::numeric_cast<std::size_t>(m_times_hi.size() / m_batch_size - 2u);
}

// Explicit instantiations.
template class continuous_output_batch<double>;
template class continuous_output_batch<long double>;

#if defined(HEYOKA_HAVE_REAL128)

template class continuous_output_batch<mppp::real128>;

#endif

namespace detail
{

template <typename T>
std::ostream &c_out_batch_stream_impl(std::ostream &os, const continuous_output_batch<T> &co)
{
    std::ostringstream oss;
    oss.exceptions(std::ios_base::failbit | std::ios_base::badbit);
    oss.imbue(std::locale::classic());
    oss << std::showpoint;
    oss.precision(std::numeric_limits<T>::max_digits10);

    if (co.get_output().empty()) {
        oss << "Default-constructed continuous_output_batch";
    } else {
        const auto batch_size = co.m_batch_size;

        oss << "Directions : [";
        for (std::uint32_t i = 0; i < batch_size; ++i) {
            const detail::dfloat<T> df_t_start(co.m_times_hi[i], co.m_times_lo[i]),
                df_t_end(co.m_times_hi[co.m_times_hi.size() - 2u * batch_size + i],
                         co.m_times_lo[co.m_times_lo.size() - 2u * batch_size + i]);
            const auto dir = df_t_start < df_t_end;

            oss << (dir ? "forward" : "backward");

            if (i != batch_size - 1u) {
                oss << ", ";
            }
        }
        oss << "]\n";

        oss << "Time ranges: [";
        for (std::uint32_t i = 0; i < batch_size; ++i) {
            const detail::dfloat<T> df_t_start(co.m_times_hi[i], co.m_times_lo[i]),
                df_t_end(co.m_times_hi[co.m_times_hi.size() - 2u * batch_size + i],
                         co.m_times_lo[co.m_times_lo.size() - 2u * batch_size + i]);
            const auto dir = df_t_start < df_t_end;

            oss << (dir ? "[{}, {})"_format(df_t_start.hi, df_t_end.hi)
                        : "({}, {}]"_format(df_t_end.hi, df_t_start.hi));

            if (i != batch_size - 1u) {
                oss << ", ";
            }
        }
        oss << "]\n";

        oss << "N of steps : " << co.get_n_steps() << '\n';
    }

    return os << oss.str();
}

} // namespace detail

template <>
std::ostream &operator<<(std::ostream &os, const continuous_output_batch<double> &co)
{
    return detail::c_out_batch_stream_impl(os, co);
}

template <>
std::ostream &operator<<(std::ostream &os, const continuous_output_batch<long double> &co)
{
    return detail::c_out_batch_stream_impl(os, co);
}

#if defined(HEYOKA_HAVE_REAL128)

template <>
std::ostream &operator<<(std::ostream &os, const continuous_output_batch<mppp::real128> &co)
{
    return detail::c_out_batch_stream_impl(os, co);
}

#endif

} // namespace heyoka
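// NOTE: minimal usage sketch of the classes implemented above (illustrative only;
// it assumes a continuous_output<double> object 'co' obtained from a Taylor
// integrator, and a public call operator forwarding to call_impl(), as in the
// batch variant):
//
//   co(1.25);                                // evaluate the dense output at t = 1.25
//   const auto &vals = co.get_output();      // per-variable values at t = 1.25
//   const auto [t0, t1] = co.get_bounds();   // range of validity
//   const auto n = co.get_n_steps();         // number of stored steps
//
// The batch variant works the same way, but takes batch_size time values per call
// and produces batch_size results per state variable.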