1 // Copyright 2020, 2021 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
2 //
3 // This file is part of the heyoka library.
4 //
5 // This Source Code Form is subject to the terms of the Mozilla
6 // Public License v. 2.0. If a copy of the MPL was not distributed
7 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 
9 #include <heyoka/config.hpp>
10 
11 #include <algorithm>
12 #include <array>
13 #include <cassert>
14 #include <chrono> // NOTE: needed for the spdlog stopwatch.
15 #include <cmath>
16 #include <cstddef>
17 #include <cstdint>
18 #include <exception>
19 #include <functional>
20 #include <limits>
21 #include <map>
22 #include <optional>
23 #include <stdexcept>
24 #include <string>
25 #include <tuple>
26 #include <type_traits>
27 #include <typeinfo>
28 #include <unordered_map>
29 #include <utility>
30 #include <variant>
31 #include <vector>
32 
33 #include <boost/numeric/conversion/cast.hpp>
34 
35 #include <fmt/format.h>
36 #include <fmt/ostream.h>
37 
38 #include <spdlog/stopwatch.h>
39 
40 #include <llvm/IR/Attributes.h>
41 #include <llvm/IR/BasicBlock.h>
42 #include <llvm/IR/Constant.h>
43 #include <llvm/IR/Constants.h>
44 #include <llvm/IR/DerivedTypes.h>
45 #include <llvm/IR/Function.h>
46 #include <llvm/IR/GlobalVariable.h>
47 #include <llvm/IR/IRBuilder.h>
48 #include <llvm/IR/LLVMContext.h>
49 #include <llvm/IR/Type.h>
50 #include <llvm/IR/Value.h>
51 #include <llvm/Support/Casting.h>
52 
53 #if defined(HEYOKA_HAVE_REAL128)
54 
55 #include <mp++/real128.hpp>
56 
57 #endif
58 
59 #include <heyoka/detail/event_detection.hpp>
60 #include <heyoka/detail/llvm_fwd.hpp>
61 #include <heyoka/detail/llvm_helpers.hpp>
62 #include <heyoka/detail/llvm_vector_type.hpp>
63 #include <heyoka/detail/logging_impl.hpp>
64 #include <heyoka/detail/sleef.hpp>
65 #include <heyoka/detail/string_conv.hpp>
66 #include <heyoka/detail/type_traits.hpp>
67 #include <heyoka/detail/visibility.hpp>
68 #include <heyoka/exceptions.hpp>
69 #include <heyoka/expression.hpp>
70 #include <heyoka/func.hpp>
71 #include <heyoka/llvm_state.hpp>
72 #include <heyoka/number.hpp>
73 #include <heyoka/param.hpp>
74 #include <heyoka/s11n.hpp>
75 #include <heyoka/taylor.hpp>
76 #include <heyoka/variable.hpp>
77 
78 #if defined(_MSC_VER) && !defined(__clang__)
79 
80 // NOTE: MSVC has issues with the other "using"
81 // statement form.
82 using namespace fmt::literals;
83 
84 #else
85 
86 using fmt::literals::operator""_format;
87 
88 #endif
89 
90 namespace heyoka
91 {
92 
93 namespace detail
94 {
95 
96 namespace
97 {
98 
99 // RAII helper to temporarily set the opt level to 0 in an llvm_state.
100 struct opt_disabler {
101     llvm_state &m_s;
102     unsigned m_orig_opt_level;
103 
opt_disablerheyoka::detail::__anon89701e160111::opt_disabler104     explicit opt_disabler(llvm_state &s) : m_s(s), m_orig_opt_level(s.opt_level())
105     {
106         // Disable optimisations.
107         m_s.opt_level() = 0;
108     }
109 
110     opt_disabler(const opt_disabler &) = delete;
111     opt_disabler(opt_disabler &&) noexcept = delete;
112     opt_disabler &operator=(const opt_disabler &) = delete;
113     opt_disabler &operator=(opt_disabler &&) noexcept = delete;
114 
~opt_disablerheyoka::detail::__anon89701e160111::opt_disabler115     ~opt_disabler()
116     {
117         // Restore the original optimisation level.
118         m_s.opt_level() = m_orig_opt_level;
119     }
120 };
121 
// Helper to determine the optimal Taylor order for a given tolerance,
// following Jorba's prescription.
// The input tolerance must be a positive finite value of the floating-point
// type T; the result is clamped to a minimum order of 2.
template <typename T>
std::uint32_t taylor_order_from_tol(T tol)
{
    using std::ceil;
    using std::isfinite;
    using std::log;

    // Jorba's prescription: order = ceil(-log(tol) / 2 + 1).
    auto candidate = ceil(-log(tol) / 2 + 1);

    // LCOV_EXCL_START
    if (!isfinite(candidate)) {
        throw std::invalid_argument(
            "The computation of the Taylor order in an adaptive Taylor stepper produced a non-finite value");
    }
    // LCOV_EXCL_STOP

    // Clamp from below: the minimum usable order is 2.
    candidate = std::max(T(2), candidate);

    // Ensure the result is representable as a std::uint32_t.
    // NOTE: cast to double as that ensures that the
    // max of std::uint32_t is exactly representable.
    // LCOV_EXCL_START
    if (candidate > static_cast<double>(std::numeric_limits<std::uint32_t>::max())) {
        throw std::overflow_error("The computation of the Taylor order in an adaptive Taylor stepper resulted "
                                  "in an overflow condition");
    }
    // LCOV_EXCL_STOP

    return static_cast<std::uint32_t>(candidate);
}
152 
153 // Helper to compute max(x_v, abs(y_v)) in the Taylor stepper implementation.
taylor_step_maxabs(llvm_state & s,llvm::Value * x_v,llvm::Value * y_v)154 llvm::Value *taylor_step_maxabs(llvm_state &s, llvm::Value *x_v, llvm::Value *y_v)
155 {
156     return llvm_max(s, x_v, llvm_abs(s, y_v));
157 }
158 
159 // Helper to compute min(x_v, abs(y_v)) in the Taylor stepper implementation.
taylor_step_minabs(llvm_state & s,llvm::Value * x_v,llvm::Value * y_v)160 llvm::Value *taylor_step_minabs(llvm_state &s, llvm::Value *x_v, llvm::Value *y_v)
161 {
162     return llvm_min(s, x_v, llvm_abs(s, y_v));
163 }
164 
// Helper to compute pow(x_v, y_v) in the Taylor stepper implementation.
// x_v and y_v are assumed to share the same (scalar or SIMD vector)
// floating-point type. The implementation dispatches, in order of preference:
// - the external powq() function when the scalar type is __float128
//   (only if real128 support is enabled),
// - a SLEEF vectorised pow(), if one is available for the vector type,
// - the llvm.pow intrinsic otherwise.
llvm::Value *taylor_step_pow(llvm_state &s, llvm::Value *x_v, llvm::Value *y_v)
{
#if defined(HEYOKA_HAVE_REAL128)
    // Determine the scalar type of the vector arguments.
    auto *x_t = x_v->getType()->getScalarType();

    if (x_t == llvm::Type::getFP128Ty(s.context())) {
        // __float128: delegate to the external powq() implementation.
        return call_extern_vec(s, x_v, y_v, "powq");
    } else {
#endif
        // If we are operating on SIMD vectors, try to see if we have a sleef
        // function available for pow().
        if (auto *vec_t = llvm::dyn_cast<llvm_vector_type>(x_v->getType())) {
            // NOTE: if sfn ends up empty, we will be falling through
            // below and use the LLVM intrinsic instead.
            if (const auto sfn = sleef_function_name(s.context(), "pow", vec_t->getElementType(),
                                                     boost::numeric_cast<std::uint32_t>(vec_t->getNumElements()));
                !sfn.empty()) {
                return llvm_invoke_external(
                    s, sfn, vec_t, {x_v, y_v},
                    // NOTE: in theory we may add ReadNone here as well,
                    // but for some reason, at least up to LLVM 10,
                    // this causes strange codegen issues. Revisit
                    // in the future.
                    {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn});
            }
        }

        // Fallback: the llvm.pow intrinsic (works for both scalars and vectors).
        return llvm_invoke_intrinsic(s, "llvm.pow", {x_v->getType()}, {x_v, y_v});
#if defined(HEYOKA_HAVE_REAL128)
    }
#endif
}
199 
200 // Create a global read-only array containing the values in sv_funcs_dc, if any
201 // (otherwise, the return value will be null). This is for use in the adaptive steppers
202 // when employing compact mode.
taylor_c_make_sv_funcs_arr(llvm_state & s,const std::vector<std::uint32_t> & sv_funcs_dc)203 llvm::Value *taylor_c_make_sv_funcs_arr(llvm_state &s, const std::vector<std::uint32_t> &sv_funcs_dc)
204 {
205     auto &builder = s.builder();
206 
207     if (sv_funcs_dc.empty()) {
208         return nullptr;
209     } else {
210         auto *arr_type
211             = llvm::ArrayType::get(builder.getInt32Ty(), boost::numeric_cast<std::uint64_t>(sv_funcs_dc.size()));
212         std::vector<llvm::Constant *> sv_funcs_dc_const;
213         sv_funcs_dc_const.reserve(sv_funcs_dc.size());
214         for (auto idx : sv_funcs_dc) {
215             sv_funcs_dc_const.emplace_back(builder.getInt32(idx));
216         }
217         auto *sv_funcs_dc_arr = llvm::ConstantArray::get(arr_type, sv_funcs_dc_const);
218         auto *g_sv_funcs_dc = new llvm::GlobalVariable(s.module(), sv_funcs_dc_arr->getType(), true,
219                                                        llvm::GlobalVariable::InternalLinkage, sv_funcs_dc_arr);
220 
221         // Get out a pointer to the beginning of the array.
222         assert(llvm_depr_GEP_type_check(g_sv_funcs_dc, arr_type)); // LCOV_EXCL_LINE
223         return builder.CreateInBoundsGEP(arr_type, g_sv_funcs_dc, {builder.getInt32(0), builder.getInt32(0)});
224     }
225 }
226 
// Helper to generate the LLVM code to determine the timestep in an adaptive Taylor integrator,
// following Jorba's prescription. diff_variant is the output of taylor_compute_jet(), and it contains
// the jet of derivatives for the state variables and the sv_funcs. h_ptr is a pointer containing
// the clamping values for the timesteps. svf_ptr is a pointer to the first element of an LLVM array containing the
// values in sv_funcs_dc. If max_abs_state_ptr is not nullptr, the computed norm infinity of the
// state vector (including sv_funcs, if any) will be written into it.
// T is the scalar floating-point type in use. The return value is the signed
// timestep, as an LLVM vector of batch_size elements (negative for backwards
// propagation, as determined by the sign of the values loaded from h_ptr).
template <typename T>
llvm::Value *taylor_determine_h(llvm_state &s,
                                const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_variant,
                                const std::vector<std::uint32_t> &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *h_ptr,
                                std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order,
                                std::uint32_t batch_size, llvm::Value *max_abs_state_ptr)
{
    assert(batch_size != 0u);
#if !defined(NDEBUG)
    if (diff_variant.index() == 0u) {
        // Compact mode.
        assert(sv_funcs_dc.empty() == !svf_ptr);
    } else {
        // Non-compact mode.
        assert(svf_ptr == nullptr);
    }
#endif

    using std::exp;

    auto &builder = s.builder();
    auto &context = s.context();

    llvm::Value *max_abs_state = nullptr, *max_abs_diff_o = nullptr, *max_abs_diff_om1 = nullptr;

    if (diff_variant.index() == 0u) {
        // Compact mode: the derivatives are stored in a single LLVM array,
        // and the reductions below are emitted as IR loops.
        auto *diff_arr = std::get<llvm::Value *>(diff_variant);

        // These will end up containing the norm infinity of the state vector + sv_funcs and the
        // norm infinity of the derivatives at orders order and order - 1.
        auto vec_t = to_llvm_vector_type<T>(context, batch_size);
        max_abs_state = builder.CreateAlloca(vec_t);
        max_abs_diff_o = builder.CreateAlloca(vec_t);
        max_abs_diff_om1 = builder.CreateAlloca(vec_t);

        // Initialise with the abs(derivatives) of the first state variable at orders 0, 'order' and 'order - 1'.
        builder.CreateStore(
            llvm_abs(s, taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), builder.getInt32(0))),
            max_abs_state);
        builder.CreateStore(
            llvm_abs(s, taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), builder.getInt32(0))),
            max_abs_diff_o);
        builder.CreateStore(
            llvm_abs(s, taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order - 1u), builder.getInt32(0))),
            max_abs_diff_om1);

        // Iterate over the variables to compute the norm infinities.
        // NOTE: the loop starts from 1 because variable 0 was used for the
        // initialisation above.
        llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(n_eq), [&](llvm::Value *cur_idx) {
            builder.CreateStore(
                taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_state),
                                   taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), cur_idx)),
                max_abs_state);
            builder.CreateStore(
                taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_o),
                                   taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), cur_idx)),
                max_abs_diff_o);
            builder.CreateStore(
                taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_om1),
                                   taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order - 1u), cur_idx)),
                max_abs_diff_om1);
        });

        if (svf_ptr != nullptr) {
            // Consider also the functions of state variables for
            // the computation of the timestep.
            llvm_loop_u32(
                s, builder.getInt32(0), builder.getInt32(boost::numeric_cast<std::uint32_t>(sv_funcs_dc.size())),
                [&](llvm::Value *arr_idx) {
                    // Fetch the index value from the array.
                    assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
                    auto cur_idx = builder.CreateLoad(
                        builder.getInt32Ty(), builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));

                    builder.CreateStore(
                        taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_state),
                                           taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), cur_idx)),
                        max_abs_state);
                    builder.CreateStore(
                        taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_o),
                                           taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), cur_idx)),
                        max_abs_diff_o);
                    builder.CreateStore(taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_om1),
                                                           taylor_c_load_diff(s, diff_arr, n_uvars,
                                                                              builder.getInt32(order - 1u), cur_idx)),
                                        max_abs_diff_om1);
                });
        }

        // Load the values for later use.
        max_abs_state = builder.CreateLoad(vec_t, max_abs_state);
        max_abs_diff_o = builder.CreateLoad(vec_t, max_abs_diff_o);
        max_abs_diff_om1 = builder.CreateLoad(vec_t, max_abs_diff_om1);
    } else {
        // Non-compact mode: the derivatives are available as individual LLVM
        // values, and the reductions are unrolled at code generation time.
        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);

        const auto n_sv_funcs = static_cast<std::uint32_t>(sv_funcs_dc.size());

        // Compute the norm infinity of the state vector and the norm infinity of the derivatives
        // at orders order and order - 1. We first create vectors of absolute values and then
        // compute their maxima.
        std::vector<llvm::Value *> v_max_abs_state, v_max_abs_diff_o, v_max_abs_diff_om1;

        // NOTE: iterate up to n_eq + n_sv_funcs in order to
        // consider also the functions of state variables for
        // the computation of the timestep.
        for (std::uint32_t i = 0; i < n_eq + n_sv_funcs; ++i) {
            v_max_abs_state.push_back(llvm_abs(s, diff_arr[i]));
            // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
            // state variables and sv funcs (not all u vars), hence the indexing is
            // order * (n_eq + n_sv_funcs).
            v_max_abs_diff_o.push_back(llvm_abs(s, diff_arr[order * (n_eq + n_sv_funcs) + i]));
            v_max_abs_diff_om1.push_back(llvm_abs(s, diff_arr[(order - 1u) * (n_eq + n_sv_funcs) + i]));
        }

        // Find the maxima via pairwise reduction.
        auto reducer = [&s](llvm::Value *a, llvm::Value *b) -> llvm::Value * { return llvm_max(s, a, b); };
        max_abs_state = pairwise_reduce(v_max_abs_state, reducer);
        max_abs_diff_o = pairwise_reduce(v_max_abs_diff_o, reducer);
        max_abs_diff_om1 = pairwise_reduce(v_max_abs_diff_om1, reducer);
    }

    // Store max_abs_state, if requested.
    if (max_abs_state_ptr != nullptr) {
        store_vector_to_memory(builder, max_abs_state_ptr, max_abs_state);
    }

    // Determine if we are in absolute or relative tolerance mode:
    // absolute if the norm infinity of the state is <= 1, relative otherwise.
    auto abs_or_rel
        = builder.CreateFCmpOLE(max_abs_state, vector_splat(builder, codegen<T>(s, number{1.}), batch_size));

    // Estimate rho at orders order - 1 and order.
    // In absolute tolerance mode the numerator is 1, otherwise it is the
    // norm infinity of the state.
    auto num_rho
        = builder.CreateSelect(abs_or_rel, vector_splat(builder, codegen<T>(s, number{1.}), batch_size), max_abs_state);
    auto rho_o = taylor_step_pow(s, builder.CreateFDiv(num_rho, max_abs_diff_o),
                                 vector_splat(builder, codegen<T>(s, number{T(1) / order}), batch_size));
    auto rho_om1 = taylor_step_pow(s, builder.CreateFDiv(num_rho, max_abs_diff_om1),
                                   vector_splat(builder, codegen<T>(s, number{T(1) / (order - 1u)}), batch_size));

    // Take the minimum.
    auto rho_m = llvm_min(s, rho_o, rho_om1);

    // Compute the scaling + safety factor.
    // NOTE: this is a compile-time constant: exp((-7/10)/(order-1)) / e**2.
    const auto rhofac = exp((T(-7) / T(10)) / (order - 1u)) / (exp(T(1)) * exp(T(1)));

    // Determine the step size in absolute value.
    auto h = builder.CreateFMul(rho_m, vector_splat(builder, codegen<T>(s, number{rhofac}), batch_size));

    // Ensure that the step size does not exceed the limit in absolute value.
    auto *max_h_vec = load_vector_from_memory(builder, h_ptr, batch_size);
    h = taylor_step_minabs(s, h, max_h_vec);

    // Handle backwards propagation: a negative clamping value flips
    // the sign of the computed timestep.
    auto backward = builder.CreateFCmpOLT(max_h_vec, vector_splat(builder, codegen<T>(s, number{0.}), batch_size));
    auto h_fac = builder.CreateSelect(backward, vector_splat(builder, codegen<T>(s, number{-1.}), batch_size),
                                      vector_splat(builder, codegen<T>(s, number{1.}), batch_size));
    h = builder.CreateFMul(h_fac, h);

    return h;
}
394 
395 // Store the value val as the derivative of order 'order' of the u variable u_idx
396 // into the array of Taylor derivatives diff_arr. n_uvars is the total number of u variables.
taylor_c_store_diff(llvm_state & s,llvm::Value * diff_arr,std::uint32_t n_uvars,llvm::Value * order,llvm::Value * u_idx,llvm::Value * val)397 void taylor_c_store_diff(llvm_state &s, llvm::Value *diff_arr, std::uint32_t n_uvars, llvm::Value *order,
398                          llvm::Value *u_idx, llvm::Value *val)
399 {
400     auto &builder = s.builder();
401 
402     // NOTE: overflow check has already been done to ensure that the
403     // total size of diff_arr fits in a 32-bit unsigned integer.
404     assert(llvm_depr_GEP_type_check(diff_arr, pointee_type(diff_arr))); // LCOV_EXCL_LINE
405     auto *ptr
406         = builder.CreateInBoundsGEP(pointee_type(diff_arr), diff_arr,
407                                     builder.CreateAdd(builder.CreateMul(order, builder.getInt32(n_uvars)), u_idx));
408 
409     builder.CreateStore(val, ptr);
410 }
411 
412 // Compute the derivative of order "order" of a state variable.
413 // ex is the formula for the first-order derivative of the state variable (which
414 // is either a u variable or a number/param), n_uvars the number of variables in
415 // the decomposition, arr the array containing the derivatives of all u variables
416 // up to order - 1.
417 template <typename T>
taylor_compute_sv_diff(llvm_state & s,const expression & ex,const std::vector<llvm::Value * > & arr,llvm::Value * par_ptr,std::uint32_t n_uvars,std::uint32_t order,std::uint32_t batch_size)418 llvm::Value *taylor_compute_sv_diff(llvm_state &s, const expression &ex, const std::vector<llvm::Value *> &arr,
419                                     llvm::Value *par_ptr, std::uint32_t n_uvars, std::uint32_t order,
420                                     std::uint32_t batch_size)
421 {
422     assert(order > 0u);
423 
424     auto &builder = s.builder();
425 
426     return std::visit(
427         [&](const auto &v) -> llvm::Value * {
428             using type = uncvref_t<decltype(v)>;
429 
430             if constexpr (std::is_same_v<type, variable>) {
431                 // Extract the index of the u variable in the expression
432                 // of the first-order derivative.
433                 const auto u_idx = uname_to_index(v.name());
434 
435                 // Fetch from arr the derivative
436                 // of order 'order - 1' of the u variable at u_idx. The index is:
437                 // (order - 1) * n_uvars + u_idx.
438                 auto ret = taylor_fetch_diff(arr, u_idx, order - 1u, n_uvars);
439 
440                 // We have to divide the derivative by order
441                 // to get the normalised derivative of the state variable.
442                 return builder.CreateFDiv(
443                     ret, vector_splat(builder, codegen<T>(s, number(static_cast<T>(order))), batch_size));
444             } else if constexpr (std::is_same_v<type, number> || std::is_same_v<type, param>) {
445                 // The first-order derivative is a constant.
446                 // If the first-order derivative is being requested,
447                 // do the codegen for the constant itself, otherwise
448                 // return 0. No need for normalization as the only
449                 // nonzero value that can be produced here is the first-order
450                 // derivative.
451                 if (order == 1u) {
452                     return taylor_codegen_numparam<T>(s, v, par_ptr, batch_size);
453                 } else {
454                     return vector_splat(builder, codegen<T>(s, number{0.}), batch_size);
455                 }
456             } else {
457                 assert(false);
458                 return nullptr;
459             }
460         },
461         ex.value());
462 }
463 
// Function to split the central part of the decomposition (i.e., the definitions of the u variables
// that do not represent state variables) into parallelisable segments. Within a segment,
// the definition of a u variable does not depend on any u variable defined within that segment.
// NOTE: the hidden deps are not considered as dependencies.
// NOTE: the segments in the return value will contain shallow copies of the
// expressions in dc.
std::vector<taylor_dc_t> taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t n_eq)
{
    // Log runtime in trace mode.
    spdlog::stopwatch sw;

    // Helper that takes in input the definition ex of a u variable, and returns
    // in output the list of indices of the u variables on which ex depends.
    auto udef_args_indices = [](const expression &ex) -> std::vector<std::uint32_t> {
        return std::visit(
            [](const auto &v) -> std::vector<std::uint32_t> {
                using type = detail::uncvref_t<decltype(v)>;

                if constexpr (std::is_same_v<type, func>) {
                    std::vector<std::uint32_t> retval;

                    // Collect the index of every variable argument; number and
                    // param arguments do not introduce dependencies.
                    for (const auto &arg : v.args()) {
                        std::visit(
                            [&retval](const auto &x) {
                                using tp = detail::uncvref_t<decltype(x)>;

                                if constexpr (std::is_same_v<tp, variable>) {
                                    retval.push_back(uname_to_index(x.name()));
                                } else if constexpr (!std::is_same_v<tp, number> && !std::is_same_v<tp, param>) {
                                    throw std::invalid_argument(
                                        "Invalid argument encountered in an element of a Taylor decomposition: the "
                                        "argument is not a variable or a number/param");
                                }
                            },
                            arg.value());
                    }

                    return retval;
                } else {
                    throw std::invalid_argument("Invalid expression encountered in a Taylor decomposition: the "
                                                "expression is not a function");
                }
            },
            ex.value());
    };

    // Init the return value.
    std::vector<taylor_dc_t> s_dc;

    // cur_limit_idx is initially the index of the first
    // u variable which is not a state variable.
    // NOTE: the iteration range [n_eq, dc.size() - n_eq) covers only the
    // central part of the decomposition: the first n_eq entries are the state
    // variables and the last n_eq entries are their time derivatives.
    auto cur_limit_idx = n_eq;
    for (std::uint32_t i = n_eq; i < dc.size() - n_eq; ++i) {
        // NOTE: at the very first iteration of this for loop,
        // no block has been created yet. Do it now.
        if (i == n_eq) {
            assert(s_dc.empty());
            s_dc.emplace_back();
        } else {
            assert(!s_dc.empty());
        }

        const auto &[ex, deps] = dc[i];

        // Determine the u indices on which ex depends.
        const auto u_indices = udef_args_indices(ex);

        if (std::any_of(u_indices.begin(), u_indices.end(),
                        [cur_limit_idx](auto idx) { return idx >= cur_limit_idx; })) {
            // The current expression depends on one or more variables
            // within the current block. Start a new block and
            // update cur_limit_idx with the start index of the new block.
            s_dc.emplace_back();
            cur_limit_idx = i;
        }

        // Append ex to the current block.
        s_dc.back().emplace_back(ex, deps);
    }

#if !defined(NDEBUG)
    // Verify s_dc.

    decltype(dc.size()) counter = 0;
    for (const auto &s : s_dc) {
        // No segment can be empty.
        assert(!s.empty());

        for (const auto &[ex, _] : s) {
            // All the indices in the definitions of the
            // u variables in the current block must be
            // less than counter + n_eq (which is the starting
            // index of the block).
            const auto u_indices = udef_args_indices(ex);
            assert(std::all_of(u_indices.begin(), u_indices.end(),
                               [idx_limit = counter + n_eq](auto idx) { return idx < idx_limit; }));
        }

        // Update the counter.
        counter += s.size();
    }

    // The segments must cover exactly the central part of the decomposition.
    assert(counter == dc.size() - n_eq * 2u);
#endif

    get_logger()->debug("Taylor N of segments: {}", s_dc.size());
    get_logger()->trace("Taylor segment runtime: {}", sw);

    return s_dc;
}
574 
575 // Small helper to compute the size of a global array.
taylor_c_gl_arr_size(llvm::Value * v)576 std::uint32_t taylor_c_gl_arr_size(llvm::Value *v)
577 {
578     assert(llvm::isa<llvm::GlobalVariable>(v));
579 
580     return boost::numeric_cast<std::uint32_t>(
581         llvm::cast<llvm::ArrayType>(llvm::cast<llvm::PointerType>(v->getType())->getElementType())->getNumElements());
582 }
583 
// Helper to construct the global arrays needed for the computation of the
// derivatives of the state variables in compact mode. The first part of the
// return value is a set of 6 arrays:
// - the indices of the state variables whose time derivative is a u variable, paired to
// - the indices of the u variables appearing in the derivatives, and
// - the indices of the state variables whose time derivative is a constant, paired to
// - the values of said constants, and
// - the indices of the state variables whose time derivative is a param, paired to
// - the indices of the params.
// The second part of the return value is a boolean flag that will be true if
// the time derivatives of all state variables are u variables, false otherwise.
// T is the scalar floating-point type used for the codegen of the constants.
template <typename T>
std::pair<std::array<llvm::GlobalVariable *, 6>, bool>
taylor_c_make_sv_diff_globals(llvm_state &s, const taylor_dc_t &dc, std::uint32_t n_uvars)
{
    auto &context = s.context();
    auto &builder = s.builder();
    auto &module = s.module();

    // Build iteratively the output values as vectors of constants.
    std::vector<llvm::Constant *> var_indices, vars, num_indices, nums, par_indices, pars;

    // Keep track of how many time derivatives
    // of the state variables are u variables.
    std::uint32_t n_der_vars = 0;

    // NOTE: the derivatives of the state variables are at the end of the decomposition.
    for (auto i = n_uvars; i < boost::numeric_cast<std::uint32_t>(dc.size()); ++i) {
        std::visit(
            [&](const auto &v) {
                using type = uncvref_t<decltype(v)>;

                if constexpr (std::is_same_v<type, variable>) {
                    ++n_der_vars;
                    // NOTE: remove from i the n_uvars offset to get the
                    // true index of the state variable.
                    var_indices.push_back(builder.getInt32(i - n_uvars));
                    vars.push_back(builder.getInt32(uname_to_index(v.name())));
                } else if constexpr (std::is_same_v<type, number>) {
                    num_indices.push_back(builder.getInt32(i - n_uvars));
                    nums.push_back(llvm::cast<llvm::Constant>(codegen<T>(s, v)));
                } else if constexpr (std::is_same_v<type, param>) {
                    par_indices.push_back(builder.getInt32(i - n_uvars));
                    pars.push_back(builder.getInt32(v.idx()));
                } else {
                    // A state variable derivative can only be a variable,
                    // a number or a param.
                    assert(false);
                }
            },
            dc[i].first.value());
    }

    // Flag to signal that the time derivatives of all state variables are u variables.
    assert(dc.size() >= n_uvars);
    const auto all_der_vars = (n_der_vars == (dc.size() - n_uvars));

    // The paired vectors must have matching sizes by construction.
    assert(var_indices.size() == vars.size());
    assert(num_indices.size() == nums.size());
    assert(par_indices.size() == pars.size());

    // Turn the vectors into global read-only LLVM arrays.

    // Variables.
    auto *var_arr_type
        = llvm::ArrayType::get(llvm::Type::getInt32Ty(context), boost::numeric_cast<std::uint64_t>(var_indices.size()));

    auto *var_indices_arr = llvm::ConstantArray::get(var_arr_type, var_indices);
    auto *g_var_indices = new llvm::GlobalVariable(module, var_indices_arr->getType(), true,
                                                   llvm::GlobalVariable::InternalLinkage, var_indices_arr);

    auto *vars_arr = llvm::ConstantArray::get(var_arr_type, vars);
    auto *g_vars
        = new llvm::GlobalVariable(module, vars_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, vars_arr);

    // Numbers.
    auto *num_indices_arr_type
        = llvm::ArrayType::get(llvm::Type::getInt32Ty(context), boost::numeric_cast<std::uint64_t>(num_indices.size()));
    auto *num_indices_arr = llvm::ConstantArray::get(num_indices_arr_type, num_indices);
    auto *g_num_indices = new llvm::GlobalVariable(module, num_indices_arr->getType(), true,
                                                   llvm::GlobalVariable::InternalLinkage, num_indices_arr);

    // NOTE: the constant values use the LLVM counterpart of the scalar type T.
    auto nums_arr_type
        = llvm::ArrayType::get(to_llvm_type<T>(context), boost::numeric_cast<std::uint64_t>(nums.size()));
    auto nums_arr = llvm::ConstantArray::get(nums_arr_type, nums);
    // NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
    auto *g_nums
        = new llvm::GlobalVariable(module, nums_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, nums_arr);

    // Params.
    auto *par_arr_type
        = llvm::ArrayType::get(llvm::Type::getInt32Ty(context), boost::numeric_cast<std::uint64_t>(par_indices.size()));

    auto *par_indices_arr = llvm::ConstantArray::get(par_arr_type, par_indices);
    auto *g_par_indices = new llvm::GlobalVariable(module, par_indices_arr->getType(), true,
                                                   llvm::GlobalVariable::InternalLinkage, par_indices_arr);

    auto *pars_arr = llvm::ConstantArray::get(par_arr_type, pars);
    auto *g_pars
        = new llvm::GlobalVariable(module, pars_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, pars_arr);

    return std::pair{std::array{g_var_indices, g_vars, g_num_indices, g_nums, g_par_indices, g_pars}, all_der_vars};
}
685 
// Helper to compute and store the derivatives of the state variables in compact mode at order 'order'.
// svd_gl is the return value of taylor_c_make_sv_diff_globals(), which contains
// the indices/constants necessary for the computation, plus a flag signalling
// that the time derivatives of *all* state variables are u variables.
//
// diff_arr is the array of Taylor derivatives of the u variables, par_ptr the
// pointer to the runtime parameter values, n_uvars the total number of
// u variables, order the (runtime) derivative order being computed and
// batch_size the SIMD batch size.
void taylor_c_compute_sv_diffs(llvm_state &s, const std::pair<std::array<llvm::GlobalVariable *, 6>, bool> &svd_gl,
                               llvm::Value *diff_arr, llvm::Value *par_ptr, std::uint32_t n_uvars, llvm::Value *order,
                               std::uint32_t batch_size)
{
    assert(batch_size > 0u);

    // Fetch the global arrays and
    // the all_der_vars flag.
    const auto &sv_diff_gl = svd_gl.first;
    const auto all_der_vars = svd_gl.second;

    auto &builder = s.builder();

    // Recover the number of state variables whose derivatives are given
    // by u variables, numbers and params.
    const auto n_vars = taylor_c_gl_arr_size(sv_diff_gl[0]);
    const auto n_nums = taylor_c_gl_arr_size(sv_diff_gl[2]);
    const auto n_pars = taylor_c_gl_arr_size(sv_diff_gl[4]);

    // Fetch the type stored in the array of derivatives.
    auto *fp_vec_t = pointee_type(diff_arr);

    // Handle the state variables whose time derivatives are u variables.
    llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_vars), [&](llvm::Value *cur_idx) {
        // Fetch the index of the state variable.
        // NOTE: if the time derivatives of all state variables are u variables, there's
        // no need to lookup the index in the global array (which will just contain
        // the values in the [0, n_vars] range).
        auto *sv_idx = all_der_vars
                           ? cur_idx
                           : builder.CreateLoad(builder.getInt32Ty(),
                                                builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[0]), sv_diff_gl[0],
                                                                          {builder.getInt32(0), cur_idx}));

        // Fetch the index of the u variable.
        auto *u_idx = builder.CreateLoad(
            builder.getInt32Ty(),
            builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[1]), sv_diff_gl[1], {builder.getInt32(0), cur_idx}));

        // Fetch from diff_arr the derivative of order 'order - 1' of the u variable u_idx.
        auto *ret = taylor_c_load_diff(s, diff_arr, n_uvars, builder.CreateSub(order, builder.getInt32(1)), u_idx);

        // We have to divide the derivative by 'order' in order
        // to get the normalised derivative of the state variable.
        ret = builder.CreateFDiv(
            ret, vector_splat(builder, builder.CreateUIToFP(order, fp_vec_t->getScalarType()), batch_size));

        // Store the derivative.
        taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx, ret);
    });

    // Handle the state variables whose time derivatives are numbers.
    llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_nums), [&](llvm::Value *cur_idx) {
        // Fetch the index of the state variable.
        auto *sv_idx = builder.CreateLoad(
            builder.getInt32Ty(),
            builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[2]), sv_diff_gl[2], {builder.getInt32(0), cur_idx}));

        // Fetch the constant (as a scalar value).
        auto *num = builder.CreateLoad(
            fp_vec_t->getScalarType(),
            builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[3]), sv_diff_gl[3], {builder.getInt32(0), cur_idx}));

        // If the first-order derivative is being requested,
        // do the codegen for the constant itself, otherwise
        // return 0. No need for normalization as the only
        // nonzero value that can be produced here is the first-order
        // derivative.
        auto *cmp_cond = builder.CreateICmpEQ(order, builder.getInt32(1));
        auto ret = builder.CreateSelect(cmp_cond, vector_splat(builder, num, batch_size),
                                        llvm::ConstantFP::get(fp_vec_t, 0.));

        // Store the derivative.
        taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx, ret);
    });

    // Handle the state variables whose time derivatives are params.
    llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_pars), [&](llvm::Value *cur_idx) {
        // Fetch the index of the state variable.
        auto *sv_idx = builder.CreateLoad(
            builder.getInt32Ty(),
            builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[4]), sv_diff_gl[4], {builder.getInt32(0), cur_idx}));

        // Fetch the index of the param.
        auto *par_idx = builder.CreateLoad(
            builder.getInt32Ty(),
            builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[5]), sv_diff_gl[5], {builder.getInt32(0), cur_idx}));

        // If the first-order derivative is being requested,
        // do the codegen for the constant itself, otherwise
        // return 0. No need for normalization as the only
        // nonzero value that can be produced here is the first-order
        // derivative.
        llvm_if_then_else(
            s, builder.CreateICmpEQ(order, builder.getInt32(1)),
            [&]() {
                // Derivative of order 1. Fetch the value from par_ptr.
                // NOTE: param{0} is unused, its only purpose is type tagging.
                taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx,
                                    taylor_c_diff_numparam_codegen(s, param{0}, par_idx, par_ptr, batch_size));
            },
            [&]() {
                // Derivative of order > 1, return 0.
                taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx, llvm::ConstantFP::get(fp_vec_t, 0.));
            });
    });
}
796 
797 // Helper to convert the arguments of the definition of a u variable
798 // into a vector of variants. u variables will be converted to their indices,
799 // numbers will be unchanged, parameters will be converted to their indices.
800 // The hidden deps will also be converted to indices.
taylor_udef_to_variants(const expression & ex,const std::vector<std::uint32_t> & deps)801 auto taylor_udef_to_variants(const expression &ex, const std::vector<std::uint32_t> &deps)
802 {
803     return std::visit(
804         [&deps](const auto &v) -> std::vector<std::variant<std::uint32_t, number>> {
805             using type = detail::uncvref_t<decltype(v)>;
806 
807             if constexpr (std::is_same_v<type, func>) {
808                 std::vector<std::variant<std::uint32_t, number>> retval;
809 
810                 for (const auto &arg : v.args()) {
811                     std::visit(
812                         [&retval](const auto &x) {
813                             using tp = detail::uncvref_t<decltype(x)>;
814 
815                             if constexpr (std::is_same_v<tp, variable>) {
816                                 retval.emplace_back(uname_to_index(x.name()));
817                             } else if constexpr (std::is_same_v<tp, number>) {
818                                 retval.emplace_back(x);
819                             } else if constexpr (std::is_same_v<tp, param>) {
820                                 retval.emplace_back(x.idx());
821                             } else {
822                                 throw std::invalid_argument(
823                                     "Invalid argument encountered in an element of a Taylor decomposition: the "
824                                     "argument is not a variable or a number");
825                             }
826                         },
827                         arg.value());
828                 }
829 
830                 // Handle the hidden deps.
831                 for (auto idx : deps) {
832                     retval.emplace_back(idx);
833                 }
834 
835                 return retval;
836             } else {
837                 throw std::invalid_argument("Invalid expression encountered in a Taylor decomposition: the "
838                                             "expression is not a function");
839             }
840         },
841         ex.value());
842 }
843 
844 // Helper to convert a vector of variants into a variant of vectors.
845 // All elements of v must be of the same type, and v cannot be empty.
846 template <typename... T>
taylor_c_vv_transpose(const std::vector<std::variant<T...>> & v)847 auto taylor_c_vv_transpose(const std::vector<std::variant<T...>> &v)
848 {
849     assert(!v.empty());
850 
851     // Init the return value based on the type
852     // of the first element of v.
853     auto retval = std::visit(
854         [size = v.size()](const auto &x) {
855             using type = detail::uncvref_t<decltype(x)>;
856 
857             std::vector<type> tmp;
858             tmp.reserve(boost::numeric_cast<decltype(tmp.size())>(size));
859             tmp.push_back(x);
860 
861             return std::variant<std::vector<T>...>(std::move(tmp));
862         },
863         v[0]);
864 
865     // Append the other values from v.
866     for (decltype(v.size()) i = 1; i < v.size(); ++i) {
867         std::visit(
868             [&retval](const auto &x) {
869                 std::visit(
870                     [&x](auto &vv) {
871                         // The value type of retval.
872                         using scal_t = typename detail::uncvref_t<decltype(vv)>::value_type;
873 
874                         // The type of the current element of v.
875                         using x_t = detail::uncvref_t<decltype(x)>;
876 
877                         if constexpr (std::is_same_v<scal_t, x_t>) {
878                             vv.push_back(x);
879                         } else {
880                             throw std::invalid_argument(
881                                 "Inconsistent types detected while building the transposed sets of "
882                                 "arguments for the Taylor derivative functions in compact mode");
883                         }
884                     },
885                     retval);
886             },
887             v[i]);
888     }
889 
890     return retval;
891 }
892 
// Helper to check if a vector of indices consists of consecutive values:
// [n, n + 1, n + 2, ...]
// NOTE: requires a non-empty vector.
bool is_consecutive(const std::vector<std::uint32_t> &v)
{
    assert(!v.empty());

    // Look for an adjacent pair which breaks the pattern.
    // NOTE: the 'b <= a' check is there to avoid a false positive
    // due to the unsigned subtraction wrapping around (e.g.,
    // a == 2**32 - 1, b == 0 would otherwise yield a difference of 1).
    return std::adjacent_find(v.begin(), v.end(),
                              [](std::uint32_t a, std::uint32_t b) { return b <= a || b - a != 1u; })
           == v.end();
}
910 
// Functions that create the arguments generators for the functions that compute
// the Taylor derivatives in compact mode. The generators are created from vectors
// of either u var indices (taylor_c_make_arg_gen_vidx()) or floating-point constants
// (taylor_c_make_arg_gen_vc()).
//
// The returned generator, given the index of the current function call, produces
// the corresponding argument value as an llvm::Value.
std::function<llvm::Value *(llvm::Value *)> taylor_c_make_arg_gen_vidx(llvm_state &s,
                                                                       const std::vector<std::uint32_t> &ind)
{
    assert(!ind.empty());

    auto &builder = s.builder();

    // Check if all indices in ind are the same.
    if (std::all_of(ind.begin() + 1, ind.end(), [&ind](const auto &n) { return n == ind[0]; })) {
        // If all indices are the same, don't construct an array, just always return
        // the same value.
        return [num = builder.getInt32(ind[0])](llvm::Value *) -> llvm::Value * { return num; };
    }

    // If ind consists of consecutive indices, we can replace
    // the index array with a simple offset computation.
    if (is_consecutive(ind)) {
        return [&builder, start_idx = builder.getInt32(ind[0])](llvm::Value *cur_call_idx) -> llvm::Value * {
            return builder.CreateAdd(start_idx, cur_call_idx);
        };
    }

    // Check if ind consists of a repeated pattern like [a, a, a, b, b, b, c, c, c, ...],
    // that is, [a X n, b X n, c X n, ...], such that [a, b, c, ...] are consecutive numbers.
    if (ind.size() > 1u) {
        // Determine the candidate number of repetitions by counting
        // how many times the first value repeats at the start of ind.
        decltype(ind.size()) n_reps = 1;
        for (decltype(ind.size()) i = 1; i < ind.size(); ++i) {
            if (ind[i] == ind[i - 1u]) {
                ++n_reps;
            } else {
                break;
            }
        }

        if (n_reps > 1u && (ind.size() % n_reps) == 0u) {
            // There is an initial number of repetitions
            // and the vector size is a multiple of that.
            // See if the repetitions continue, and keep
            // track of the repeated indices.
            std::vector<std::uint32_t> rep_indices{ind[0]};

            // Flag to signal that the repeating pattern
            // holds across the whole of ind.
            bool rep_flag = true;

            // Iterate over the blocks of repetitions.
            for (decltype(ind.size()) rep_idx = 1; rep_idx < ind.size() / n_reps; ++rep_idx) {
                // Check that all values within the current block are identical.
                for (decltype(ind.size()) i = 1; i < n_reps; ++i) {
                    const auto cur_idx = rep_idx * n_reps + i;

                    if (ind[cur_idx] != ind[cur_idx - 1u]) {
                        rep_flag = false;
                        break;
                    }
                }

                if (rep_flag) {
                    // Record the value repeated in the current block.
                    rep_indices.push_back(ind[rep_idx * n_reps]);
                } else {
                    break;
                }
            }

            if (rep_flag && is_consecutive(rep_indices)) {
                // The pattern is  [a X n, b X n, c X n, ...] and [a, b, c, ...]
                // are consecutive numbers. The m-th value in the array can thus
                // be computed as a + floor(m / n).

#if !defined(NDEBUG)
                // Double-check the result in debug mode.
                std::vector<std::uint32_t> checker;
                for (decltype(ind.size()) i = 0; i < ind.size(); ++i) {
                    checker.push_back(boost::numeric_cast<std::uint32_t>(ind[0] + i / n_reps));
                }
                assert(checker == ind);
#endif

                return [&builder, start_idx = builder.getInt32(rep_indices[0]),
                        n_reps = builder.getInt32(boost::numeric_cast<std::uint32_t>(n_reps))](
                           llvm::Value *cur_call_idx) -> llvm::Value * {
                    return builder.CreateAdd(start_idx, builder.CreateUDiv(cur_call_idx, n_reps));
                };
            }
        }
    }

    // General case: no exploitable pattern was detected.
    auto &md = s.module();

    // Generate the array of indices as llvm constants.
    std::vector<llvm::Constant *> tmp_c_vec;
    tmp_c_vec.reserve(ind.size());
    for (const auto &val : ind) {
        tmp_c_vec.push_back(builder.getInt32(val));
    }

    // Create the array type.
    auto *arr_type = llvm::ArrayType::get(tmp_c_vec[0]->getType(), boost::numeric_cast<std::uint64_t>(ind.size()));
    assert(arr_type != nullptr);

    // Create the constant array as a global read-only variable.
    auto *const_arr = llvm::ConstantArray::get(arr_type, tmp_c_vec);
    assert(const_arr != nullptr);
    // NOTE: naked new here is fine, gvar will be registered in the module
    // object and cleaned up when the module is destroyed.
    auto *gvar
        = new llvm::GlobalVariable(md, const_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, const_arr);

    // Return the generator, which indexes into the global
    // array at the current call index.
    return [gvar, arr_type, &builder](llvm::Value *cur_call_idx) -> llvm::Value * {
        assert(llvm_depr_GEP_type_check(gvar, arr_type)); // LCOV_EXCL_LINE
        return builder.CreateLoad(builder.getInt32Ty(),
                                  builder.CreateInBoundsGEP(arr_type, gvar, {builder.getInt32(0), cur_call_idx}));
    };
}
1028 
1029 template <typename T>
taylor_c_make_arg_gen_vc(llvm_state & s,const std::vector<number> & vc)1030 std::function<llvm::Value *(llvm::Value *)> taylor_c_make_arg_gen_vc(llvm_state &s, const std::vector<number> &vc)
1031 {
1032     assert(!vc.empty());
1033 
1034     // Check if all the numbers are the same.
1035     // NOTE: the comparison operator of number will consider two numbers of different
1036     // type but equal value to be equal.
1037     if (std::all_of(vc.begin() + 1, vc.end(), [&vc](const auto &n) { return n == vc[0]; })) {
1038         // If all constants are the same, don't construct an array, just always return
1039         // the same value.
1040         return [num = codegen<T>(s, vc[0])](llvm::Value *) -> llvm::Value * { return num; };
1041     }
1042 
1043     // Generate the array of constants as llvm constants.
1044     std::vector<llvm::Constant *> tmp_c_vec;
1045     tmp_c_vec.reserve(vc.size());
1046     for (const auto &val : vc) {
1047         tmp_c_vec.push_back(llvm::cast<llvm::Constant>(codegen<T>(s, val)));
1048     }
1049 
1050     // Create the array type.
1051     auto *arr_type = llvm::ArrayType::get(tmp_c_vec[0]->getType(), boost::numeric_cast<std::uint64_t>(vc.size()));
1052     assert(arr_type != nullptr);
1053 
1054     // Create the constant array as a global read-only variable.
1055     auto *const_arr = llvm::ConstantArray::get(arr_type, tmp_c_vec);
1056     assert(const_arr != nullptr);
1057     // NOTE: naked new here is fine, gvar will be registered in the module
1058     // object and cleaned up when the module is destroyed.
1059     auto *gvar = new llvm::GlobalVariable(s.module(), const_arr->getType(), true, llvm::GlobalVariable::InternalLinkage,
1060                                           const_arr);
1061 
1062     // Return the generator.
1063     return [gvar, arr_type, &s](llvm::Value *cur_call_idx) -> llvm::Value * {
1064         auto &builder = s.builder();
1065 
1066         assert(llvm_depr_GEP_type_check(gvar, arr_type)); // LCOV_EXCL_LINE
1067         return builder.CreateLoad(arr_type->getArrayElementType(),
1068                                   builder.CreateInBoundsGEP(arr_type, gvar, {builder.getInt32(0), cur_call_idx}));
1069     };
1070 }
1071 
1072 // Comparision operator for LLVM functions based on their names.
1073 struct llvm_func_name_compare {
operator ()heyoka::detail::__anon89701e160111::llvm_func_name_compare1074     bool operator()(const llvm::Function *f0, const llvm::Function *f1) const
1075     {
1076         return f0->getName() < f1->getName();
1077     }
1078 };
1079 
// For each segment in s_dc, this function will return a dict mapping an LLVM function
// f for the computation of a Taylor derivative to a size and a vector of std::functions. For example, one entry
// in the return value will read something like:
// {f : (2, [g_0, g_1, g_2])}
// The meaning in this example is that the arity of f is 3 and it will be called with 2 different
// sets of arguments. The g_i functions are expected to be called with input argument j in [0, 1]
// to yield the value of the i-th function argument for f at the j-th invocation.
template <typename T>
auto taylor_build_function_maps(llvm_state &s, const std::vector<taylor_dc_t> &s_dc, std::uint32_t n_eq,
                                std::uint32_t n_uvars, std::uint32_t batch_size, bool high_accuracy)
{
    // Log runtime in trace mode.
    spdlog::stopwatch sw;

    // Init the return value.
    // NOTE: use maps with name-based comparison for the functions. This ensures that the order in which these
    // functions are invoked in taylor_compute_jet_compact_mode() is always the same. If we used directly pointer
    // comparisons instead, the order could vary across different executions and different platforms. The name
    // mangling we do when creating the function names should ensure that there are no possible name collisions.
    std::vector<
        std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
                 llvm_func_name_compare>>
        retval;

    // Variable to keep track of the u variable
    // on whose definition we are operating.
    // NOTE: the u variables [0, n_eq) are the state variables,
    // hence the definitions in the segments start at index n_eq.
    auto cur_u_idx = n_eq;
    for (const auto &seg : s_dc) {
        // This structure maps an LLVM function to sets of arguments
        // with which the function is to be called. For instance, if function
        // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
        // will contain {f : [[a, b, c], [d, e, f]]}.
        // After construction, we have verified that for each function
        // in the map the sets of arguments have all the same size.
        std::unordered_map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;

        for (const auto &ex : seg) {
            // Get the function for the computation of the derivative.
            auto func = taylor_c_diff_func<T>(s, ex.first, n_uvars, batch_size, high_accuracy);

            // Insert the function into tmp_map.
            const auto [it, is_new_func] = tmp_map.try_emplace(func);

            assert(is_new_func || !it->second.empty());

            // Convert the variables/constants in the current dc
            // element into a set of indices/constants.
            const auto cdiff_args = taylor_udef_to_variants(ex.first, ex.second);

            // NOTE: the '- 1u' accounts for the u variable index which is
            // prepended to every argument set below.
            if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) {
                throw std::invalid_argument(
                    "Inconsistent arity detected in a Taylor derivative function in compact "
                    "mode: the same function is being called with both {} and {} arguments"_format(
                        it->second.back().size() - 1u, cdiff_args.size()));
            }

            // Add the new set of arguments.
            it->second.emplace_back();
            // Add the idx of the u variable.
            it->second.back().emplace_back(cur_u_idx);
            // Add the actual function arguments.
            it->second.back().insert(it->second.back().end(), cdiff_args.begin(), cdiff_args.end());

            ++cur_u_idx;
        }

        // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
        // to {f : [[a, d], [b, e], [c, f]]}.
        std::unordered_map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
            tmp_map_transpose;
        for (const auto &[func, vv] : tmp_map) {
            assert(!vv.empty());

            // Add the function.
            const auto [it, ins_status] = tmp_map_transpose.try_emplace(func);
            assert(ins_status);

            const auto n_calls = vv.size();
            const auto n_args = vv[0].size();
            // NOTE: n_args must be at least 1 because the u idx
            // is prepended to the actual function arguments in
            // the tmp_map entries.
            assert(n_args >= 1u);

            for (decltype(vv[0].size()) i = 0; i < n_args; ++i) {
                // Build the vector of values corresponding
                // to the current argument index.
                std::vector<std::variant<std::uint32_t, number>> tmp_c_vec;
                for (decltype(vv.size()) j = 0; j < n_calls; ++j) {
                    tmp_c_vec.push_back(vv[j][i]);
                }

                // Turn tmp_c_vec (a vector of variants) into a variant
                // of vectors, and insert the result.
                it->second.push_back(taylor_c_vv_transpose(tmp_c_vec));
            }
        }

        // Add a new entry in retval for the current segment.
        retval.emplace_back();
        auto &a_map = retval.back();

        for (const auto &[func, vv] : tmp_map_transpose) {
            assert(!vv.empty());

            // Add the function.
            const auto [it, ins_status] = a_map.try_emplace(func);
            assert(ins_status);

            // Set the number of calls for this function.
            it->second.first
                = std::visit([](const auto &x) { return boost::numeric_cast<std::uint32_t>(x.size()); }, vv[0]);
            assert(it->second.first > 0u);

            // Create the g functions for each argument.
            // NOTE: u var indices are turned into index generators,
            // numbers into constant generators.
            for (const auto &v : vv) {
                it->second.second.push_back(std::visit(
                    [&s](const auto &x) {
                        using type = detail::uncvref_t<decltype(x)>;

                        if constexpr (std::is_same_v<type, std::vector<std::uint32_t>>) {
                            return taylor_c_make_arg_gen_vidx(s, x);
                        } else {
                            return taylor_c_make_arg_gen_vc<T>(s, x);
                        }
                    },
                    v));
            }
        }
    }

    get_logger()->trace("Taylor build function maps runtime: {}", sw);

    return retval;
}
1215 
1216 // Helper for the computation of a jet of derivatives in compact mode,
1217 // used in taylor_compute_jet() below.
1218 template <typename T>
taylor_compute_jet_compact_mode(llvm_state & s,llvm::Value * order0,llvm::Value * par_ptr,llvm::Value * time_ptr,const taylor_dc_t & dc,const std::vector<std::uint32_t> & sv_funcs_dc,std::uint32_t n_eq,std::uint32_t n_uvars,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy)1219 llvm::Value *taylor_compute_jet_compact_mode(llvm_state &s, llvm::Value *order0, llvm::Value *par_ptr,
1220                                              llvm::Value *time_ptr, const taylor_dc_t &dc,
1221                                              const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq,
1222                                              std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
1223                                              bool high_accuracy)
1224 {
1225     auto &builder = s.builder();
1226 
1227     // Split dc into segments.
1228     const auto s_dc = taylor_segment_dc(dc, n_eq);
1229 
1230     // Generate the function maps.
1231     const auto f_maps = taylor_build_function_maps<T>(s, s_dc, n_eq, n_uvars, batch_size, high_accuracy);
1232 
1233     // Log the runtime of IR construction in trace mode.
1234     spdlog::stopwatch sw;
1235 
1236     // Generate the global arrays for the computation of the derivatives
1237     // of the state variables.
1238     const auto svd_gl = taylor_c_make_sv_diff_globals<T>(s, dc, n_uvars);
1239 
1240     // Determine the maximum u variable index appearing in sv_funcs_dc, or zero
1241     // if sv_funcs_dc is empty.
1242     const auto max_svf_idx
1243         = sv_funcs_dc.empty() ? std::uint32_t(0) : *std::max_element(sv_funcs_dc.begin(), sv_funcs_dc.end());
1244 
1245     // Prepare the array that will contain the jet of derivatives.
1246     // We will be storing all the derivatives of the u variables
1247     // up to order 'order - 1', the derivatives of order
1248     // 'order' of the state variables and the derivatives
1249     // of order 'order' of the sv_funcs.
1250     // NOTE: the array size is specified as a 64-bit integer in the
1251     // LLVM API.
1252     // NOTE: fp_type is the original, scalar floating-point type.
1253     // It will be turned into a vector type (if necessary) by
1254     // make_vector_type() below.
1255     // NOTE: if sv_funcs_dc is empty, or if all its indices are not greater
1256     // than the indices of the state variables, then we don't need additional
1257     // slots after the sv derivatives. If we need additional slots, allocate
1258     // another full column of derivatives, as it is complicated at this stage
1259     // to know exactly how many slots we will need.
1260     auto *fp_type = llvm::cast<llvm::PointerType>(order0->getType())->getElementType();
1261     auto *fp_vec_type = make_vector_type(fp_type, batch_size);
1262     auto *array_type
1263         = llvm::ArrayType::get(fp_vec_type, (max_svf_idx < n_eq) ? (n_uvars * order + n_eq) : (n_uvars * (order + 1u)));
1264 
1265     // Make the global array and fetch a pointer to its first element.
1266     // NOTE: we use a global array rather than a local one here because
1267     // its size can grow quite large, which can lead to stack overflow issues.
1268     // This has of course consequences in terms of thread safety, which
1269     // we will have to document.
1270     auto diff_arr_gvar = make_global_zero_array(s.module(), array_type);
1271     assert(llvm_depr_GEP_type_check(diff_arr_gvar, array_type)); // LCOV_EXCL_LINE
1272     auto *diff_arr = builder.CreateInBoundsGEP(array_type, diff_arr_gvar, {builder.getInt32(0), builder.getInt32(0)});
1273 
1274     // Copy over the order-0 derivatives of the state variables.
1275     // NOTE: overflow checking is already done in the parent function.
1276     llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
1277         // Fetch the pointer from order0.
1278         assert(llvm_depr_GEP_type_check(order0, fp_type)); // LCOV_EXCL_LINE
1279         auto *ptr
1280             = builder.CreateInBoundsGEP(fp_type, order0, builder.CreateMul(cur_var_idx, builder.getInt32(batch_size)));
1281 
1282         // Load as a vector.
1283         auto *vec = load_vector_from_memory(builder, ptr, batch_size);
1284 
1285         // Store into diff_arr.
1286         assert(llvm_depr_GEP_type_check(diff_arr, fp_vec_type)); // LCOV_EXCL_LINE
1287         builder.CreateStore(vec, builder.CreateInBoundsGEP(fp_vec_type, diff_arr, cur_var_idx));
1288     });
1289 
1290     // Helper to compute and store the derivatives of order cur_order
1291     // of the u variables which are not state variables.
1292     auto compute_u_diffs = [&](llvm::Value *cur_order) {
1293         for (const auto &map : f_maps) {
1294             for (const auto &p : map) {
1295                 // The LLVM function for the computation of the
1296                 // derivative in compact mode.
1297                 const auto &func = p.first;
1298 
1299                 // The number of func calls.
1300                 const auto ncalls = p.second.first;
1301 
1302                 // The generators for the arguments of func.
1303                 const auto &gens = p.second.second;
1304 
1305                 assert(ncalls > 0u);
1306                 assert(!gens.empty());
1307                 assert(std::all_of(gens.begin(), gens.end(), [](const auto &f) { return static_cast<bool>(f); }));
1308 
1309                 // Loop over the number of calls.
1310                 llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
1311                     // Create the u variable index from the first generator.
1312                     auto u_idx = gens[0](cur_call_idx);
1313 
1314                     // Initialise the vector of arguments with which func must be called. The following
1315                     // initial arguments are always present:
1316                     // - current Taylor order,
1317                     // - u index of the variable,
1318                     // - array of derivatives,
1319                     // - pointer to the param values,
1320                     // - pointer to the time value(s).
1321                     std::vector<llvm::Value *> args{cur_order, u_idx, diff_arr, par_ptr, time_ptr};
1322 
1323                     // Create the other arguments via the generators.
1324                     for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
1325                         args.push_back(gens[i](cur_call_idx));
1326                     }
1327 
1328                     // Calculate the derivative and store the result.
1329                     taylor_c_store_diff(s, diff_arr, n_uvars, cur_order, u_idx, builder.CreateCall(func, args));
1330                 });
1331             }
1332         }
1333     };
1334 
1335     // Compute the order-0 derivatives (i.e., the initial values)
1336     // for all u variables which are not state variables.
1337     compute_u_diffs(builder.getInt32(0));
1338 
1339     // Compute all derivatives up to order 'order - 1'.
1340     llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(order), [&](llvm::Value *cur_order) {
1341         // State variables first.
1342         taylor_c_compute_sv_diffs(s, svd_gl, diff_arr, par_ptr, n_uvars, cur_order, batch_size);
1343 
1344         // The other u variables.
1345         compute_u_diffs(cur_order);
1346     });
1347 
1348     // Compute the last-order derivatives for the state variables.
1349     taylor_c_compute_sv_diffs(s, svd_gl, diff_arr, par_ptr, n_uvars, builder.getInt32(order), batch_size);
1350 
1351     // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs
1352     // correspond to u variables in the decomposition, we will have to compute the
1353     // last-order derivatives of the u variables until we are sure all sv_funcs derivatives
1354     // have been properly computed.
1355 
1356     // Monitor the starting index of the current
1357     // segment while iterating on f_maps.
1358     auto cur_start_u_idx = n_eq;
1359 
1360     // NOTE: this is a slight repetition of compute_u_diffs() with minor modifications.
1361     for (const auto &map : f_maps) {
1362         if (cur_start_u_idx > max_svf_idx) {
1363             // We computed all the necessary derivatives, break out.
1364             // NOTE: if we did not have sv_funcs to begin with,
1365             // max_svf_idx is zero and we exit at the first iteration
1366             // without doing anything. If all sv funcs alias state variables,
1367             // then max_svf_idx < n_eq and thus we also exit immediately
1368             // at the first iteration.
1369             break;
1370         }
1371 
1372         for (const auto &p : map) {
1373             const auto &func = p.first;
1374             const auto ncalls = p.second.first;
1375             const auto &gens = p.second.second;
1376 
1377             assert(ncalls > 0u);
1378             assert(!gens.empty());
1379             assert(std::all_of(gens.begin(), gens.end(), [](const auto &f) { return static_cast<bool>(f); }));
1380 
1381             llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
1382                 auto u_idx = gens[0](cur_call_idx);
1383 
1384                 std::vector<llvm::Value *> args{builder.getInt32(order), u_idx, diff_arr, par_ptr, time_ptr};
1385 
1386                 for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
1387                     args.push_back(gens[i](cur_call_idx));
1388                 }
1389 
1390                 taylor_c_store_diff(s, diff_arr, n_uvars, builder.getInt32(order), u_idx,
1391                                     builder.CreateCall(func, args));
1392             });
1393 
1394             // Update cur_start_u_idx taking advantage of the fact
1395             // that each block in a segment processes the derivatives
1396             // of exactly ncalls u variables.
1397             cur_start_u_idx += ncalls;
1398         }
1399     }
1400 
1401     get_logger()->trace("Taylor IR creation compact mode runtime: {}", sw);
1402 
1403     // Return the array of derivatives of the u variables.
1404     return diff_arr;
1405 }
1406 
1407 // Given an input pointer 'in', load the first n * batch_size values in it as n vectors
1408 // with size batch_size. If batch_size is 1, the values will be loaded as scalars.
taylor_load_values(llvm_state & s,llvm::Value * in,std::uint32_t n,std::uint32_t batch_size)1409 auto taylor_load_values(llvm_state &s, llvm::Value *in, std::uint32_t n, std::uint32_t batch_size)
1410 {
1411     assert(batch_size > 0u);
1412 
1413     auto &builder = s.builder();
1414 
1415     std::vector<llvm::Value *> retval;
1416     for (std::uint32_t i = 0; i < n; ++i) {
1417         // Fetch the pointer from in.
1418         // NOTE: overflow checking is done in the parent function.
1419         assert(llvm_depr_GEP_type_check(in, pointee_type(in))); // LCOV_EXCL_LINE
1420         auto *ptr = builder.CreateInBoundsGEP(pointee_type(in), in, builder.getInt32(i * batch_size));
1421 
1422         // Load the value in vector mode.
1423         retval.push_back(load_vector_from_memory(builder, ptr, batch_size));
1424     }
1425 
1426     return retval;
1427 }
1428 
// Small helper to deduce the number of parameters
// present in the Taylor decomposition of an ODE system.
// NOTE: this will also include the functions of state variables,
// as they are part of the decomposition.
// NOTE: the first few entries in the decomposition are the mapping
// u variables -> state variables. These never contain any param
// by construction.
template <typename T>
std::uint32_t n_pars_in_dc(const T &dc)
{
    // Scan the whole decomposition, tracking the largest
    // param size encountered so far.
    std::uint32_t n_params = 0;

    for (const auto &entry : dc) {
        const auto cur_size = get_param_size(entry.first);

        if (cur_size > n_params) {
            n_params = cur_size;
        }
    }

    return n_params;
}
1447 
// Helper function to compute the jet of Taylor derivatives up to a given order. n_eq
// is the number of equations/variables in the ODE sys, dc its Taylor decomposition,
// n_uvars the total number of u variables in the decomposition.
// order is the max derivative order desired, batch_size the batch size.
// order0 is a pointer to an array of (at least) n_eq * batch_size scalar elements
// containing the derivatives of order 0. par_ptr is a pointer to an array containing
// the numerical values of the parameters, time_ptr a pointer to the time value(s).
// sv_funcs_dc contains the indices, in the decomposition, of the functions of state
// variables.
//
// The return value is a variant containing either:
// - in compact mode, the array containing the derivatives of all u variables,
// - otherwise, the jet of derivatives of the state variables and sv_funcs
//   up to order 'order'.
//
// An std::overflow_error is thrown if the total number of derivatives (or the
// indexing into order0/par_ptr) cannot be represented as a 32-bit unsigned integer.
template <typename T>
std::variant<llvm::Value *, std::vector<llvm::Value *>>
taylor_compute_jet(llvm_state &s, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
                   const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq,
                   std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool compact_mode,
                   bool high_accuracy)
{
    assert(batch_size > 0u);
    assert(n_eq > 0u);
    assert(order > 0u);

    // Make sure we can represent n_uvars * (order + 1) as a 32-bit
    // unsigned integer. This is the maximum total number of derivatives we will have to compute
    // and store, with the +1 taking into account the extra slots that might be needed by sv_funcs_dc.
    // If sv_funcs_dc is empty, we need only n_uvars * order + n_eq derivatives.
    // LCOV_EXCL_START
    if (order == std::numeric_limits<std::uint32_t>::max()
        || n_uvars > std::numeric_limits<std::uint32_t>::max() / (order + 1u)) {
        throw std::overflow_error(
            "An overflow condition was detected in the computation of a jet of Taylor derivatives");
    }

    // We also need to be able to index up to n_eq * batch_size in order0.
    if (n_eq > std::numeric_limits<std::uint32_t>::max() / batch_size) {
        throw std::overflow_error(
            "An overflow condition was detected in the computation of a jet of Taylor derivatives");
    }
    // LCOV_EXCL_STOP

    if (compact_mode) {
        // In compact mode, let's ensure that we can index into par_ptr using std::uint32_t.
        // NOTE: in default mode the check is done inside taylor_codegen_numparam_par().

        // Deduce the size of the param array from the expressions in the decomposition.
        const auto param_size = n_pars_in_dc(dc);
        // LCOV_EXCL_START
        if (param_size > std::numeric_limits<std::uint32_t>::max() / batch_size) {
            throw std::overflow_error(
                "An overflow condition was detected in the computation of a jet of Taylor derivatives in compact mode");
        }
        // LCOV_EXCL_STOP

        // Delegate to the compact-mode implementation, which returns the
        // global array of derivatives of all u variables (the first variant
        // alternative).
        return taylor_compute_jet_compact_mode<T>(s, order0, par_ptr, time_ptr, dc, sv_funcs_dc, n_eq, n_uvars, order,
                                                  batch_size, high_accuracy);
    } else {
        // Log the runtime of IR construction in trace mode.
        spdlog::stopwatch sw;

        // Init the derivatives array with the order 0 of the state variables.
        auto diff_arr = taylor_load_values(s, order0, n_eq, batch_size);

        // Compute the order-0 derivatives of the other u variables.
        for (auto i = n_eq; i < n_uvars; ++i) {
            diff_arr.push_back(taylor_diff<T>(s, dc[i].first, dc[i].second, diff_arr, par_ptr, time_ptr, n_uvars, 0, i,
                                              batch_size, high_accuracy));
        }

        // Compute the derivatives order by order, starting from 1 to order excluded.
        // We will compute the highest derivatives of the state variables separately
        // in the last step.
        for (std::uint32_t cur_order = 1; cur_order < order; ++cur_order) {
            // Begin with the state variables.
            // NOTE: the derivatives of the state variables
            // are at the end of the decomposition vector.
            for (auto i = n_uvars; i < boost::numeric_cast<std::uint32_t>(dc.size()); ++i) {
                diff_arr.push_back(
                    taylor_compute_sv_diff<T>(s, dc[i].first, diff_arr, par_ptr, n_uvars, cur_order, batch_size));
            }

            // Now the other u variables.
            for (auto i = n_eq; i < n_uvars; ++i) {
                diff_arr.push_back(taylor_diff<T>(s, dc[i].first, dc[i].second, diff_arr, par_ptr, time_ptr, n_uvars,
                                                  cur_order, i, batch_size, high_accuracy));
            }
        }

        // Compute the last-order derivatives for the state variables.
        for (auto i = n_uvars; i < boost::numeric_cast<std::uint32_t>(dc.size()); ++i) {
            diff_arr.push_back(
                taylor_compute_sv_diff<T>(s, dc[i].first, diff_arr, par_ptr, n_uvars, order, batch_size));
        }

        // If there are sv funcs, we need to compute their last-order derivatives too:
        // we will need to compute the derivatives of the u variables up to
        // the maximum index in sv_funcs_dc.
        const auto max_svf_idx
            = sv_funcs_dc.empty() ? std::uint32_t(0) : *std::max_element(sv_funcs_dc.begin(), sv_funcs_dc.end());

        // NOTE: if there are no sv_funcs, max_svf_idx is set to zero
        // above, thus we never enter the loop.
        // NOTE: <= because max_svf_idx is an index, not a size.
        for (std::uint32_t i = n_eq; i <= max_svf_idx; ++i) {
            diff_arr.push_back(taylor_diff<T>(s, dc[i].first, dc[i].second, diff_arr, par_ptr, time_ptr, n_uvars, order,
                                              i, batch_size, high_accuracy));
        }

#if !defined(NDEBUG)
        if (sv_funcs_dc.empty()) {
            assert(diff_arr.size() == static_cast<decltype(diff_arr.size())>(n_uvars) * order + n_eq);
        } else {
            // NOTE: we use std::max<std::uint32_t>(n_eq, max_svf_idx + 1u) here because
            // the sv funcs could all be aliases of the state variables themselves,
            // in which case in the previous loop we ended up appending nothing.
            assert(diff_arr.size()
                   == static_cast<decltype(diff_arr.size())>(n_uvars) * order
                          + std::max<std::uint32_t>(n_eq, max_svf_idx + 1u));
        }
#endif

        // Extract the derivatives of the state variables and sv_funcs from diff_arr.
        // The ordering of the return value is: for each order o (from 0 to 'order'
        // inclusive), first the n_eq state variables, then the sv_funcs.
        std::vector<llvm::Value *> retval;
        for (std::uint32_t o = 0; o <= order; ++o) {
            for (std::uint32_t var_idx = 0; var_idx < n_eq; ++var_idx) {
                retval.push_back(taylor_fetch_diff(diff_arr, var_idx, o, n_uvars));
            }
            for (auto idx : sv_funcs_dc) {
                retval.push_back(taylor_fetch_diff(diff_arr, idx, o, n_uvars));
            }
        }

        get_logger()->trace("Taylor IR creation default mode runtime: {}", sw);

        return retval;
    }
}
1587 
// Helper to generate the LLVM code to store the Taylor coefficients of the state variables and
// the sv funcs into an external array. The Taylor polynomials are stored in row-major order,
// first the state variables and after the sv funcs. For use in the adaptive timestepper implementations.
//
// diff_variant contains either the array of derivatives of all u variables (compact mode,
// first variant alternative) or the jet of derivatives of the state variables and sv funcs
// (non-compact mode). svf_ptr is a pointer to the global array of sv func indices (used only
// in compact mode, null otherwise). tc_ptr is the output pointer: for each of the n_eq + n_sv_funcs
// variables, (order + 1) * batch_size coefficients are written contiguously.
void taylor_write_tc(llvm_state &s, const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_variant,
                     const std::vector<std::uint32_t> &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *tc_ptr,
                     std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size)
{
    assert(batch_size != 0u);
#if !defined(NDEBUG)
    if (diff_variant.index() == 0u) {
        // Compact mode.
        assert(sv_funcs_dc.empty() == !svf_ptr);
    } else {
        // Non-compact mode.
        assert(svf_ptr == nullptr);
    }
#endif

    auto &builder = s.builder();

    // Convert to std::uint32_t for overflow checking and use below.
    const auto n_sv_funcs = boost::numeric_cast<std::uint32_t>(sv_funcs_dc.size());

    // Overflow checking: ensure we can index into
    // tc_ptr using std::uint32_t.
    // NOTE: this is the same check done in taylor_add_jet_impl().
    // LCOV_EXCL_START
    if (order == std::numeric_limits<std::uint32_t>::max()
        || (order + 1u) > std::numeric_limits<std::uint32_t>::max() / batch_size
        || n_eq > std::numeric_limits<std::uint32_t>::max() - n_sv_funcs
        || n_eq + n_sv_funcs > std::numeric_limits<std::uint32_t>::max() / ((order + 1u) * batch_size)) {
        throw std::overflow_error("An overflow condition was detected while generating the code for writing the Taylor "
                                  "polynomials of an ODE system into the output array");
    }
    // LCOV_EXCL_STOP

    if (diff_variant.index() == 0u) {
        // Compact mode.

        auto *diff_arr = std::get<llvm::Value *>(diff_variant);

        // Write out the Taylor coefficients for the state variables.
        // The loops are emitted as LLVM IR loops (rather than unrolled at
        // codegen time) to keep the generated code size small.
        llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var) {
            llvm_loop_u32(s, builder.getInt32(0), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
                          [&](llvm::Value *cur_order) {
                              // Load the value of the derivative from diff_arr.
                              auto *diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_var);

                              // Compute the index in the output pointer:
                              // (order + 1) * batch_size * cur_var + cur_order * batch_size.
                              auto *out_idx = builder.CreateAdd(
                                  builder.CreateMul(builder.getInt32((order + 1u) * batch_size), cur_var),
                                  builder.CreateMul(cur_order, builder.getInt32(batch_size)));

                              // Store into tc_ptr.
                              // LCOV_EXCL_START
                              assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr)));
                              // LCOV_EXCL_STOP
                              store_vector_to_memory(
                                  builder, builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr, out_idx), diff_val);
                          });
        });

        // Write out the Taylor coefficients for the sv funcs, if necessary.
        if (svf_ptr != nullptr) {
            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_sv_funcs), [&](llvm::Value *arr_idx) {
                // Fetch the u var index from svf_ptr.
                assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
                auto *cur_idx = builder.CreateLoad(builder.getInt32Ty(),
                                                   builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));

                llvm_loop_u32(
                    s, builder.getInt32(0), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
                    [&](llvm::Value *cur_order) {
                        // Load the derivative value from diff_arr.
                        auto *diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_idx);

                        // Compute the index in the output pointer. The sv funcs
                        // are written after the n_eq state variables, hence the
                        // n_eq + arr_idx offset.
                        auto *out_idx
                            = builder.CreateAdd(builder.CreateMul(builder.getInt32((order + 1u) * batch_size),
                                                                  builder.CreateAdd(builder.getInt32(n_eq), arr_idx)),
                                                builder.CreateMul(cur_order, builder.getInt32(batch_size)));

                        // Store into tc_ptr.
                        // LCOV_EXCL_START
                        assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr)));
                        // LCOV_EXCL_STOP
                        store_vector_to_memory(
                            builder, builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr, out_idx), diff_val);
                    });
            });
        }
    } else {
        // Non-compact mode.

        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);

        // Write out the Taylor coefficients for the state variables.
        for (std::uint32_t j = 0; j < n_eq; ++j) {
            for (decltype(diff_arr.size()) cur_order = 0; cur_order <= order; ++cur_order) {
                // Index in the jet of derivatives.
                // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
                // state variables and sv_variable (not all u vars), hence the indexing
                // is cur_order * (n_eq + n_sv_funcs) + j.
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + j;
                assert(arr_idx < diff_arr.size());
                auto *const val = diff_arr[arr_idx];

                // Index in tc_ptr.
                const auto out_idx = (order + 1u) * batch_size * j + cur_order * batch_size;

                // Write to tc_ptr.
                assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr))); // LCOV_EXCL_LINE
                auto *out_ptr = builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr,
                                                          builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }
        }

        // Write out the Taylor coefficients for the sv funcs, after the state variables.
        for (std::uint32_t j = 0; j < n_sv_funcs; ++j) {
            for (decltype(diff_arr.size()) cur_order = 0; cur_order <= order; ++cur_order) {
                // Index in the jet of derivatives.
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + n_eq + j;
                assert(arr_idx < diff_arr.size());
                auto *const val = diff_arr[arr_idx];

                // Index in tc_ptr.
                const auto out_idx = (order + 1u) * batch_size * (n_eq + j) + cur_order * batch_size;

                // Write to tc_ptr.
                assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr))); // LCOV_EXCL_LINE
                auto *out_ptr = builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr,
                                                          builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }
        }
    }
}
1724 
// Add to s an adaptive timestepper function with support for events. This timestepper will *not*
// propagate the state of the system. Instead, its output will be the jet of derivatives
// of all state variables and event equations, and the deduced timestep value(s).
//
// name is the name of the LLVM function to be created, sys the ODE system, tol the
// integration tolerance (from which the Taylor order is deduced), evs the event equations.
// The unnamed bool parameter is unused in this overload.
//
// Returns a tuple containing the Taylor decomposition of sys and the deduced Taylor order.
template <typename T, typename U>
auto taylor_add_adaptive_step_with_events(llvm_state &s, const std::string &name, const U &sys, T tol,
                                          std::uint32_t batch_size, bool, bool compact_mode,
                                          const std::vector<expression> &evs, bool high_accuracy)
{
    using std::isfinite;

    assert(!s.is_compiled());
    assert(batch_size != 0u);
    assert(isfinite(tol) && tol > 0);

    // Determine the order from the tolerance.
    const auto order = taylor_order_from_tol(tol);

    // Record the number of equations/variables.
    const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());

    // Decompose the system of equations. ev_dc will contain the indices,
    // in the decomposition, of the event equations.
    auto [dc, ev_dc] = taylor_decompose(sys, evs);

    // Compute the number of u variables.
    assert(dc.size() > n_eq);
    const auto n_uvars = boost::numeric_cast<std::uint32_t>(dc.size() - n_eq);

    auto &builder = s.builder();
    auto &context = s.context();

    // Prepare the function prototype. The arguments are:
    // - pointer to the output jet of derivative (write only),
    // - pointer to the current state vector (read only),
    // - pointer to the parameters (read only),
    // - pointer to the time value(s) (read only),
    // - pointer to the array of max timesteps (read & write),
    // - pointer to the max_abs_state output variable (write only).
    // These pointers cannot overlap.
    std::vector<llvm::Type *> fargs(6, llvm::PointerType::getUnqual(to_llvm_type<T>(context)));
    // The function does not return anything.
    auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
    assert(ft != nullptr);
    // Now create the function.
    auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
    // LCOV_EXCL_START
    if (f == nullptr) {
        throw std::invalid_argument(
            "Unable to create a function for an adaptive Taylor stepper with name '{}'"_format(name));
    }
    // LCOV_EXCL_STOP

    // Set the names/attributes of the function arguments.
    // The NoCapture/NoAlias attributes encode the no-overlap
    // guarantee documented above for the optimiser.
    auto *jet_ptr = f->args().begin();
    jet_ptr->setName("jet_ptr");
    jet_ptr->addAttr(llvm::Attribute::NoCapture);
    jet_ptr->addAttr(llvm::Attribute::NoAlias);
    jet_ptr->addAttr(llvm::Attribute::WriteOnly);

    auto *state_ptr = jet_ptr + 1;
    state_ptr->setName("state_ptr");
    state_ptr->addAttr(llvm::Attribute::NoCapture);
    state_ptr->addAttr(llvm::Attribute::NoAlias);
    state_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto *par_ptr = state_ptr + 1;
    par_ptr->setName("par_ptr");
    par_ptr->addAttr(llvm::Attribute::NoCapture);
    par_ptr->addAttr(llvm::Attribute::NoAlias);
    par_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto *time_ptr = par_ptr + 1;
    time_ptr->setName("time_ptr");
    time_ptr->addAttr(llvm::Attribute::NoCapture);
    time_ptr->addAttr(llvm::Attribute::NoAlias);
    time_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto *h_ptr = time_ptr + 1;
    h_ptr->setName("h_ptr");
    h_ptr->addAttr(llvm::Attribute::NoCapture);
    h_ptr->addAttr(llvm::Attribute::NoAlias);

    auto *max_abs_state_ptr = h_ptr + 1;
    max_abs_state_ptr->setName("max_abs_state_ptr");
    max_abs_state_ptr->addAttr(llvm::Attribute::NoCapture);
    max_abs_state_ptr->addAttr(llvm::Attribute::NoAlias);
    max_abs_state_ptr->addAttr(llvm::Attribute::WriteOnly);

    // Create a new basic block to start insertion into.
    auto *bb = llvm::BasicBlock::Create(context, "entry", f);
    assert(bb != nullptr); // LCOV_EXCL_LINE
    builder.SetInsertPoint(bb);

    // Create a global read-only array containing the values in ev_dc, if there
    // are any and we are in compact mode (otherwise, svf_ptr will be null).
    auto *svf_ptr = compact_mode ? taylor_c_make_sv_funcs_arr(s, ev_dc) : nullptr;

    // Compute the jet of derivatives at the given order.
    auto diff_variant = taylor_compute_jet<T>(s, state_ptr, par_ptr, time_ptr, dc, ev_dc, n_eq, n_uvars, order,
                                              batch_size, compact_mode, high_accuracy);

    // Determine the integration timestep.
    auto h = taylor_determine_h<T>(s, diff_variant, ev_dc, svf_ptr, h_ptr, n_eq, n_uvars, order, batch_size,
                                   max_abs_state_ptr);

    // Store h to memory.
    store_vector_to_memory(builder, h_ptr, h);

    // Copy the jet of derivatives to jet_ptr.
    taylor_write_tc(s, diff_variant, ev_dc, svf_ptr, jet_ptr, n_eq, n_uvars, order, batch_size);

    // Create the return value.
    builder.CreateRetVoid();

    // Verify the function.
    s.verify_function(f);

    // Run the optimisation pass.
    // NOTE: this does nothing currently, as the optimisation
    // level is set to zero from the outside.
    s.optimise();

    return std::tuple{std::move(dc), order};
}
1848 
1849 // Run the Horner scheme to propagate an ODE state via the evaluation of the Taylor polynomials.
1850 // diff_var contains either the derivatives for all u variables (in compact mode) or only
1851 // for the state variables (non-compact mode). The evaluation point (i.e., the timestep)
1852 // is h. The evaluation is run in parallel over the polynomials of all the state
1853 // variables.
// NOTE: the unnamed std::uint32_t parameter is unused in this implementation
// (the vector type is deduced from diff_var instead).
std::variant<llvm::Value *, std::vector<llvm::Value *>>
taylor_run_multihorner(llvm_state &s, const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_var,
                       llvm::Value *h, std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t,
                       bool compact_mode)
{
    auto &builder = s.builder();

    if (compact_mode) {
        // Compact mode: the derivatives for all u variables live in a single
        // LLVM array, accessed via taylor_c_load_diff().
        auto *diff_arr = std::get<llvm::Value *>(diff_var);

        // Create the array storing the results of the evaluation.
        auto *fp_vec_t = pointee_type(diff_arr);
        auto *array_type = llvm::ArrayType::get(fp_vec_t, n_eq);
        auto *array_inst = builder.CreateAlloca(array_type);
        assert(llvm_depr_GEP_type_check(array_inst, array_type)); // LCOV_EXCL_LINE
        // Decay the array to a pointer to its first element.
        auto *res_arr = builder.CreateInBoundsGEP(array_type, array_inst, {builder.getInt32(0), builder.getInt32(0)});

        // Init the return value, filling it with the values of the
        // coefficients of the highest-degree monomial in each polynomial.
        llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
            // Load the value from diff_arr and store it in res_arr.
            assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
            builder.CreateStore(taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), cur_var_idx),
                                builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx));
        });

        // Run the evaluation: for each cur_order in [1, order], apply the
        // Horner recurrence res = cf(order - cur_order) + res * h.
        llvm_loop_u32(
            s, builder.getInt32(1), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
            [&](llvm::Value *cur_order) {
                llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
                    // Load the current poly coeff from diff_arr.
                    // NOTE: we are loading the coefficients backwards wrt the order, hence
                    // we specify order - cur_order.
                    auto *cf = taylor_c_load_diff(s, diff_arr, n_uvars,
                                                  builder.CreateSub(builder.getInt32(order), cur_order), cur_var_idx);

                    // Accumulate in res_arr.
                    assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
                    auto *res_ptr = builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx);
                    builder.CreateStore(
                        builder.CreateFAdd(cf, builder.CreateFMul(builder.CreateLoad(fp_vec_t, res_ptr), h)), res_ptr);
                });
            });

        // NOTE: the returned pointer refers to function-local (alloca) storage.
        return res_arr;
    } else {
        // Non-compact mode: one llvm::Value per derivative, laid out with the
        // order as the slowest-varying index (index = cur_order * n_eq + var_idx).
        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_var);

        // Init the return value, filling it with the values of the
        // coefficients of the highest-degree monomial in each polynomial.
        std::vector<llvm::Value *> res_arr;
        for (std::uint32_t i = 0; i < n_eq; ++i) {
            res_arr.push_back(diff_arr[(n_eq * order) + i]);
        }

        // Run the Horner scheme simultaneously for all polynomials.
        for (std::uint32_t i = 1; i <= order; ++i) {
            for (std::uint32_t j = 0; j < n_eq; ++j) {
                res_arr[j] = builder.CreateFAdd(diff_arr[(order - i) * n_eq + j], builder.CreateFMul(res_arr[j], h));
            }
        }

        return res_arr;
    }
}
1922 
1923 // Same as taylor_run_multihorner(), but instead of the Horner scheme this implementation uses
1924 // a compensated summation over the naive evaluation of monomials.
// NOTE: the unnamed bool parameter is unused in this implementation.
std::variant<llvm::Value *, std::vector<llvm::Value *>>
taylor_run_ceval(llvm_state &s, const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_var, llvm::Value *h,
                 std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, bool, bool compact_mode)
{
    auto &builder = s.builder();

    if (compact_mode) {
        // Compact mode: the derivatives for all u variables live in a single
        // LLVM array, accessed via taylor_c_load_diff().
        auto *diff_arr = std::get<llvm::Value *>(diff_var);

        // Create the arrays storing the results of the evaluation and the running compensations.
        auto *fp_vec_t = pointee_type(diff_arr);
        auto *array_type = llvm::ArrayType::get(fp_vec_t, n_eq);
        auto *res_arr_inst = builder.CreateAlloca(array_type);
        auto *comp_arr_inst = builder.CreateAlloca(array_type);
        // LCOV_EXCL_START
        assert(llvm_depr_GEP_type_check(res_arr_inst, array_type));
        assert(llvm_depr_GEP_type_check(comp_arr_inst, array_type));
        // LCOV_EXCL_STOP
        // Decay both arrays to pointers to their first elements.
        auto *res_arr = builder.CreateInBoundsGEP(array_type, res_arr_inst, {builder.getInt32(0), builder.getInt32(0)});
        auto *comp_arr
            = builder.CreateInBoundsGEP(array_type, comp_arr_inst, {builder.getInt32(0), builder.getInt32(0)});

        // Init res_arr with the order-0 coefficients, and the running
        // compensations with zero.
        llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
            // Load the value from diff_arr.
            assert(llvm_depr_GEP_type_check(diff_arr, fp_vec_t)); // LCOV_EXCL_LINE
            auto *val = builder.CreateLoad(fp_vec_t, builder.CreateInBoundsGEP(fp_vec_t, diff_arr, cur_var_idx));

            // Store it in res_arr.
            assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
            builder.CreateStore(val, builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx));

            // Zero-init the element in comp_arr.
            assert(llvm_depr_GEP_type_check(comp_arr, fp_vec_t)); // LCOV_EXCL_LINE
            builder.CreateStore(llvm::ConstantFP::get(fp_vec_t, 0.),
                                builder.CreateInBoundsGEP(fp_vec_t, comp_arr, cur_var_idx));
        });

        // Init the running updater for the powers of h.
        // cur_h holds h**cur_order during the evaluation loop below.
        auto *cur_h = builder.CreateAlloca(h->getType());
        builder.CreateStore(h, cur_h);

        // Run the evaluation.
        llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(order + 1u), [&](llvm::Value *cur_order) {
            // Load the current power of h.
            auto *cur_h_val = builder.CreateLoad(fp_vec_t, cur_h);

            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
                // Evaluate the current monomial: cf * h**cur_order.
                auto *cf = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_var_idx);
                auto *tmp = builder.CreateFMul(cf, cur_h_val);

                // Compute the quantities for the compensation
                // (Kahan-style compensated summation step).
                assert(llvm_depr_GEP_type_check(comp_arr, fp_vec_t)); // LCOV_EXCL_LINE
                auto *comp_ptr = builder.CreateInBoundsGEP(fp_vec_t, comp_arr, cur_var_idx);
                assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
                auto *res_ptr = builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx);
                auto *y = builder.CreateFSub(tmp, builder.CreateLoad(fp_vec_t, comp_ptr));
                auto *cur_res = builder.CreateLoad(fp_vec_t, res_ptr);
                auto *t = builder.CreateFAdd(cur_res, y);

                // Update the compensation and the return value.
                builder.CreateStore(builder.CreateFSub(builder.CreateFSub(t, cur_res), y), comp_ptr);
                builder.CreateStore(t, res_ptr);
            });

            // Update the value of h.
            builder.CreateStore(builder.CreateFMul(cur_h_val, h), cur_h);
        });

        // NOTE: the returned pointer refers to function-local (alloca) storage.
        return res_arr;
    } else {
        // Non-compact mode: one llvm::Value per derivative, laid out with the
        // order as the slowest-varying index (index = cur_order * n_eq + var_idx).
        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_var);

        // Init the return values with the order-0 monomials, and the running
        // compensations with zero.
        std::vector<llvm::Value *> res_arr, comp_arr;
        for (std::uint32_t i = 0; i < n_eq; ++i) {
            res_arr.push_back(diff_arr[i]);
            comp_arr.push_back(llvm::ConstantFP::get(diff_arr[i]->getType(), 0.));
        }

        // Evaluate and sum.
        auto *cur_h = h;
        for (std::uint32_t i = 1; i <= order; ++i) {
            for (std::uint32_t j = 0; j < n_eq; ++j) {
                // Evaluate the current monomial.
                auto *tmp = builder.CreateFMul(diff_arr[i * n_eq + j], cur_h);

                // Compute the quantities for the compensation
                // (Kahan-style compensated summation step).
                auto *y = builder.CreateFSub(tmp, comp_arr[j]);
                auto *t = builder.CreateFAdd(res_arr[j], y);

                // Update the compensation and the return value.
                comp_arr[j] = builder.CreateFSub(builder.CreateFSub(t, res_arr[j]), y);
                res_arr[j] = t;
            }

            // Update the power of h.
            cur_h = builder.CreateFMul(cur_h, h);
        }

        return res_arr;
    }
}
2033 
2034 // NOTE: in compact mode, care must be taken when adding multiple stepper functions to the same llvm state
2035 // with the same floating-point type, batch size and number of u variables. The potential issue there
2036 // is that when the first stepper is added, the compact mode AD functions are created and then optimised.
2037 // The optimisation pass might alter the functions in a way that makes them incompatible with subsequent
2038 // uses in the second stepper (e.g., an argument might be removed from the signature because it is a
2039 // compile-time constant). A workaround to avoid issues is to set the optimisation level to zero
2040 // in the state, add the 2 steppers and then run a single optimisation pass. This is what we do
2041 // in the integrators' ctors.
2042 // NOTE: document this eventually.
2043 template <typename T, typename U>
taylor_add_adaptive_step(llvm_state & s,const std::string & name,const U & sys,T tol,std::uint32_t batch_size,bool high_accuracy,bool compact_mode)2044 auto taylor_add_adaptive_step(llvm_state &s, const std::string &name, const U &sys, T tol, std::uint32_t batch_size,
2045                               bool high_accuracy, bool compact_mode)
2046 {
2047     using std::isfinite;
2048 
2049     assert(!s.is_compiled());
2050     assert(batch_size > 0u);
2051     assert(isfinite(tol) && tol > 0);
2052 
2053     // Determine the order from the tolerance.
2054     const auto order = taylor_order_from_tol(tol);
2055 
2056     // Record the number of equations/variables.
2057     const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());
2058 
2059     // Decompose the system of equations.
2060     // NOTE: no sv_funcs needed for this stepper.
2061     auto [dc, sv_funcs_dc] = taylor_decompose(sys, {});
2062 
2063     assert(sv_funcs_dc.empty());
2064 
2065     // Compute the number of u variables.
2066     assert(dc.size() > n_eq);
2067     const auto n_uvars = boost::numeric_cast<std::uint32_t>(dc.size() - n_eq);
2068 
2069     auto &builder = s.builder();
2070     auto &context = s.context();
2071 
2072     // Prepare the function prototype. The arguments are:
2073     // - pointer to the current state vector (read & write),
2074     // - pointer to the parameters (read only),
2075     // - pointer to the time value(s) (read only),
2076     // - pointer to the array of max timesteps (read & write),
2077     // - pointer to the Taylor coefficients output (write only).
2078     // These pointers cannot overlap.
2079     auto *fp_t = to_llvm_type<T>(context);
2080     auto *fp_vec_t = make_vector_type(fp_t, batch_size);
2081     std::vector<llvm::Type *> fargs(5, llvm::PointerType::getUnqual(fp_t));
2082     // The function does not return anything.
2083     auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
2084     assert(ft != nullptr);
2085     // Now create the function.
2086     auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
2087     if (f == nullptr) {
2088         throw std::invalid_argument(
2089             "Unable to create a function for an adaptive Taylor stepper with name '{}'"_format(name));
2090     }
2091 
2092     // Set the names/attributes of the function arguments.
2093     auto *state_ptr = f->args().begin();
2094     state_ptr->setName("state_ptr");
2095     state_ptr->addAttr(llvm::Attribute::NoCapture);
2096     state_ptr->addAttr(llvm::Attribute::NoAlias);
2097 
2098     auto *par_ptr = state_ptr + 1;
2099     par_ptr->setName("par_ptr");
2100     par_ptr->addAttr(llvm::Attribute::NoCapture);
2101     par_ptr->addAttr(llvm::Attribute::NoAlias);
2102     par_ptr->addAttr(llvm::Attribute::ReadOnly);
2103 
2104     auto *time_ptr = par_ptr + 1;
2105     time_ptr->setName("time_ptr");
2106     time_ptr->addAttr(llvm::Attribute::NoCapture);
2107     time_ptr->addAttr(llvm::Attribute::NoAlias);
2108     time_ptr->addAttr(llvm::Attribute::ReadOnly);
2109 
2110     auto *h_ptr = time_ptr + 1;
2111     h_ptr->setName("h_ptr");
2112     h_ptr->addAttr(llvm::Attribute::NoCapture);
2113     h_ptr->addAttr(llvm::Attribute::NoAlias);
2114 
2115     auto *tc_ptr = h_ptr + 1;
2116     tc_ptr->setName("tc_ptr");
2117     tc_ptr->addAttr(llvm::Attribute::NoCapture);
2118     tc_ptr->addAttr(llvm::Attribute::NoAlias);
2119     tc_ptr->addAttr(llvm::Attribute::WriteOnly);
2120 
2121     // Create a new basic block to start insertion into.
2122     auto *bb = llvm::BasicBlock::Create(context, "entry", f);
2123     assert(bb != nullptr);
2124     builder.SetInsertPoint(bb);
2125 
2126     // Compute the jet of derivatives at the given order.
2127     auto diff_variant = taylor_compute_jet<T>(s, state_ptr, par_ptr, time_ptr, dc, {}, n_eq, n_uvars, order, batch_size,
2128                                               compact_mode, high_accuracy);
2129 
2130     // Determine the integration timestep.
2131     auto h = taylor_determine_h<T>(s, diff_variant, sv_funcs_dc, nullptr, h_ptr, n_eq, n_uvars, order, batch_size,
2132                                    nullptr);
2133 
2134     // Evaluate the Taylor polynomials, producing the updated state of the system.
2135     auto new_state_var
2136         = high_accuracy ? taylor_run_ceval(s, diff_variant, h, n_eq, n_uvars, order, high_accuracy, compact_mode)
2137                         : taylor_run_multihorner(s, diff_variant, h, n_eq, n_uvars, order, batch_size, compact_mode);
2138 
2139     // Store the new state.
2140     // NOTE: no need to perform overflow check on n_eq * batch_size,
2141     // as in taylor_compute_jet() we already checked.
2142     if (compact_mode) {
2143         auto new_state = std::get<llvm::Value *>(new_state_var);
2144 
2145         llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
2146             assert(llvm_depr_GEP_type_check(new_state, fp_vec_t)); // LCOV_EXCL_LINE
2147             auto val = builder.CreateLoad(fp_vec_t, builder.CreateInBoundsGEP(fp_vec_t, new_state, cur_var_idx));
2148             assert(llvm_depr_GEP_type_check(state_ptr, fp_t)); // LCOV_EXCL_LINE
2149             store_vector_to_memory(builder,
2150                                    builder.CreateInBoundsGEP(
2151                                        fp_t, state_ptr, builder.CreateMul(cur_var_idx, builder.getInt32(batch_size))),
2152                                    val);
2153         });
2154     } else {
2155         const auto &new_state = std::get<std::vector<llvm::Value *>>(new_state_var);
2156 
2157         assert(new_state.size() == n_eq);
2158 
2159         for (std::uint32_t var_idx = 0; var_idx < n_eq; ++var_idx) {
2160             assert(llvm_depr_GEP_type_check(state_ptr, fp_t)); // LCOV_EXCL_LINE
2161             store_vector_to_memory(builder,
2162                                    builder.CreateInBoundsGEP(fp_t, state_ptr, builder.getInt32(var_idx * batch_size)),
2163                                    new_state[var_idx]);
2164         }
2165     }
2166 
2167     // Store the timesteps that were used.
2168     store_vector_to_memory(builder, h_ptr, h);
2169 
2170     // Write the Taylor coefficients, if requested.
2171     auto nptr = llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(to_llvm_type<T>(s.context())));
2172     llvm_if_then_else(
2173         s, builder.CreateICmpNE(tc_ptr, nptr),
2174         [&]() {
2175             // tc_ptr is not null: copy the Taylor coefficients
2176             // for the state variables.
2177             taylor_write_tc(s, diff_variant, {}, nullptr, tc_ptr, n_eq, n_uvars, order, batch_size);
2178         },
2179         [&]() {
2180             // Taylor coefficients were not requested,
2181             // don't do anything in this branch.
2182         });
2183 
2184     // Create the return value.
2185     builder.CreateRetVoid();
2186 
2187     // Verify the function.
2188     s.verify_function(f);
2189 
2190     // Run the optimisation pass.
2191     // NOTE: this does nothing currently, as the optimisation
2192     // level is set to zero from the outside.
2193     s.optimise();
2194 
2195     return std::tuple{std::move(dc), order};
2196 }
2197 
2198 } // namespace
2199 
// Finalise the construction of the integrator: validate and store the input
// data, JIT-compile the stepper and dense output functions, and set up the
// internal buffers and (optionally) the event detection machinery.
template <typename T>
template <typename U>
void taylor_adaptive_impl<T>::finalise_ctor_impl(const U &sys, std::vector<T> state, T time, T tol, bool high_accuracy,
                                                 bool compact_mode, std::vector<T> pars, std::vector<t_event_t> tes,
                                                 std::vector<nt_event_t> ntes)
{
#if defined(HEYOKA_ARCH_PPC)
    if constexpr (std::is_same_v<T, long double>) {
        throw not_implemented_error("'long double' computations are not supported on PowerPC");
    }
#endif

    using std::isfinite;

    // Assign the data members.
    m_state = std::move(state);
    m_time = dfloat<T>(time);
    m_pars = std::move(pars);
    m_high_accuracy = high_accuracy;
    m_compact_mode = compact_mode;

    // Check input params.
    if (std::any_of(m_state.begin(), m_state.end(), [](const auto &x) { return !isfinite(x); })) {
        throw std::invalid_argument(
            "A non-finite value was detected in the initial state of an adaptive Taylor integrator");
    }

    // The state vector must have one entry per equation.
    if (m_state.size() != sys.size()) {
        throw std::invalid_argument(
            "Inconsistent sizes detected in the initialization of an adaptive Taylor "
            "integrator: the state vector has a dimension of {}, while the number of equations is {}"_format(
                m_state.size(), sys.size()));
    }

    if (!isfinite(m_time)) {
        throw std::invalid_argument(
            "Cannot initialise an adaptive Taylor integrator with a non-finite initial time of {}"_format(
                static_cast<T>(m_time)));
    }

    if (!isfinite(tol) || tol <= 0) {
        throw std::invalid_argument(
            "The tolerance in an adaptive Taylor integrator must be finite and positive, but it is {} instead"_format(
                tol));
    }

    // Store the tolerance.
    m_tol = tol;

    // Store the dimension of the system.
    m_dim = boost::numeric_cast<std::uint32_t>(sys.size());

    // Do we have events?
    const auto with_events = !tes.empty() || !ntes.empty();

    // Temporarily disable optimisations in s, so that
    // we don't optimise twice when adding the step
    // and then the d_out.
    std::optional<opt_disabler> od(m_llvm);

    // Add the stepper function.
    if (with_events) {
        // Collect the event equations from both the terminal
        // and non-terminal events.
        std::vector<expression> ee;
        // NOTE: no need for deep copies of the expressions: ee is never mutated
        // and we will be deep-copying it anyway when we do the decomposition.
        for (const auto &ev : tes) {
            ee.push_back(ev.get_expression());
        }
        for (const auto &ev : ntes) {
            ee.push_back(ev.get_expression());
        }

        // NOTE(review): high_accuracy is passed both as the 6th argument and as
        // the last argument here - presumably the declaration has two bool
        // parameters in those positions; confirm against the declaration.
        std::tie(m_dc, m_order) = taylor_add_adaptive_step_with_events<T>(m_llvm, "step_e", sys, tol, 1, high_accuracy,
                                                                          compact_mode, ee, high_accuracy);
    } else {
        // Scalar integrator: batch size of 1.
        std::tie(m_dc, m_order) = taylor_add_adaptive_step<T>(m_llvm, "step", sys, tol, 1, high_accuracy, compact_mode);
    }

    // Fix m_pars' size, if necessary.
    // Missing parameter values are zero-filled by resize();
    // an excess of values is an error.
    const auto npars = n_pars_in_dc(m_dc);
    if (m_pars.size() < npars) {
        m_pars.resize(boost::numeric_cast<decltype(m_pars.size())>(npars));
    } else if (m_pars.size() > npars) {
        throw std::invalid_argument(
            "Excessive number of parameter values passed to the constructor of an adaptive "
            "Taylor integrator: {} parameter values were passed, but the ODE system contains only {} parameters"_format(
                m_pars.size(), npars));
    }

    // Log runtimes in trace mode.
    spdlog::stopwatch sw;

    // Add the function for the computation of
    // the dense output.
    taylor_add_d_out_function<T>(m_llvm, m_dim, m_order, 1, high_accuracy);

    get_logger()->trace("Taylor dense output runtime: {}", sw);
    sw.reset();

    // Restore the original optimisation level in s.
    od.reset();

    // Run the optimisation pass manually.
    m_llvm.optimise();

    get_logger()->trace("Taylor global opt pass runtime: {}", sw);
    sw.reset();

    // Run the jit.
    m_llvm.compile();

    get_logger()->trace("Taylor LLVM compilation runtime: {}", sw);

    // Fetch the stepper.
    if (with_events) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }

    // Fetch the function to compute the dense output.
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));

    // Setup the vector for the Taylor coefficients.
    // The Taylor coefficients buffer holds (order + 1) coefficients
    // per state variable; guard the size computation against overflow.
    // LCOV_EXCL_START
    if (m_order == std::numeric_limits<std::uint32_t>::max()
        || m_state.size() > std::numeric_limits<decltype(m_tc.size())>::max() / (m_order + 1u)) {
        throw std::overflow_error("Overflow detected in the initialisation of an adaptive Taylor integrator: the order "
                                  "or the state size is too large");
    }
    // LCOV_EXCL_STOP

    m_tc.resize(m_state.size() * (m_order + 1u));

    // Setup the vector for the dense output.
    m_d_out.resize(m_state.size());

    // Init the event data structure if needed.
    // NOTE: this can be done in parallel with the rest of the constructor,
    // once we have m_order/m_dim and we are done using tes/ntes.
    if (with_events) {
        m_ed_data = std::make_unique<ed_data>(std::move(tes), std::move(ntes), m_order, m_dim);
    }
}
2344 
// Default constructor: delegate to the main constructor with a trivial
// one-dimensional ODE (x' = 0), a zero initial state and a tolerance of 1e-1.
template <typename T>
taylor_adaptive_impl<T>::taylor_adaptive_impl()
    : taylor_adaptive_impl({prime("x"_var) = 0_dbl}, {T(0)}, kw::tol = T(1e-1))
{
}
2350 
// Copy constructor. Most members are copied directly; the decomposition is
// deep-copied explicitly, and the stepper/dense output function pointers are
// re-fetched from the copied llvm state.
template <typename T>
taylor_adaptive_impl<T>::taylor_adaptive_impl(const taylor_adaptive_impl &other)
    : m_state(other.m_state), m_time(other.m_time), m_llvm(other.m_llvm), m_dim(other.m_dim), m_order(other.m_order),
      m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode),
      m_pars(other.m_pars), m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out),
      m_ed_data(other.m_ed_data ? std::make_unique<ed_data>(*other.m_ed_data) : nullptr)
{
    // NOTE: make explicit deep copy of the decomposition.
    m_dc.reserve(other.m_dc.size());
    for (const auto &[ex, deps] : other.m_dc) {
        m_dc.emplace_back(copy(ex), deps);
    }

    // Re-fetch the function pointers from the copied llvm state.
    // The presence of m_ed_data tells us which stepper was compiled.
    if (m_ed_data) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
}
2371 
// Move constructor (defaulted).
template <typename T>
taylor_adaptive_impl<T>::taylor_adaptive_impl(taylor_adaptive_impl &&) noexcept = default;
2374 
2375 template <typename T>
operator =(const taylor_adaptive_impl & other)2376 taylor_adaptive_impl<T> &taylor_adaptive_impl<T>::operator=(const taylor_adaptive_impl &other)
2377 {
2378     if (this != &other) {
2379         *this = taylor_adaptive_impl(other);
2380     }
2381 
2382     return *this;
2383 }
2384 
// Move assignment operator (defaulted).
template <typename T>
taylor_adaptive_impl<T> &taylor_adaptive_impl<T>::operator=(taylor_adaptive_impl &&) noexcept = default;
2387 
// Destructor (defaulted).
template <typename T>
taylor_adaptive_impl<T>::~taylor_adaptive_impl() = default;
2390 
2391 // NOTE: the save/load patterns mimic the copy constructor logic.
// Serialise all the data members of the integrator into the archive.
// NOTE: the streaming order here must be kept in sync with load_impl().
template <typename T>
template <typename Archive>
void taylor_adaptive_impl<T>::save_impl(Archive &ar, unsigned) const
{
    ar << m_state;
    ar << m_time;
    ar << m_llvm;
    ar << m_dim;
    ar << m_dc;
    ar << m_order;
    ar << m_tol;
    ar << m_high_accuracy;
    ar << m_compact_mode;
    ar << m_pars;
    ar << m_tc;
    ar << m_last_h;
    ar << m_d_out;
    ar << m_ed_data;
}
2411 
// Deserialise the integrator from the archive, rejecting archives produced
// by versions of the class older than the current serialisation version.
// NOTE: the streaming order here must be kept in sync with save_impl().
template <typename T>
template <typename Archive>
void taylor_adaptive_impl<T>::load_impl(Archive &ar, unsigned version)
{
    // LCOV_EXCL_START
    if (version < static_cast<unsigned>(boost::serialization::version<taylor_adaptive_impl<T>>::type::value)) {
        throw std::invalid_argument("Unable to load a taylor_adaptive integrator: "
                                    "the archive version ({}) is too old"_format(version));
    }
    // LCOV_EXCL_STOP

    ar >> m_state;
    ar >> m_time;
    ar >> m_llvm;
    ar >> m_dim;
    ar >> m_dc;
    ar >> m_order;
    ar >> m_tol;
    ar >> m_high_accuracy;
    ar >> m_compact_mode;
    ar >> m_pars;
    ar >> m_tc;
    ar >> m_last_h;
    ar >> m_d_out;
    ar >> m_ed_data;

    // Recover the function pointers.
    // They are re-fetched by name from the freshly-deserialised llvm state;
    // the presence of m_ed_data tells us which stepper was compiled.
    if (m_ed_data) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
}
2446 
// Boost.Serialization entry point (binary output archive):
// forward to the implementation detail.
template <typename T>
void taylor_adaptive_impl<T>::save(boost::archive::binary_oarchive &ar, unsigned v) const
{
    save_impl(ar, v);
}
2452 
// Boost.Serialization entry point (binary input archive):
// forward to the implementation detail.
template <typename T>
void taylor_adaptive_impl<T>::load(boost::archive::binary_iarchive &ar, unsigned v)
{
    load_impl(ar, v);
}
2458 
2459 // Implementation detail to make a single integration timestep.
2460 // The magnitude of the timestep is automatically deduced, but it will
2461 // always be not greater than abs(max_delta_t). The propagation
2462 // is done forward in time if max_delta_t >= 0, backwards in
2463 // time otherwise.
2464 //
2465 // The function will return a pair, containing
2466 // a flag describing the outcome of the integration,
2467 // and the integration timestep that was used.
2468 template <typename T>
step_impl(T max_delta_t,bool wtc)2469 std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step_impl(T max_delta_t, bool wtc)
2470 {
2471     using std::abs;
2472     using std::isfinite;
2473 
2474 #if !defined(NDEBUG)
2475     // NOTE: this is the only precondition on max_delta_t.
2476     using std::isnan;
2477     assert(!isnan(max_delta_t)); // LCOV_EXCL_LINE
2478 #endif
2479 
2480     auto h = max_delta_t;
2481 
2482     if (m_step_f.index() == 0u) {
2483         assert(!m_ed_data); // LCOV_EXCL_LINE
2484 
2485         // Invoke the vanilla stepper.
2486         std::get<0>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr);
2487 
2488         // Update the time.
2489         m_time += h;
2490 
2491         // Store the last timestep.
2492         m_last_h = h;
2493 
2494         // Check if the time or the state vector are non-finite at the
2495         // end of the timestep.
2496         if (!isfinite(m_time)
2497             || std::any_of(m_state.cbegin(), m_state.cend(), [](const auto &x) { return !isfinite(x); })) {
2498             return std::tuple{taylor_outcome::err_nf_state, h};
2499         }
2500 
2501         return std::tuple{h == max_delta_t ? taylor_outcome::time_limit : taylor_outcome::success, h};
2502     } else {
2503         assert(m_ed_data); // LCOV_EXCL_LINE
2504 
2505         auto &ed_data = *m_ed_data;
2506 
2507         // Invoke the stepper for event handling. We will record the norm infinity of the state vector +
2508         // event equations at the beginning of the timestep for later use.
2509         T max_abs_state;
2510         std::get<1>(m_step_f)(ed_data.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state);
2511 
2512         // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for
2513         // automatic cooldown deduction. If max_abs_state is not finite, set it to inf so that
2514         // in ed_data.detect_events() we skip event detection altogether.
2515         T g_eps;
2516         if (isfinite(max_abs_state)) {
2517             // Are we in absolute or relative error control mode?
2518             const auto abs_or_rel = max_abs_state < 1;
2519 
2520             // Estimate the size of the largest remainder in the Taylor
2521             // series of both the dynamical equations and the events.
2522             const auto max_r_size = abs_or_rel ? m_tol : (m_tol * max_abs_state);
2523 
2524             // NOTE: depending on m_tol, max_r_size is arbitrarily small, but the real
2525             // integration error cannot be too small due to floating-point truncation.
2526             // This is the case for instance if we use sub-epsilon integration tolerances
2527             // to achieve Brouwer's law. In such a case, we cap the value of g_eps,
2528             // using eps * max_abs_state as an estimation of the smallest number
2529             // that can be resolved with the current floating-point type.
2530             // NOTE: the if condition in the next line is equivalent, in relative
2531             // error control mode, to:
2532             // if (m_tol < std::numeric_limits<T>::epsilon())
2533             if (max_r_size < std::numeric_limits<T>::epsilon() * max_abs_state) {
2534                 g_eps = std::numeric_limits<T>::epsilon() * max_abs_state;
2535             } else {
2536                 g_eps = max_r_size;
2537             }
2538         } else {
2539             g_eps = std::numeric_limits<T>::infinity();
2540         }
2541 
2542         // Write unconditionally the tcs.
2543         std::copy(ed_data.m_ev_jet.data(), ed_data.m_ev_jet.data() + m_dim * (m_order + 1u), m_tc.data());
2544 
2545         // Do the event detection.
2546         ed_data.detect_events(h, m_order, m_dim, g_eps);
2547 
2548         // NOTE: before this point, we did not alter
2549         // any user-visible data in the integrator (just
2550         // temporary memory). From here until we start invoking
2551         // the callbacks, everything is noexcept, so we don't
2552         // risk leaving the integrator in a half-baked state.
2553 
2554         // Sort the events by time.
2555         // NOTE: the time coordinates in m_d_(n)tes is relative
2556         // to the beginning of the timestep. It will be negative
2557         // for backward integration, thus we compare using
2558         // abs() so that the first events are those which
2559         // happen closer to the beginning of the timestep.
2560         // NOTE: the checks inside ed_data.detect_events() ensure
2561         // that we can safely sort the events' times.
2562         auto cmp = [](const auto &ev0, const auto &ev1) { return abs(std::get<1>(ev0)) < abs(std::get<1>(ev1)); };
2563         std::sort(ed_data.m_d_tes.begin(), ed_data.m_d_tes.end(), cmp);
2564         std::sort(ed_data.m_d_ntes.begin(), ed_data.m_d_ntes.end(), cmp);
2565 
2566         // If we have terminal events we need
2567         // to update the value of h.
2568         if (!ed_data.m_d_tes.empty()) {
2569             h = std::get<1>(ed_data.m_d_tes[0]);
2570         }
2571 
2572         // Update the state.
2573         m_d_out_f(m_state.data(), ed_data.m_ev_jet.data(), &h);
2574 
2575         // Update the time.
2576         m_time += h;
2577 
2578         // Store the last timestep.
2579         m_last_h = h;
2580 
2581         // Check if the time or the state vector are non-finite at the
2582         // end of the timestep.
2583         if (!isfinite(m_time)
2584             || std::any_of(m_state.cbegin(), m_state.cend(), [](const auto &x) { return !isfinite(x); })) {
2585             // Let's also reset the cooldown values, as at this point
2586             // they have become useless.
2587             reset_cooldowns();
2588 
2589             return std::tuple{taylor_outcome::err_nf_state, h};
2590         }
2591 
2592         // Update the cooldowns.
2593         for (auto &cd : ed_data.m_te_cooldowns) {
2594             if (cd) {
2595                 // Check if the timestep we just took
2596                 // brought this event outside the cooldown.
2597                 auto tmp = cd->first + h;
2598 
2599                 if (abs(tmp) >= cd->second) {
2600                     // We are now outside the cooldown period
2601                     // for this event, reset cd.
2602                     cd.reset();
2603                 } else {
2604                     // Still in cooldown, update the
2605                     // time spent in cooldown.
2606                     cd->first = tmp;
2607                 }
2608             }
2609         }
2610 
2611         // If we don't have terminal events, we will invoke the callbacks
2612         // of *all* the non-terminal events. Otherwise, we need to figure
2613         // out which non-terminal events do not happen because their time
2614         // coordinate is past the the first terminal event.
2615         const auto ntes_end_it
2616             = ed_data.m_d_tes.empty()
2617                   ? ed_data.m_d_ntes.end()
2618                   : std::lower_bound(ed_data.m_d_ntes.begin(), ed_data.m_d_ntes.end(), h,
2619                                      [](const auto &ev, const auto &t) { return abs(std::get<1>(ev)) < abs(t); });
2620 
2621         // Invoke the callbacks of the non-terminal events, which are guaranteed
2622         // to happen before the first terminal event.
2623         for (auto it = ed_data.m_d_ntes.begin(); it != ntes_end_it; ++it) {
2624             const auto &t = *it;
2625             const auto &cb = ed_data.m_ntes[std::get<0>(t)].get_callback();
2626             assert(cb); // LCOV_EXCL_LINE
2627             cb(*this, static_cast<T>(m_time - m_last_h + std::get<1>(t)), std::get<2>(t));
2628         }
2629 
2630         // The return value of the first
2631         // terminal event's callback. It will be
2632         // unused if there are no terminal events.
2633         bool te_cb_ret = false;
2634 
2635         if (!ed_data.m_d_tes.empty()) {
2636             // Fetch the first terminal event.
2637             const auto te_idx = std::get<0>(ed_data.m_d_tes[0]);
2638             assert(te_idx < ed_data.m_tes.size()); // LCOV_EXCL_LINE
2639             const auto &te = ed_data.m_tes[te_idx];
2640 
2641             // Set the corresponding cooldown.
2642             if (te.get_cooldown() >= 0) {
2643                 // Cooldown explicitly provided by the user, use it.
2644                 ed_data.m_te_cooldowns[te_idx].emplace(0, te.get_cooldown());
2645             } else {
2646                 // Deduce the cooldown automatically.
2647                 // NOTE: if g_eps is not finite, we skipped event detection
2648                 // altogether and thus we never end up here. If the derivative
2649                 // of the event equation is not finite, the event is also skipped.
2650                 ed_data.m_te_cooldowns[te_idx].emplace(0,
2651                                                        taylor_deduce_cooldown(g_eps, std::get<4>(ed_data.m_d_tes[0])));
2652             }
2653 
2654             // Invoke the callback of the first terminal event, if it has one.
2655             if (te.get_callback()) {
2656                 te_cb_ret = te.get_callback()(*this, std::get<2>(ed_data.m_d_tes[0]), std::get<3>(ed_data.m_d_tes[0]));
2657             }
2658         }
2659 
2660         if (ed_data.m_d_tes.empty()) {
2661             // No terminal events detected, return success or time limit.
2662             return std::tuple{h == max_delta_t ? taylor_outcome::time_limit : taylor_outcome::success, h};
2663         } else {
2664             // Terminal event detected. Fetch its index.
2665             const auto ev_idx = static_cast<std::int64_t>(std::get<0>(ed_data.m_d_tes[0]));
2666 
2667             // NOTE: if te_cb_ret is true, it means that the terminal event has
2668             // a callback and its invocation returned true (meaning that the
2669             // integration should continue). Otherwise, either the terminal event
2670             // has no callback or its callback returned false, meaning that the
2671             // integration must stop.
2672             return std::tuple{taylor_outcome{te_cb_ret ? ev_idx : (-ev_idx - 1)}, h};
2673         }
2674     }
2675 }
2676 
2677 template <typename T>
step(bool wtc)2678 std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step(bool wtc)
2679 {
2680     // NOTE: time limit +inf means integration forward in time
2681     // and no time limit.
2682     return step_impl(std::numeric_limits<T>::infinity(), wtc);
2683 }
2684 
2685 template <typename T>
step_backward(bool wtc)2686 std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step_backward(bool wtc)
2687 {
2688     return step_impl(-std::numeric_limits<T>::infinity(), wtc);
2689 }
2690 
2691 template <typename T>
step(T max_delta_t,bool wtc)2692 std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step(T max_delta_t, bool wtc)
2693 {
2694     using std::isnan;
2695 
2696     if (isnan(max_delta_t)) {
2697         throw std::invalid_argument(
2698             "A NaN max_delta_t was passed to the step() function of an adaptive Taylor integrator");
2699     }
2700 
2701     return step_impl(max_delta_t, wtc);
2702 }
2703 
2704 // Reset all cooldowns for the terminal events.
2705 template <typename T>
reset_cooldowns()2706 void taylor_adaptive_impl<T>::reset_cooldowns()
2707 {
2708     if (!m_ed_data) {
2709         throw std::invalid_argument("No events were defined for this integrator");
2710     }
2711 
2712     for (auto &cd : m_ed_data->m_te_cooldowns) {
2713         cd.reset();
2714     }
2715 }
2716 
// Implementation of propagate_until(): advance the state of the integrator
// up to the absolute time coordinate t (given in double-length format).
//
// t           : the final time (must be finite).
// max_steps   : maximum number of step iterations (0 means no limit).
// max_delta_t : cap on the magnitude of each timestep (must be positive, not NaN).
// cb          : optional callback invoked after each successful step; a false
//               return value from the callback interrupts the integration.
// wtc         : whether to write the Taylor coefficients at each step
//               (forced to true if continuous output is requested).
// with_c_out  : whether to build and return a continuous_output object.
//
// The return value packs: the outcome, the min/max absolute values of the
// timesteps taken (considering only steps with outcome success), the number
// of nonzero timesteps, and the optional continuous output object.
//
// Throws std::invalid_argument on invalid current time, final time or
// max_delta_t, or if the remaining integration time overflows.
template <typename T>
std::tuple<taylor_outcome, T, T, std::size_t, std::optional<continuous_output<T>>>
taylor_adaptive_impl<T>::propagate_until_impl(const dfloat<T> &t, std::size_t max_steps, T max_delta_t,
                                              std::function<bool(taylor_adaptive_impl &)> cb, bool wtc, bool with_c_out)
{
    using std::abs;
    using std::isfinite;
    using std::isnan;

    // Check the current time.
    if (!isfinite(m_time)) {
        throw std::invalid_argument("Cannot invoke the propagate_until() function of an adaptive Taylor integrator if "
                                    "the current time is not finite");
    }

    // Check the final time.
    if (!isfinite(t)) {
        throw std::invalid_argument(
            "A non-finite time was passed to the propagate_until() function of an adaptive Taylor integrator");
    }

    // Check max_delta_t.
    if (isnan(max_delta_t)) {
        throw std::invalid_argument(
            "A nan max_delta_t was passed to the propagate_until() function of an adaptive Taylor integrator");
    }
    if (max_delta_t <= 0) {
        throw std::invalid_argument(
            "A non-positive max_delta_t was passed to the propagate_until() function of an adaptive Taylor integrator");
    }

    // If with_c_out is true, we always need to write the Taylor coefficients.
    wtc = wtc || with_c_out;

    // These vectors are used in the construction of the continuous output.
    // If continuous output is not requested, they will remain empty.
    std::vector<T> c_out_tcs, c_out_times_hi, c_out_times_lo;
    if (with_c_out) {
        // Push in the starting time.
        c_out_times_hi.push_back(m_time.hi);
        c_out_times_lo.push_back(m_time.lo);
    }

    // Initial values for the counters
    // and the min/max abs of the integration
    // timesteps.
    // NOTE: iter_counter is for keeping track of the max_steps
    // limits, step_counter counts the number of timesteps performed
    // with a nonzero h. Most of the time these two quantities
    // will be identical, apart from corner cases.
    std::size_t iter_counter = 0, step_counter = 0;
    T min_h = std::numeric_limits<T>::infinity(), max_h(0);

    // Init the remaining time.
    auto rem_time = t - m_time;

    // Check it.
    if (!isfinite(rem_time)) {
        throw std::invalid_argument("The final time passed to the propagate_until() function of an adaptive Taylor "
                                    "integrator results in an overflow condition");
    }

    // Cache the integration direction.
    const auto t_dir = (rem_time >= T(0));

    // Helper to create the continuous output object.
    auto make_c_out = [&]() -> std::optional<continuous_output<T>> {
        if (with_c_out) {
            if (c_out_times_hi.size() < 2u) {
                // NOTE: this means that no successful steps
                // were taken.
                return {};
            }

            // Construct the return value.
            continuous_output<T> ret(m_llvm.make_similar());

            // Fill in the data.
            ret.m_tcs = std::move(c_out_tcs);
            ret.m_times_hi = std::move(c_out_times_hi);
            ret.m_times_lo = std::move(c_out_times_lo);

            // Prepare the output vector.
            ret.m_output.resize(boost::numeric_cast<decltype(ret.m_output.size())>(m_dim));

            // Add the continuous output function.
            ret.add_c_out_function(m_order, m_dim, m_high_accuracy);

            return std::optional{std::move(ret)};
        } else {
            return {};
        }
    };

    // Helper to update the continuous output data after a timestep.
    auto update_c_out = [&]() {
        if (with_c_out) {
#if !defined(NDEBUG)
            const dfloat<T> prev_time(c_out_times_hi.back(), c_out_times_lo.back());
#endif

            c_out_times_hi.push_back(m_time.hi);
            c_out_times_lo.push_back(m_time.lo);

#if !defined(NDEBUG)
            // Debug check: the stored times must be monotonic in the
            // integration direction.
            const dfloat<T> new_time(c_out_times_hi.back(), c_out_times_lo.back());
            assert(isfinite(new_time));
            if (t_dir) {
                assert(!(new_time < prev_time));
            } else {
                assert(!(new_time > prev_time));
            }
#endif

            c_out_tcs.insert(c_out_tcs.end(), m_tc.begin(), m_tc.end());
        }
    };

    while (true) {
        // Compute the max integration times for this timestep.
        // NOTE: rem_time is guaranteed to be finite: we check it explicitly above
        // and we keep on decreasing its magnitude at the following iterations.
        // If some non-finite state/time is generated in
        // the step function, the integration will be stopped.
        assert((rem_time >= T(0)) == t_dir); // LCOV_EXCL_LINE
        const auto dt_limit
            = t_dir ? std::min(dfloat<T>(max_delta_t), rem_time) : std::max(dfloat<T>(-max_delta_t), rem_time);
        // NOTE: if dt_limit is zero, step_impl() will always return time_limit.
        const auto [res, h] = step_impl(static_cast<T>(dt_limit), wtc);

        if (res != taylor_outcome::success && res != taylor_outcome::time_limit && res < taylor_outcome{0}) {
            // Something went wrong in the propagation of the timestep, or we reached
            // a stopping terminal event
            if (res > taylor_outcome::success) {
                // In case of a stopping terminal event, we still want to update
                // the continuous output and the step counter.
                update_c_out();
                step_counter += static_cast<std::size_t>(h != 0);
            }

            return std::tuple{res, min_h, max_h, step_counter, make_c_out()};
        }

        // Update the continuous output data.
        update_c_out();

        // Update the number of iterations.
        ++iter_counter;

        // Update the number of steps.
        step_counter += static_cast<std::size_t>(h != 0);

        // Update min_h/max_h, but only if the outcome is success (otherwise
        // the step was artificially clamped either by a time limit or
        // by a terminal event).
        if (res == taylor_outcome::success) {
            const auto abs_h = abs(h);
            min_h = std::min(min_h, abs_h);
            max_h = std::max(max_h, abs_h);
        }

        // The step was successful, execute the callback if applicable.
        if (cb && !cb(*this)) {
            // Interruption via callback.
            return std::tuple{taylor_outcome::cb_stop, min_h, max_h, step_counter, make_c_out()};
        }

        // Break out if the final time is reached,
        // NOTE: here we check h == rem_time, instead of just
        // res == time_limit, because clamping via max_delta_t
        // could also result in time_limit.
        if (h == static_cast<T>(rem_time)) {
            assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
            return std::tuple{taylor_outcome::time_limit, min_h, max_h, step_counter, make_c_out()};
        }

        // Check the iteration limit.
        // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
        // then this condition will never trigger as by this point we are
        // sure iter_counter is at least 1.
        if (iter_counter == max_steps) {
            return std::tuple{taylor_outcome::step_limit, min_h, max_h, step_counter, make_c_out()};
        }

        // Update the remaining time.
        // NOTE: in principle, due to the fact that
        // dt_limit is computed in extended
        // precision but it is cast to normal precision
        // before being added to m_time in step_impl(),
        // there could be numerical inconsistencies
        // at the last timestep which could result in rem_time
        // not exactly zero or with a flipped sign.
        // For this function, this should
        // not matter because if static_cast<T>(rem_time) was used
        // as an integration step, we will have already exited
        // above before anything bad can
        // happen. If anything smaller than static_cast<T>(rem_time)
        // was used as timestep, rem_time should not change sign.
        rem_time = t - m_time;
    }
}
2918 
// Implementation of propagate_grid(): propagate the state of the integrator
// over a time grid, computing the system state at each grid point via dense
// output.
//
// grid        : the (non-empty) time grid; the points must be finite and
//               strictly monotonic (increasing or decreasing).
// max_steps   : maximum number of step iterations (0 means no limit).
// max_delta_t : cap on the magnitude of each timestep (must be positive, not NaN).
// cb          : optional callback invoked after each successful step; a false
//               return value from the callback interrupts the integration.
//
// The return value packs: the outcome, the min/max absolute values of the
// timesteps taken (considering only steps with outcome success), the number
// of nonzero timesteps, and the flattened state vectors (dim values per grid
// point, in grid order; possibly truncated if the integration stops early).
//
// Throws std::invalid_argument on a non-finite current time, invalid
// max_delta_t or an invalid grid, and std::overflow_error if the output
// vector size would overflow.
template <typename T>
std::tuple<taylor_outcome, T, T, std::size_t, std::vector<T>>
taylor_adaptive_impl<T>::propagate_grid_impl(const std::vector<T> &grid, std::size_t max_steps, T max_delta_t,
                                             std::function<bool(taylor_adaptive_impl &)> cb)
{
    using std::abs;
    using std::isfinite;
    using std::isnan;

    if (!isfinite(m_time)) {
        throw std::invalid_argument(
            "Cannot invoke propagate_grid() in an adaptive Taylor integrator if the current time is not finite");
    }

    // Check max_delta_t.
    if (isnan(max_delta_t)) {
        throw std::invalid_argument(
            "A nan max_delta_t was passed to the propagate_grid() function of an adaptive Taylor integrator");
    }
    if (max_delta_t <= 0) {
        throw std::invalid_argument(
            "A non-positive max_delta_t was passed to the propagate_grid() function of an adaptive Taylor integrator");
    }

    // Check the grid.
    if (grid.empty()) {
        throw std::invalid_argument(
            "Cannot invoke propagate_grid() in an adaptive Taylor integrator if the time grid is empty");
    }

    constexpr auto nf_err_msg
        = "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator";
    constexpr auto ig_err_msg
        = "A non-monotonic time grid was passed to propagate_grid() in an adaptive Taylor integrator";
    // Check the first point.
    if (!isfinite(grid[0])) {
        throw std::invalid_argument(nf_err_msg);
    }
    if (grid.size() > 1u) {
        // Establish the direction of the grid from
        // the first two points.
        if (!isfinite(grid[1])) {
            throw std::invalid_argument(nf_err_msg);
        }
        if (grid[1] == grid[0]) {
            throw std::invalid_argument(ig_err_msg);
        }
        const auto grid_direction = grid[1] > grid[0];

        // Check that the remaining points are finite and that
        // they are ordered monotonically.
        for (decltype(grid.size()) i = 2; i < grid.size(); ++i) {
            if (!isfinite(grid[i])) {
                throw std::invalid_argument(nf_err_msg);
            }

            if ((grid[i] > grid[i - 1u]) != grid_direction) {
                throw std::invalid_argument(ig_err_msg);
            }
        }
    }

    // Pre-allocate the return value.
    std::vector<T> retval;
    // LCOV_EXCL_START
    if (get_dim() > std::numeric_limits<decltype(retval.size())>::max() / grid.size()) {
        throw std::overflow_error("Overflow detected in the creation of the return value of propagate_grid() in an "
                                  "adaptive Taylor integrator");
    }
    // LCOV_EXCL_STOP
    retval.reserve(grid.size() * get_dim());

    // Initial values for the counters
    // and the min/max abs of the integration
    // timesteps.
    // NOTE: iter_counter is for keeping track of the max_steps
    // limits, step_counter counts the number of timesteps performed
    // with a nonzero h. Most of the time these two quantities
    // will be identical, apart from corner cases.
    std::size_t iter_counter = 0, step_counter = 0;
    T min_h = std::numeric_limits<T>::infinity(), max_h(0);

    // Propagate the system up to the first grid point.
    // NOTE: this may not be needed strictly speaking if
    // the time is already grid[0], but it will ensure that
    // m_last_h is properly updated.
    // NOTE: this will *not* write the TCs, but, because we
    // know that the grid is strictly monotonic, we know that we
    // will take at least 1 TC-writing timestep before starting
    // to use the dense output.
    // NOTE: use the same max_steps for the initial propagation,
    // and don't pass the callback.
    const auto oc = std::get<0>(propagate_until(grid[0], kw::max_delta_t = max_delta_t, kw::max_steps = max_steps));

    if (oc != taylor_outcome::time_limit && oc < taylor_outcome{0}) {
        // The outcome is not time_limit and it is not a continuing
        // terminal event. This means that a non-finite state was
        // encountered, or a stopping terminal event triggered, or
        // the step limit was hit.
        return std::tuple{oc, min_h, max_h, step_counter, std::move(retval)};
    }

    // Add the first result to retval.
    retval.insert(retval.end(), m_state.begin(), m_state.end());

    // Init the remaining time.
    auto rem_time = grid.back() - m_time;

    // Check it.
    if (!isfinite(rem_time)) {
        throw std::invalid_argument("The final time passed to the propagate_grid() function of an adaptive Taylor "
                                    "integrator results in an overflow condition");
    }

    // Cache the integration direction.
    const auto t_dir = (rem_time >= T(0));

    // Iterate over the remaining grid points.
    for (decltype(grid.size()) cur_grid_idx = 1; cur_grid_idx < grid.size();) {
        // Establish the time range of the last
        // taken timestep.
        // NOTE: t0 < t1.
        const auto t0 = std::min(m_time, m_time - m_last_h);
        const auto t1 = std::max(m_time, m_time - m_last_h);

        // Compute the state of the system via dense output for as many grid
        // points as possible, i.e., as long as the grid times
        // fall within the validity range for the dense output.
        while (true) {
            // Fetch the current time target.
            const auto cur_tt = grid[cur_grid_idx];

            // NOTE: we force processing of all remaining grid points
            // if we are at the last timestep. We do this in order to avoid
            // numerical issues when deciding if the last grid point
            // falls within the range of validity of the dense output.
            if ((cur_tt >= t0 && cur_tt <= t1) || (rem_time == dfloat<T>(T(0)))) {
                // The current time target falls within the range of
                // validity of the dense output. Compute the dense
                // output in cur_tt.
                update_d_output(cur_tt);

                // Add the result to retval.
                retval.insert(retval.end(), m_d_out.begin(), m_d_out.end());
            } else {
                // Cannot use dense output on the current time target,
                // need to take another step.
                break;
            }

            // Move to the next time target, or break out
            // if we have no more.
            if (++cur_grid_idx == grid.size()) {
                break;
            }
        }

        if (cur_grid_idx == grid.size()) {
            // No more grid points, exit.
            break;
        }

        // Take the next step, making sure to write the Taylor coefficients
        // and to cap the timestep size so that we don't go past the
        // last grid point and we don't use a timestep exceeding max_delta_t.
        // NOTE: rem_time is guaranteed to be finite: we check it explicitly above
        // and we keep on decreasing its magnitude at the following iterations.
        // If some non-finite state/time is generated in
        // the step function, the integration will be stopped.
        assert((rem_time >= T(0)) == t_dir); // LCOV_EXCL_LINE
        const auto dt_limit
            = t_dir ? std::min(dfloat<T>(max_delta_t), rem_time) : std::max(dfloat<T>(-max_delta_t), rem_time);
        const auto [res, h] = step_impl(static_cast<T>(dt_limit), true);

        if (res != taylor_outcome::success && res != taylor_outcome::time_limit && res < taylor_outcome{0}) {
            // Something went wrong in the propagation of the timestep, or we reached
            // a stopping terminal event.

            if (res > taylor_outcome::success) {
                // In case of a stopping terminal event, we still want to update
                // the step counter.
                step_counter += static_cast<std::size_t>(h != 0);
            }

            return std::tuple{res, min_h, max_h, step_counter, std::move(retval)};
        }

        // Update the number of iterations.
        ++iter_counter;

        // Update the number of steps.
        step_counter += static_cast<std::size_t>(h != 0);

        // Update min_h/max_h, but only if the outcome is success (otherwise
        // the step was artificially clamped either by a time limit or
        // by a terminal event).
        if (res == taylor_outcome::success) {
            const auto abs_h = abs(h);
            min_h = std::min(min_h, abs_h);
            max_h = std::max(max_h, abs_h);
        }

        // Step successful: invoke the callback, if needed.
        if (cb && !cb(*this)) {
            // Interruption via callback.
            return std::tuple{taylor_outcome::cb_stop, min_h, max_h, step_counter, std::move(retval)};
        }

        // Check the iteration limit.
        // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
        // then this condition will never trigger as by this point we are
        // sure iter_counter is at least 1.
        if (iter_counter == max_steps) {
            return std::tuple{taylor_outcome::step_limit, min_h, max_h, step_counter, std::move(retval)};
        }

        // Update the remaining time.
        // NOTE: if static_cast<T>(rem_time) was used as a timestep,
        // it means that we hit the time limit. Force rem_time to zero
        // to signal this, avoiding inconsistencies with grid.back() - m_time
        // not going exactly to zero due to numerical issues. A zero rem_time
        // will also force the processing of all remaining grid points.
        if (h == static_cast<T>(rem_time)) {
            assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
            rem_time = dfloat<T>(T(0));
        } else {
            rem_time = grid.back() - m_time;
        }
    }

    // Everything went well, return time_limit.
    return std::tuple{taylor_outcome::time_limit, min_h, max_h, step_counter, std::move(retval)};
}
3152 
3153 template <typename T>
get_llvm_state() const3154 const llvm_state &taylor_adaptive_impl<T>::get_llvm_state() const
3155 {
3156     return m_llvm;
3157 }
3158 
3159 template <typename T>
get_decomposition() const3160 const taylor_dc_t &taylor_adaptive_impl<T>::get_decomposition() const
3161 {
3162     return m_dc;
3163 }
3164 
3165 template <typename T>
get_order() const3166 std::uint32_t taylor_adaptive_impl<T>::get_order() const
3167 {
3168     return m_order;
3169 }
3170 
3171 template <typename T>
get_tol() const3172 T taylor_adaptive_impl<T>::get_tol() const
3173 {
3174     return m_tol;
3175 }
3176 
3177 template <typename T>
get_high_accuracy() const3178 bool taylor_adaptive_impl<T>::get_high_accuracy() const
3179 {
3180     return m_high_accuracy;
3181 }
3182 
3183 template <typename T>
get_compact_mode() const3184 bool taylor_adaptive_impl<T>::get_compact_mode() const
3185 {
3186     return m_compact_mode;
3187 }
3188 
3189 template <typename T>
get_dim() const3190 std::uint32_t taylor_adaptive_impl<T>::get_dim() const
3191 {
3192     return m_dim;
3193 }
3194 
3195 template <typename T>
update_d_output(T time,bool rel_time)3196 const std::vector<T> &taylor_adaptive_impl<T>::update_d_output(T time, bool rel_time)
3197 {
3198     // NOTE: "time" needs to be translated
3199     // because m_d_out_f expects a time coordinate
3200     // with respect to the starting time t0 of
3201     // the *previous* timestep.
3202     if (rel_time) {
3203         // Time coordinate relative to the current time.
3204         const auto h = m_last_h + time;
3205 
3206         m_d_out_f(m_d_out.data(), m_tc.data(), &h);
3207     } else {
3208         // Absolute time coordinate.
3209         const auto h = time - (m_time - m_last_h);
3210 
3211         m_d_out_f(m_d_out.data(), m_tc.data(), &h.hi);
3212     }
3213 
3214     return m_d_out;
3215 }
3216 
// Explicit instantiation of the implementation classes/functions.
// NOTE: on Windows apparently it is necessary to declare that
// these instantiations are meant to be dll-exported.

// Instantiations for double precision.
template class taylor_adaptive_impl<double>;

template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<double>::finalise_ctor_impl(const std::vector<expression> &,
                                                                                 std::vector<double>, double, double,
                                                                                 bool, bool, std::vector<double>,
                                                                                 std::vector<t_event_t>,
                                                                                 std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void
taylor_adaptive_impl<double>::finalise_ctor_impl(const std::vector<std::pair<expression, expression>> &,
                                                 std::vector<double>, double, double, bool, bool, std::vector<double>,
                                                 std::vector<t_event_t>, std::vector<nt_event_t>);

// Instantiations for extended precision.
template class taylor_adaptive_impl<long double>;

template HEYOKA_DLL_PUBLIC void
taylor_adaptive_impl<long double>::finalise_ctor_impl(const std::vector<expression> &, std::vector<long double>,
                                                      long double, long double, bool, bool, std::vector<long double>,
                                                      std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<long double>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<long double>, long double, long double, bool,
    bool, std::vector<long double>, std::vector<t_event_t>, std::vector<nt_event_t>);

// Instantiations for quadruple precision, available only if
// mp++ real128 support was enabled at build time.
#if defined(HEYOKA_HAVE_REAL128)

template class taylor_adaptive_impl<mppp::real128>;

template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<mppp::real128>, mppp::real128, mppp::real128, bool, bool,
    std::vector<mppp::real128>, std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<mppp::real128>, mppp::real128, mppp::real128,
    bool, bool, std::vector<mppp::real128>, std::vector<t_event_t>, std::vector<nt_event_t>);

#endif
3257 
3258 } // namespace detail
3259 
3260 namespace detail
3261 {
3262 
// Finalise the construction of a batch-mode adaptive Taylor integrator.
//
// sys is the ODE system, either as a vector of expressions or as a vector of
// (variable, expression) pairs. state is the (flattened, batch-interleaved)
// initial state vector, batch_size the batch size, time the vector of initial
// times (one per batch element), tol the integration tolerance, pars the
// vector of parameter values, tes/ntes the terminal and non-terminal events.
//
// This function validates the inputs, JIT-compiles the timestepper and the
// dense output function, and sets up all the internal buffers.
//
// Throws std::invalid_argument on inconsistent/non-finite inputs,
// std::overflow_error on overflows in the computation of buffer sizes.
template <typename T>
template <typename U>
void taylor_adaptive_batch_impl<T>::finalise_ctor_impl(const U &sys, std::vector<T> state, std::uint32_t batch_size,
                                                       std::vector<T> time, T tol, bool high_accuracy,
                                                       bool compact_mode, std::vector<T> pars,
                                                       std::vector<t_event_t> tes, std::vector<nt_event_t> ntes)
{
#if defined(HEYOKA_ARCH_PPC)
    if constexpr (std::is_same_v<T, long double>) {
        throw not_implemented_error("'long double' computations are not supported on PowerPC");
    }
#endif

    using std::isfinite;

    // Init the data members.
    m_batch_size = batch_size;
    m_state = std::move(state);
    m_time_hi = std::move(time);
    // NOTE: the low parts of the double-length time coordinates
    // are zero-inited here by the resize().
    m_time_lo.resize(m_time_hi.size());
    m_pars = std::move(pars);
    m_high_accuracy = high_accuracy;
    m_compact_mode = compact_mode;

    // Check input params.
    if (m_batch_size == 0u) {
        throw std::invalid_argument("The batch size in an adaptive Taylor integrator cannot be zero");
    }

    if (std::any_of(m_state.begin(), m_state.end(), [](const auto &x) { return !isfinite(x); })) {
        throw std::invalid_argument(
            "A non-finite value was detected in the initial state of an adaptive Taylor integrator");
    }

    if (m_state.size() % m_batch_size != 0u) {
        throw std::invalid_argument(
            "Invalid size detected in the initialization of an adaptive Taylor "
            "integrator: the state vector has a size of {}, which is not a multiple of the batch size ({})"_format(
                m_state.size(), m_batch_size));
    }

    if (m_state.size() / m_batch_size != sys.size()) {
        throw std::invalid_argument(
            "Inconsistent sizes detected in the initialization of an adaptive Taylor "
            "integrator: the state vector has a dimension of {} and a batch size of {}, "
            "while the number of equations is {}"_format(m_state.size() / m_batch_size, m_batch_size, sys.size()));
    }

    if (m_time_hi.size() != m_batch_size) {
        throw std::invalid_argument(
            "Invalid size detected in the initialization of an adaptive Taylor "
            "integrator: the time vector has a size of {}, which is not equal to the batch size ({})"_format(
                m_time_hi.size(), m_batch_size));
    }
    // NOTE: no need to check m_time_lo for finiteness, as it
    // was inited to zero already.
    if (std::any_of(m_time_hi.begin(), m_time_hi.end(), [](const auto &x) { return !isfinite(x); })) {
        throw std::invalid_argument(
            "A non-finite initial time was detected in the initialisation of an adaptive Taylor integrator");
    }

    if (!isfinite(tol) || tol <= 0) {
        throw std::invalid_argument(
            "The tolerance in an adaptive Taylor integrator must be finite and positive, but it is {} instead"_format(
                tol));
    }

    // Store the tolerance.
    m_tol = tol;

    // Store the dimension of the system.
    m_dim = boost::numeric_cast<std::uint32_t>(sys.size());

    // Do we have events?
    const auto with_events = !tes.empty() || !ntes.empty();

    // Temporarily disable optimisations in m_llvm, so that
    // we don't optimise twice when adding the step
    // and then the d_out.
    std::optional<opt_disabler> od(m_llvm);

    // Add the stepper function.
    if (with_events) {
        // Collect the event equations to be appended to the Taylor
        // decomposition: terminal events first, then non-terminal ones.
        std::vector<expression> ee;
        // NOTE: no need for deep copies of the expressions: ee is never mutated
        // and we will be deep-copying it anyway when we do the decomposition.
        for (const auto &ev : tes) {
            ee.push_back(ev.get_expression());
        }
        for (const auto &ev : ntes) {
            ee.push_back(ev.get_expression());
        }

        std::tie(m_dc, m_order) = taylor_add_adaptive_step_with_events<T>(
            m_llvm, "step_e", sys, tol, batch_size, high_accuracy, compact_mode, ee, high_accuracy);
    } else {
        std::tie(m_dc, m_order)
            = taylor_add_adaptive_step<T>(m_llvm, "step", sys, tol, batch_size, high_accuracy, compact_mode);
    }

    // Fix m_pars' size, if necessary.
    // NOTE: npars is the number of parameters appearing in the decomposition;
    // the parameter array must contain npars values per batch element.
    const auto npars = n_pars_in_dc(m_dc);
    // LCOV_EXCL_START
    if (npars > std::numeric_limits<std::uint32_t>::max() / m_batch_size) {
        throw std::overflow_error("Overflow detected when computing the size of the parameter array in an adaptive "
                                  "Taylor integrator in batch mode");
    }
    // LCOV_EXCL_STOP
    if (m_pars.size() < npars * m_batch_size) {
        // Pad with zeroes any parameter values not provided by the user.
        m_pars.resize(boost::numeric_cast<decltype(m_pars.size())>(npars * m_batch_size));
    } else if (m_pars.size() > npars * m_batch_size) {
        throw std::invalid_argument("Excessive number of parameter values passed to the constructor of an adaptive "
                                    "Taylor integrator in batch mode: {} parameter values were passed, but the ODE "
                                    "system contains only {} parameters "
                                    "(in batches of {})"_format(m_pars.size(), npars, m_batch_size));
    }

    // Log runtimes in trace mode.
    spdlog::stopwatch sw;

    // Add the function for the computation of
    // the dense output.
    taylor_add_d_out_function<T>(m_llvm, m_dim, m_order, m_batch_size, high_accuracy);

    get_logger()->trace("Taylor batch dense output runtime: {}", sw);
    sw.reset();

    // Restore the original optimisation level in m_llvm.
    od.reset();

    // Run the optimisation pass manually.
    m_llvm.optimise();

    get_logger()->trace("Taylor batch global opt pass runtime: {}", sw);
    sw.reset();

    // Run the jit.
    m_llvm.compile();

    get_logger()->trace("Taylor batch LLVM compilation runtime: {}", sw);

    // Fetch the stepper.
    // NOTE: the stepper with events ("step_e") has a different signature,
    // hence m_step_f is a variant over the two function pointer types.
    if (with_events) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }

    // Fetch the function to compute the dense output.
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));

    // Setup the vector for the Taylor coefficients.
    // LCOV_EXCL_START
    if (m_order == std::numeric_limits<std::uint32_t>::max()
        || m_state.size() > std::numeric_limits<decltype(m_tc.size())>::max() / (m_order + 1u)) {
        throw std::overflow_error(
            "Overflow detected in the initialisation of an adaptive Taylor integrator in batch mode: the order "
            "or the state size is too large");
    }
    // LCOV_EXCL_STOP

    // NOTE: the size of m_state.size() already takes
    // into account the batch size.
    m_tc.resize(m_state.size() * (m_order + 1u));

    // Setup m_last_h.
    m_last_h.resize(boost::numeric_cast<decltype(m_last_h.size())>(batch_size));

    // Setup the vector for the dense output.
    // NOTE: the size of m_state.size() already takes
    // into account the batch size.
    m_d_out.resize(m_state.size());

    // Prepare the temp vectors.
    m_pinf.resize(m_batch_size, std::numeric_limits<T>::infinity());
    m_minf.resize(m_batch_size, -std::numeric_limits<T>::infinity());
    // LCOV_EXCL_START
    if (m_batch_size > std::numeric_limits<std::uint32_t>::max() / 2u) {
        throw std::overflow_error("Overflow detected in the initialisation of an adaptive Taylor integrator in batch "
                                  "mode: the batch size is too large");
    }
    // LCOV_EXCL_STOP
    // NOTE: make it twice as big because we need twice the storage in step_impl().
    m_delta_ts.resize(boost::numeric_cast<decltype(m_delta_ts.size())>(m_batch_size * 2u));

    // NOTE: init the outcome to success, the rest to zero.
    m_step_res.resize(boost::numeric_cast<decltype(m_step_res.size())>(m_batch_size),
                      std::tuple{taylor_outcome::success, T(0)});
    m_prop_res.resize(boost::numeric_cast<decltype(m_prop_res.size())>(m_batch_size),
                      std::tuple{taylor_outcome::success, T(0), T(0), std::size_t(0)});

    // Scratch buffers used by the propagate_*() implementations,
    // one entry per batch element.
    m_ts_count.resize(boost::numeric_cast<decltype(m_ts_count.size())>(m_batch_size));
    m_min_abs_h.resize(m_batch_size);
    m_max_abs_h.resize(m_batch_size);
    m_cur_max_delta_ts.resize(m_batch_size);
    m_pfor_ts.resize(boost::numeric_cast<decltype(m_pfor_ts.size())>(m_batch_size));
    m_t_dir.resize(boost::numeric_cast<decltype(m_t_dir.size())>(m_batch_size));
    m_rem_time.resize(m_batch_size);

    m_d_out_time.resize(m_batch_size);

    // Init the event data structure if needed.
    // NOTE: this can be done in parallel with the rest of the constructor,
    // once we have m_order/m_dim/m_batch_size and we are done using tes/ntes.
    if (with_events) {
        m_ed_data = std::make_unique<ed_data>(std::move(tes), std::move(ntes), m_order, m_dim, m_batch_size);
    }
}
3471 
// Default constructor: delegate to the constructor from an ODE system, using
// the trivial 1-equation system x' = 0, a zero initial state, a batch size
// of 1 and a tolerance of 1e-1.
template <typename T>
taylor_adaptive_batch_impl<T>::taylor_adaptive_batch_impl()
    : taylor_adaptive_batch_impl({prime("x"_var) = 0_dbl}, {T(0)}, 1u, kw::tol = T(1e-1))
{
}
3477 
// Copy constructor.
template <typename T>
taylor_adaptive_batch_impl<T>::taylor_adaptive_batch_impl(const taylor_adaptive_batch_impl &other)
    // NOTE: make a manual copy of all members, apart from the function pointers
    // (which are re-created below from the copied llvm_state) and the
    // decomposition (which is deep-copied in the body).
    : m_batch_size(other.m_batch_size), m_state(other.m_state), m_time_hi(other.m_time_hi), m_time_lo(other.m_time_lo),
      m_llvm(other.m_llvm), m_dim(other.m_dim), m_order(other.m_order), m_tol(other.m_tol),
      m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode), m_pars(other.m_pars),
      m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_pinf(other.m_pinf), m_minf(other.m_minf),
      m_delta_ts(other.m_delta_ts), m_step_res(other.m_step_res), m_prop_res(other.m_prop_res),
      m_ts_count(other.m_ts_count), m_min_abs_h(other.m_min_abs_h), m_max_abs_h(other.m_max_abs_h),
      m_cur_max_delta_ts(other.m_cur_max_delta_ts), m_pfor_ts(other.m_pfor_ts), m_t_dir(other.m_t_dir),
      m_rem_time(other.m_rem_time), m_d_out_time(other.m_d_out_time),
      m_ed_data(other.m_ed_data ? std::make_unique<ed_data>(*other.m_ed_data) : nullptr)
{
    // NOTE: make explicit deep copy of the decomposition.
    m_dc.reserve(other.m_dc.size());
    for (const auto &[ex, deps] : other.m_dc) {
        m_dc.emplace_back(copy(ex), deps);
    }

    // Recover the function pointers by looking them up in the
    // copied (and already compiled) llvm_state.
    if (m_ed_data) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
}
3504 
// Move constructor: the defaulted member-wise move is sufficient
// (the function pointers are moved together with the llvm_state).
template <typename T>
taylor_adaptive_batch_impl<T>::taylor_adaptive_batch_impl(taylor_adaptive_batch_impl &&) noexcept = default;
3507 
3508 template <typename T>
operator =(const taylor_adaptive_batch_impl & other)3509 taylor_adaptive_batch_impl<T> &taylor_adaptive_batch_impl<T>::operator=(const taylor_adaptive_batch_impl &other)
3510 {
3511     if (this != &other) {
3512         *this = taylor_adaptive_batch_impl(other);
3513     }
3514 
3515     return *this;
3516 }
3517 
// Move assignment operator: the defaulted member-wise move is sufficient.
template <typename T>
taylor_adaptive_batch_impl<T> &
taylor_adaptive_batch_impl<T>::operator=(taylor_adaptive_batch_impl &&) noexcept = default;
3521 
// Destructor (defaulted: all resources are managed via RAII members).
template <typename T>
taylor_adaptive_batch_impl<T>::~taylor_adaptive_batch_impl() = default;
3524 
3525 // NOTE: the save/load patterns mimic the copy constructor logic.
3526 template <typename T>
3527 template <typename Archive>
save_impl(Archive & ar,unsigned) const3528 void taylor_adaptive_batch_impl<T>::save_impl(Archive &ar, unsigned) const
3529 {
3530     // NOTE: save all members, apart from the function pointers.
3531     ar << m_batch_size;
3532     ar << m_state;
3533     ar << m_time_hi;
3534     ar << m_time_lo;
3535     ar << m_llvm;
3536     ar << m_dim;
3537     ar << m_dc;
3538     ar << m_order;
3539     ar << m_tol;
3540     ar << m_high_accuracy;
3541     ar << m_compact_mode;
3542     ar << m_pars;
3543     ar << m_tc;
3544     ar << m_last_h;
3545     ar << m_d_out;
3546     ar << m_pinf;
3547     ar << m_minf;
3548     ar << m_delta_ts;
3549     ar << m_step_res;
3550     ar << m_prop_res;
3551     ar << m_ts_count;
3552     ar << m_min_abs_h;
3553     ar << m_max_abs_h;
3554     ar << m_cur_max_delta_ts;
3555     ar << m_pfor_ts;
3556     ar << m_t_dir;
3557     ar << m_rem_time;
3558     ar << m_d_out_time;
3559     ar << m_ed_data;
3560 }
3561 
3562 template <typename T>
3563 template <typename Archive>
load_impl(Archive & ar,unsigned version)3564 void taylor_adaptive_batch_impl<T>::load_impl(Archive &ar, unsigned version)
3565 {
3566     // LCOV_EXCL_START
3567     if (version < static_cast<unsigned>(boost::serialization::version<taylor_adaptive_batch_impl<T>>::type::value)) {
3568         throw std::invalid_argument("Unable to load a taylor_adaptive_batch integrator: "
3569                                     "the archive version ({}) is too old"_format(version));
3570     }
3571     // LCOV_EXCL_STOP
3572 
3573     ar >> m_batch_size;
3574     ar >> m_state;
3575     ar >> m_time_hi;
3576     ar >> m_time_lo;
3577     ar >> m_llvm;
3578     ar >> m_dim;
3579     ar >> m_dc;
3580     ar >> m_order;
3581     ar >> m_tol;
3582     ar >> m_high_accuracy;
3583     ar >> m_compact_mode;
3584     ar >> m_pars;
3585     ar >> m_tc;
3586     ar >> m_last_h;
3587     ar >> m_d_out;
3588     ar >> m_pinf;
3589     ar >> m_minf;
3590     ar >> m_delta_ts;
3591     ar >> m_step_res;
3592     ar >> m_prop_res;
3593     ar >> m_ts_count;
3594     ar >> m_min_abs_h;
3595     ar >> m_max_abs_h;
3596     ar >> m_cur_max_delta_ts;
3597     ar >> m_pfor_ts;
3598     ar >> m_t_dir;
3599     ar >> m_rem_time;
3600     ar >> m_d_out_time;
3601     ar >> m_ed_data;
3602 
3603     // Recover the function pointers.
3604     if (m_ed_data) {
3605         m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
3606     } else {
3607         m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
3608     }
3609     m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
3610 }
3611 
// Boost.Serialization entry point for saving: forward to save_impl().
template <typename T>
void taylor_adaptive_batch_impl<T>::save(boost::archive::binary_oarchive &ar, unsigned v) const
{
    save_impl(ar, v);
}
3617 
// Boost.Serialization entry point for loading: forward to load_impl().
template <typename T>
void taylor_adaptive_batch_impl<T>::load(boost::archive::binary_iarchive &ar, unsigned v)
{
    load_impl(ar, v);
}
3623 
3624 template <typename T>
set_time(const std::vector<T> & new_time)3625 void taylor_adaptive_batch_impl<T>::set_time(const std::vector<T> &new_time)
3626 {
3627     // Check the dimensionality of new_time.
3628     if (new_time.size() != m_batch_size) {
3629         throw std::invalid_argument(
3630             "Invalid number of new times specified in a Taylor integrator in batch mode: the batch size is {}, "
3631             "but the number of specified times is {}"_format(m_batch_size, new_time.size()));
3632     }
3633 
3634     // Copy over the new times.
3635     // NOTE: do not use std::copy(), as it gives UB if new_time
3636     // and m_time_hi are the same object.
3637     for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3638         m_time_hi[i] = new_time[i];
3639     }
3640     // Reset the lo part.
3641     std::fill(m_time_lo.begin(), m_time_lo.end(), T(0));
3642 }
3643 
3644 // Implementation detail to make a single integration timestep.
3645 // The magnitude of the timestep is automatically deduced for each
3646 // state vector, but it will always be not greater than
3647 // the absolute value of the corresponding element in max_delta_ts.
3648 // For each state vector, the propagation
3649 // is done forward in time if max_delta_t >= 0, backwards in
3650 // time otherwise.
3651 //
3652 // The function will write to res a pair for each state
3653 // vector, containing a flag describing the outcome of the integration
3654 // and the integration timestep that was used.
3655 template <typename T>
step_impl(const std::vector<T> & max_delta_ts,bool wtc)3656 void taylor_adaptive_batch_impl<T>::step_impl(const std::vector<T> &max_delta_ts, bool wtc)
3657 {
3658     using std::abs;
3659     using std::isfinite;
3660 
3661     // LCOV_EXCL_START
3662     // Check preconditions.
3663     assert(max_delta_ts.size() == m_batch_size);
3664     assert(std::none_of(max_delta_ts.begin(), max_delta_ts.end(), [](const auto &x) {
3665         using std::isnan;
3666         return isnan(x);
3667     }));
3668 
3669     // Sanity check.
3670     assert(m_step_res.size() == m_batch_size);
3671     // LCOV_EXCL_STOP
3672 
3673     // Copy max_delta_ts to the tmp buffer, twice.
3674     std::copy(max_delta_ts.begin(), max_delta_ts.end(), m_delta_ts.data());
3675     std::copy(max_delta_ts.begin(), max_delta_ts.end(), m_delta_ts.data() + m_batch_size);
3676 
3677     // Helper to check if the state vector of a batch element
3678     // contains a non-finite value.
3679     auto check_nf_batch = [this](std::uint32_t batch_idx) {
3680         for (std::uint32_t i = 0; i < m_dim; ++i) {
3681             if (!isfinite(m_state[i * m_batch_size + batch_idx])) {
3682                 return true;
3683             }
3684         }
3685         return false;
3686     };
3687 
3688     if (m_step_f.index() == 0u) {
3689         assert(!m_ed_data); // LCOV_EXCL_LINE
3690 
3691         std::get<0>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(),
3692                               wtc ? m_tc.data() : nullptr);
3693 
3694         // Update the times and the last timesteps, and write out the result.
3695         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3696             // The timestep that was actually used for
3697             // this batch element.
3698             const auto h = m_delta_ts[i];
3699 
3700             // Compute the new time in double-length arithmetic.
3701             const auto new_time = dfloat<T>(m_time_hi[i], m_time_lo[i]) + h;
3702             m_time_hi[i] = new_time.hi;
3703             m_time_lo[i] = new_time.lo;
3704 
3705             // Update the size of the last timestep.
3706             m_last_h[i] = h;
3707 
3708             if (!isfinite(new_time) || check_nf_batch(i)) {
3709                 // Either the new time or state contain non-finite values,
3710                 // return an error condition.
3711                 m_step_res[i] = std::tuple{taylor_outcome::err_nf_state, h};
3712             } else {
3713                 m_step_res[i] = std::tuple{
3714                     // NOTE: use here the original value of
3715                     // max_delta_ts, stored at the end of m_delta_ts,
3716                     // in case max_delta_ts aliases integrator data
3717                     // which was modified during the step.
3718                     h == m_delta_ts[m_batch_size + i] ? taylor_outcome::time_limit : taylor_outcome::success, h};
3719             }
3720         }
3721     } else {
3722         assert(m_ed_data); // LCOV_EXCL_LINE
3723 
3724         auto &ed_data = *m_ed_data;
3725 
3726         // Invoke the stepper for event handling. We will record the norm infinity of the state vector +
3727         // event equations at the beginning of the timestep for later use.
3728         std::get<1>(m_step_f)(ed_data.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(),
3729                               m_delta_ts.data(), ed_data.m_max_abs_state.data());
3730 
3731         // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for
3732         // automatic cooldown deduction. If max_abs_state is not finite, set it to inf so that
3733         // in ed_data.detect_events() we skip event detection altogether.
3734         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3735             const auto max_abs_state = ed_data.m_max_abs_state[i];
3736 
3737             if (isfinite(max_abs_state)) {
3738                 // Are we in absolute or relative error control mode?
3739                 const auto abs_or_rel = max_abs_state < 1;
3740 
3741                 // Estimate the size of the largest remainder in the Taylor
3742                 // series of both the dynamical equations and the events.
3743                 const auto max_r_size = abs_or_rel ? m_tol : (m_tol * max_abs_state);
3744 
3745                 // NOTE: depending on m_tol, max_r_size is arbitrarily small, but the real
3746                 // integration error cannot be too small due to floating-point truncation.
3747                 // This is the case for instance if we use sub-epsilon integration tolerances
3748                 // to achieve Brouwer's law. In such a case, we cap the value of g_eps,
3749                 // using eps * max_abs_state as an estimation of the smallest number
3750                 // that can be resolved with the current floating-point type.
3751                 // NOTE: the if condition in the next line is equivalent, in relative
3752                 // error control mode, to:
3753                 // if (m_tol < std::numeric_limits<T>::epsilon())
3754                 if (max_r_size < std::numeric_limits<T>::epsilon() * max_abs_state) {
3755                     ed_data.m_g_eps[i] = std::numeric_limits<T>::epsilon() * max_abs_state;
3756                 } else {
3757                     ed_data.m_g_eps[i] = max_r_size;
3758                 }
3759             } else {
3760                 ed_data.m_g_eps[i] = std::numeric_limits<T>::infinity();
3761             }
3762         }
3763 
3764         // Write unconditionally the tcs.
3765         std::copy(ed_data.m_ev_jet.data(), ed_data.m_ev_jet.data() + m_dim * (m_order + 1u) * m_batch_size,
3766                   m_tc.data());
3767 
3768         // Do the event detection.
3769         ed_data.detect_events(m_delta_ts.data(), m_order, m_dim, m_batch_size);
3770 
3771         // NOTE: before this point, we did not alter
3772         // any user-visible data in the integrator (just
3773         // temporary memory). From here until we start invoking
3774         // the callbacks, everything is noexcept, so we don't
3775         // risk leaving the integrator in a half-baked state.
3776 
3777         // Sort the events by time.
3778         // NOTE: the time coordinates in m_d_(n)tes is relative
3779         // to the beginning of the timestep. It will be negative
3780         // for backward integration, thus we compare using
3781         // abs() so that the first events are those which
3782         // happen closer to the beginning of the timestep.
3783         // NOTE: the checks inside ed_data.detect_events() ensure
3784         // that we can safely sort the events' times.
3785         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3786             auto cmp = [](const auto &ev0, const auto &ev1) { return abs(std::get<1>(ev0)) < abs(std::get<1>(ev1)); };
3787             std::sort(ed_data.m_d_tes[i].begin(), ed_data.m_d_tes[i].end(), cmp);
3788             std::sort(ed_data.m_d_ntes[i].begin(), ed_data.m_d_ntes[i].end(), cmp);
3789 
3790             // If we have terminal events we need
3791             // to update the value of h.
3792             if (!ed_data.m_d_tes[i].empty()) {
3793                 m_delta_ts[i] = std::get<1>(ed_data.m_d_tes[i][0]);
3794             }
3795         }
3796 
3797         // Update the state.
3798         m_d_out_f(m_state.data(), ed_data.m_ev_jet.data(), m_delta_ts.data());
3799 
3800         // We will use this to capture the first exception thrown
3801         // by a callback, if any.
3802         std::exception_ptr eptr;
3803 
3804         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3805             const auto h = m_delta_ts[i];
3806 
3807             // Compute the new time in double-length arithmetic.
3808             const auto new_time = dfloat<T>(m_time_hi[i], m_time_lo[i]) + h;
3809             m_time_hi[i] = new_time.hi;
3810             m_time_lo[i] = new_time.lo;
3811 
3812             // Store the last timestep.
3813             m_last_h[i] = h;
3814 
3815             // Check if the time or the state vector are non-finite at the
3816             // end of the timestep.
3817             if (!isfinite(new_time) || check_nf_batch(i)) {
3818                 // Let's also reset the cooldown values for this batch index,
3819                 // as at this point they have become useless.
3820                 reset_cooldowns(i);
3821 
3822                 m_step_res[i] = std::tuple{taylor_outcome::err_nf_state, h};
3823 
3824                 // Move to the next batch element.
3825                 continue;
3826             }
3827 
3828             // Update the cooldowns.
3829             for (auto &cd : ed_data.m_te_cooldowns[i]) {
3830                 if (cd) {
3831                     // Check if the timestep we just took
3832                     // brought this event outside the cooldown.
3833                     auto tmp = cd->first + h;
3834 
3835                     if (abs(tmp) >= cd->second) {
3836                         // We are now outside the cooldown period
3837                         // for this event, reset cd.
3838                         cd.reset();
3839                     } else {
3840                         // Still in cooldown, update the
3841                         // time spent in cooldown.
3842                         cd->first = tmp;
3843                     }
3844                 }
3845             }
3846 
3847             // If we don't have terminal events, we will invoke the callbacks
3848             // of *all* the non-terminal events. Otherwise, we need to figure
3849             // out which non-terminal events do not happen because their time
3850             // coordinate is past the the first terminal event.
3851             const auto ntes_end_it
3852                 = ed_data.m_d_tes[i].empty()
3853                       ? ed_data.m_d_ntes[i].end()
3854                       : std::lower_bound(ed_data.m_d_ntes[i].begin(), ed_data.m_d_ntes[i].end(), h,
3855                                          [](const auto &ev, const auto &t) { return abs(std::get<1>(ev)) < abs(t); });
3856 
3857             // Invoke the callbacks of the non-terminal events, which are guaranteed
3858             // to happen before the first terminal event.
3859             for (auto it = ed_data.m_d_ntes[i].begin(); it != ntes_end_it; ++it) {
3860                 const auto &t = *it;
3861                 const auto &cb = ed_data.m_ntes[std::get<0>(t)].get_callback();
3862                 assert(cb); // LCOV_EXCL_LINE
3863                 try {
3864                     cb(*this, static_cast<T>(new_time - m_last_h[i] + std::get<1>(t)), std::get<2>(t), i);
3865                 } catch (...) {
3866                     if (!eptr) {
3867                         eptr = std::current_exception();
3868                     }
3869                 }
3870             }
3871 
3872             // The return value of the first
3873             // terminal event's callback. It will be
3874             // unused if there are no terminal events.
3875             bool te_cb_ret = false;
3876 
3877             if (!ed_data.m_d_tes[i].empty()) {
3878                 // Fetch the first terminal event.
3879                 const auto te_idx = std::get<0>(ed_data.m_d_tes[i][0]);
3880                 assert(te_idx < ed_data.m_tes.size()); // LCOV_EXCL_LINE
3881                 const auto &te = ed_data.m_tes[te_idx];
3882 
3883                 // Set the corresponding cooldown.
3884                 if (te.get_cooldown() >= 0) {
3885                     // Cooldown explicitly provided by the user, use it.
3886                     ed_data.m_te_cooldowns[i][te_idx].emplace(0, te.get_cooldown());
3887                 } else {
3888                     // Deduce the cooldown automatically.
3889                     // NOTE: if m_g_eps[i] is not finite, we skipped event detection
3890                     // altogether and thus we never end up here. If the derivative
3891                     // of the event equation is not finite, the event is also skipped.
3892                     ed_data.m_te_cooldowns[i][te_idx].emplace(
3893                         0, taylor_deduce_cooldown(ed_data.m_g_eps[i], std::get<4>(ed_data.m_d_tes[i][0])));
3894                 }
3895 
3896                 // Invoke the callback of the first terminal event, if it has one.
3897                 if (te.get_callback()) {
3898                     try {
3899                         te_cb_ret = te.get_callback()(*this, std::get<2>(ed_data.m_d_tes[i][0]),
3900                                                       std::get<3>(ed_data.m_d_tes[i][0]), i);
3901                     } catch (...) {
3902                         if (!eptr) {
3903                             eptr = std::current_exception();
3904                         }
3905                     }
3906                 }
3907             }
3908 
3909             if (ed_data.m_d_tes[i].empty()) {
3910                 // No terminal events detected, return success or time limit.
3911                 m_step_res[i] = std::tuple{
3912                     // NOTE: use here the original value of
3913                     // max_delta_ts, stored at the end of m_delta_ts,
3914                     // in case max_delta_ts aliases integrator data
3915                     // which was modified during the step.
3916                     h == m_delta_ts[m_batch_size + i] ? taylor_outcome::time_limit : taylor_outcome::success, h};
3917             } else {
3918                 // Terminal event detected. Fetch its index.
3919                 const auto ev_idx = static_cast<std::int64_t>(std::get<0>(ed_data.m_d_tes[i][0]));
3920 
3921                 // NOTE: if te_cb_ret is true, it means that the terminal event has
3922                 // a callback and its invocation returned true (meaning that the
3923                 // integration should continue). Otherwise, either the terminal event
3924                 // has no callback or its callback returned false, meaning that the
3925                 // integration must stop.
3926                 m_step_res[i] = std::tuple{taylor_outcome{te_cb_ret ? ev_idx : (-ev_idx - 1)}, h};
3927             }
3928         }
3929 
3930         // Check if any callback threw an exception, and re-throw
3931         // it in case.
3932         if (eptr) {
3933             std::rethrow_exception(eptr);
3934         }
3935     }
3936 }
3937 
// Perform a single adaptive timestep in the forward (positive time)
// direction for all batch elements. If wtc is true, the Taylor
// coefficients will be written to the internal storage. The outcome
// of the step for each batch element is recorded in m_step_res.
template <typename T>
void taylor_adaptive_batch_impl<T>::step(bool wtc)
{
    // NOTE: delegate to step_impl() using m_pinf as the vector of max
    // timesteps (presumably +inf per batch element, i.e., no limit on
    // the timestep magnitude — cf. m_minf in step_backward()).
    step_impl(m_pinf, wtc);
}
3943 
// Perform a single adaptive timestep in the backward (negative time)
// direction for all batch elements. If wtc is true, the Taylor
// coefficients will be written to the internal storage. The outcome
// of the step for each batch element is recorded in m_step_res.
template <typename T>
void taylor_adaptive_batch_impl<T>::step_backward(bool wtc)
{
    // NOTE: delegate to step_impl() using m_minf as the vector of max
    // timesteps (presumably -inf per batch element, which selects the
    // backward integration direction with no limit on the magnitude).
    step_impl(m_minf, wtc);
}
3949 
3950 template <typename T>
step(const std::vector<T> & max_delta_ts,bool wtc)3951 void taylor_adaptive_batch_impl<T>::step(const std::vector<T> &max_delta_ts, bool wtc)
3952 {
3953     // Check the dimensionality of max_delta_ts.
3954     if (max_delta_ts.size() != m_batch_size) {
3955         throw std::invalid_argument(
3956             "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is {}, "
3957             "but the number of specified timesteps is {}"_format(m_batch_size, max_delta_ts.size()));
3958     }
3959 
3960     // Make sure no values in max_delta_ts are nan.
3961     if (std::any_of(max_delta_ts.begin(), max_delta_ts.end(), [](const auto &x) {
3962             using std::isnan;
3963             return isnan(x);
3964         })) {
3965         throw std::invalid_argument(
3966             "Cannot invoke the step() function of an adaptive Taylor integrator in batch mode if "
3967             "one of the max timesteps is nan");
3968     }
3969 
3970     step_impl(max_delta_ts, wtc);
3971 }
3972 
3973 template <typename T>
propagate_for_impl(const std::vector<T> & delta_ts,std::size_t max_steps,const std::vector<T> & max_delta_ts,std::function<bool (taylor_adaptive_batch_impl &)> cb,bool wtc,bool with_c_out)3974 std::optional<continuous_output_batch<T>> taylor_adaptive_batch_impl<T>::propagate_for_impl(
3975     const std::vector<T> &delta_ts, std::size_t max_steps, const std::vector<T> &max_delta_ts,
3976     std::function<bool(taylor_adaptive_batch_impl &)> cb, bool wtc, bool with_c_out)
3977 {
3978     // Check the dimensionality of delta_ts.
3979     if (delta_ts.size() != m_batch_size) {
3980         throw std::invalid_argument("Invalid number of time intervals specified in a Taylor integrator in batch mode: "
3981                                     "the batch size is {}, but the number of specified time intervals is {}"_format(
3982                                         m_batch_size, delta_ts.size()));
3983     }
3984 
3985     for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3986         m_pfor_ts[i] = dfloat<T>(m_time_hi[i], m_time_lo[i]) + delta_ts[i];
3987     }
3988 
3989     // NOTE: max_delta_ts is checked in propagate_until_impl().
3990     return propagate_until_impl(m_pfor_ts, max_steps, max_delta_ts, std::move(cb), wtc, with_c_out);
3991 }
3992 
// Propagate the state of the system up to the epochs ts (one double-length
// time per batch element), taking adaptive timesteps whose magnitudes are
// clamped by the corresponding values in max_delta_ts. If cb is provided,
// it is invoked after each successful timestep: a false return value from
// cb stops the integration with outcome cb_stop. If wtc is true, the Taylor
// coefficients are written out at each step. If with_c_out is true, a
// continuous output object covering the integration is assembled and
// returned (an empty optional is returned if with_c_out is false, or if no
// successful steps were taken). A nonzero max_steps limits the number of
// timesteps. The outcome, min/max absolute timestep and step count for
// each batch element are stored in m_prop_res before returning.
template <typename T>
std::optional<continuous_output_batch<T>> taylor_adaptive_batch_impl<T>::propagate_until_impl(
    const std::vector<dfloat<T>> &ts, std::size_t max_steps, const std::vector<T> &max_delta_ts,
    std::function<bool(taylor_adaptive_batch_impl &)> cb, bool wtc, bool with_c_out)
{
    using std::abs;
    using std::isfinite;
    using std::isnan;

    // NOTE: this function is called from either the other propagate_until() overload,
    // or propagate_for(). In both cases, we have already set up correctly the dimension of ts.
    assert(ts.size() == m_batch_size); // LCOV_EXCL_LINE

    // Check the current times.
    if (std::any_of(m_time_hi.begin(), m_time_hi.end(), [](const auto &t) { return !isfinite(t); })
        || std::any_of(m_time_lo.begin(), m_time_lo.end(), [](const auto &t) { return !isfinite(t); })) {
        throw std::invalid_argument(
            "Cannot invoke the propagate_until() function of an adaptive Taylor integrator in batch mode if "
            "one of the current times is not finite");
    }

    // Check the final times.
    if (std::any_of(ts.begin(), ts.end(), [](const auto &t) { return !isfinite(t); })) {
        throw std::invalid_argument("A non-finite time was passed to the propagate_until() function of an adaptive "
                                    "Taylor integrator in batch mode");
    }

    // Check max_delta_ts: one per batch element, no nans, strictly positive.
    if (max_delta_ts.size() != m_batch_size) {
        throw std::invalid_argument(
            "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is {}, "
            "but the number of specified timesteps is {}"_format(m_batch_size, max_delta_ts.size()));
    }
    for (const auto &dt : max_delta_ts) {
        if (isnan(dt)) {
            throw std::invalid_argument("A nan max_delta_t was passed to the propagate_until() function of an adaptive "
                                        "Taylor integrator in batch mode");
        }
        if (dt <= 0) {
            throw std::invalid_argument("A non-positive max_delta_t was passed to the propagate_until() function of an "
                                        "adaptive Taylor integrator in batch mode");
        }
    }

    // If with_c_out is true, we always need to write the Taylor coefficients.
    wtc = wtc || with_c_out;

    // These vectors are used in the construction of the continuous output.
    // If continuous output is not requested, they will remain empty.
    std::vector<T> c_out_tcs, c_out_times_hi, c_out_times_lo;
    if (with_c_out) {
        // Push in the starting time.
        c_out_times_hi.insert(c_out_times_hi.end(), m_time_hi.begin(), m_time_hi.end());
        c_out_times_lo.insert(c_out_times_lo.end(), m_time_lo.begin(), m_time_lo.end());
    }

    // Reset the counters and the min/max abs(h) vectors.
    std::size_t iter_counter = 0;
    for (std::uint32_t i = 0; i < m_batch_size; ++i) {
        m_ts_count[i] = 0;
        m_min_abs_h[i] = std::numeric_limits<T>::infinity();
        m_max_abs_h[i] = 0;
    }

    // Compute the integration directions and init
    // the remaining times.
    for (std::uint32_t i = 0; i < m_batch_size; ++i) {
        m_rem_time[i] = ts[i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
        if (!isfinite(m_rem_time[i])) {
            throw std::invalid_argument("The final time passed to the propagate_until() function of an adaptive Taylor "
                                        "integrator in batch mode results in an overflow condition");
        }

        // True if we are integrating forward in time for this batch element.
        m_t_dir[i] = (m_rem_time[i] >= T(0));
    }

    // Helper to create the continuous output object.
    auto make_c_out = [&]() -> std::optional<continuous_output_batch<T>> {
        if (with_c_out) {
            if (c_out_times_hi.size() / m_batch_size < 2u) {
                // NOTE: this means that no successful steps
                // were taken.
                return {};
            }

            // Construct the return value.
            continuous_output_batch<T> ret(m_llvm.make_similar());

            // Fill in the data.
            ret.m_batch_size = m_batch_size;
            ret.m_tcs = std::move(c_out_tcs);
            ret.m_times_hi = std::move(c_out_times_hi);
            ret.m_times_lo = std::move(c_out_times_lo);

            // Add padding to the times vectors to make the
            // vectorised upper_bound implementation well defined.
            // NOTE: the padding value is +/- infinity depending on the
            // integration direction of each batch element.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                ret.m_times_hi.push_back(m_t_dir[i] ? std::numeric_limits<T>::infinity()
                                                    : -std::numeric_limits<T>::infinity());
                ret.m_times_lo.push_back(T(0));
            }

            // Prepare the output vector.
            ret.m_output.resize(boost::numeric_cast<decltype(ret.m_output.size())>(m_dim * m_batch_size));

            // Prepare the temp time vector.
            ret.m_tmp_tm.resize(boost::numeric_cast<decltype(ret.m_tmp_tm.size())>(m_batch_size));

            // Add the continuous output function.
            ret.add_c_out_function(m_order, m_dim, m_high_accuracy);

            return std::optional{std::move(ret)};
        } else {
            return {};
        }
    };

    // Helper to update the continuous output data after a timestep.
    auto update_c_out = [&]() {
        if (with_c_out) {
            // Check if any batch element produced an error during the
            // last timestep. The only error that can arise is a non-finite
            // time/state.
            if (std::any_of(m_step_res.begin(), m_step_res.end(),
                            [](const auto &tup) { return std::get<0>(tup) == taylor_outcome::err_nf_state; })) {
                // Don't update the continuous output.
                return;
            }

#if !defined(NDEBUG)
            // In debug mode, record the times before the update so that we
            // can verify monotonicity below.
            std::vector<dfloat<T>> prev_times;
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                prev_times.emplace_back(c_out_times_hi[c_out_times_hi.size() - m_batch_size + i],
                                        c_out_times_lo[c_out_times_lo.size() - m_batch_size + i]);
            }
#endif

            // Append the current times.
            c_out_times_hi.insert(c_out_times_hi.end(), m_time_hi.begin(), m_time_hi.end());
            c_out_times_lo.insert(c_out_times_lo.end(), m_time_lo.begin(), m_time_lo.end());

#if !defined(NDEBUG)
            // Verify that the new times are finite and that time moved in
            // the expected direction for each batch element.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                const dfloat<T> new_time(c_out_times_hi[c_out_times_hi.size() - m_batch_size + i],
                                         c_out_times_lo[c_out_times_lo.size() - m_batch_size + i]);
                assert(isfinite(new_time));
                if (m_t_dir[i]) {
                    assert(!(new_time < prev_times[i]));
                } else {
                    assert(!(new_time > prev_times[i]));
                }
            }
#endif

            // Append the Taylor coefficients of the last step.
            c_out_tcs.insert(c_out_tcs.end(), m_tc.begin(), m_tc.end());
        }
    };

    // Main integration loop.
    while (true) {
        // Compute the max integration times for this timestep.
        // NOTE: m_rem_time[i] is guaranteed to be finite: we check it explicitly above
        // and we keep on decreasing its magnitude at the following iterations.
        // If some non-finite state/time is generated in
        // the step function, the integration will be stopped.
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            assert((m_rem_time[i] >= T(0)) == m_t_dir[i] || m_rem_time[i] == T(0)); // LCOV_EXCL_LINE

            // Compute the time limit.
            const auto dt_limit = m_t_dir[i] ? std::min(dfloat<T>(max_delta_ts[i]), m_rem_time[i])
                                             : std::max(dfloat<T>(-max_delta_ts[i]), m_rem_time[i]);

            // Store it.
            m_cur_max_delta_ts[i] = static_cast<T>(dt_limit);
        }

        // Run the integration timestep.
        // NOTE: if dt_limit is zero, step_impl() will always return time_limit.
        step_impl(m_cur_max_delta_ts, wtc);

        // Update the continuous output.
        // NOTE: this function will avoid updating the c_out if any non-finite
        // state is detected in the step that was just taken.
        update_c_out();

        // Check if the integration timestep produced an error condition or we reached
        // a stopping terminal event.
        if (std::any_of(m_step_res.begin(), m_step_res.end(), [](const auto &tup) {
                const auto oc = std::get<0>(tup);
                return oc != taylor_outcome::success && oc != taylor_outcome::time_limit && oc < taylor_outcome{0};
            })) {
            // Setup m_prop_res before exiting.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                const auto [oc, h] = m_step_res[i];
                if (oc > taylor_outcome::success) {
                    // In case of a stopping terminal event, we still want to update
                    // the step counter.
                    m_ts_count[i] += static_cast<std::size_t>(h != 0);
                }

                m_prop_res[i] = std::tuple{oc, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Update the iteration counter.
        ++iter_counter;

        // Update the local step counters and min_h/max_h.
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            const auto [res, h] = m_step_res[i];

            // NOTE: the local step counters increase only if we integrated
            // for a nonzero time.
            m_ts_count[i] += static_cast<std::size_t>(h != 0);

            // Update min_h/max_h only if the outcome is success (otherwise
            // the step was artificially clamped either by a time limit or
            // by a continuing terminal event).
            if (res == taylor_outcome::success) {
                const auto abs_h = abs(h);
                m_min_abs_h[i] = std::min(m_min_abs_h[i], abs_h);
                m_max_abs_h[i] = std::max(m_max_abs_h[i], abs_h);
            }
        }

        // The step was successful, execute the callback.
        if (cb && !cb(*this)) {
            // Setup m_prop_res before exiting. We set all outcomes
            // to cb_stop regardless of the timestep outcome.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                m_prop_res[i] = std::tuple{taylor_outcome::cb_stop, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Break out if we have reached the final time
        // for all batch elements.
        bool all_done = true;
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            // NOTE: here we check h == rem_time, instead of just
            // res == time_limit, because clamping via max_delta_t
            // could also result in time_limit.
            if (std::get<1>(m_step_res[i]) != static_cast<T>(m_rem_time[i])) {
                all_done = false;
                break;
            }
        }
        if (all_done) {
            // Setup m_prop_res before exiting. The outcomes will all be time_limit.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                m_prop_res[i] = std::tuple{taylor_outcome::time_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Check the iteration limit.
        // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
        // then this condition will never trigger as by this point we are
        // sure iter_counter is at least 1.
        if (iter_counter == max_steps) {
            // We reached the max_steps limit: set the outcome for all batch elements
            // to step_limit.
            // NOTE: this is the same logic adopted when the integration is stopped
            // by the callback (see above).
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                m_prop_res[i] = std::tuple{taylor_outcome::step_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Update the remaining times.
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            const auto [res, h] = m_step_res[i];

            // NOTE: if static_cast<T>(m_rem_time[i]) was used as a timestep,
            // it means that we hit the time limit. Force rem_time to zero
            // to signal this, so that zero-length steps will be taken
            // for all remaining iterations
            // NOTE: if m_rem_time[i] was previously set to zero, it
            // will end up being repeatedly set to zero here. This
            // should be harmless.
            if (h == static_cast<T>(m_rem_time[i])) {
                assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
                m_rem_time[i] = dfloat<T>(T(0));
            } else {
                m_rem_time[i] = ts[i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
            }
        }
    }

    // LCOV_EXCL_START
    assert(false);

    return {};
    // LCOV_EXCL_STOP
}
4292 
4293 template <typename T>
propagate_until_impl(const std::vector<T> & ts,std::size_t max_steps,const std::vector<T> & max_delta_ts,std::function<bool (taylor_adaptive_batch_impl &)> cb,bool wtc,bool with_c_out)4294 std::optional<continuous_output_batch<T>> taylor_adaptive_batch_impl<T>::propagate_until_impl(
4295     const std::vector<T> &ts, std::size_t max_steps, const std::vector<T> &max_delta_ts,
4296     std::function<bool(taylor_adaptive_batch_impl &)> cb, bool wtc, bool with_c_out)
4297 {
4298     // Check the dimensionality of ts.
4299     if (ts.size() != m_batch_size) {
4300         throw std::invalid_argument(
4301             "Invalid number of time limits specified in a Taylor integrator in batch mode: the "
4302             "batch size is {}, but the number of specified time limits is {}"_format(m_batch_size, ts.size()));
4303     }
4304 
4305     // NOTE: re-use m_pfor_ts as tmp storage.
4306     assert(m_pfor_ts.size() == m_batch_size); // LCOV_EXCL_LINE
4307     for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4308         m_pfor_ts[i] = dfloat<T>(ts[i]);
4309     }
4310 
4311     // NOTE: max_delta_ts is checked in the other propagate_until_impl() overload.
4312     return propagate_until_impl(m_pfor_ts, max_steps, max_delta_ts, std::move(cb), wtc, with_c_out);
4313 }
4314 
4315 template <typename T>
propagate_grid_impl(const std::vector<T> & grid,std::size_t max_steps,const std::vector<T> & max_delta_ts,std::function<bool (taylor_adaptive_batch_impl &)> cb)4316 std::vector<T> taylor_adaptive_batch_impl<T>::propagate_grid_impl(const std::vector<T> &grid, std::size_t max_steps,
4317                                                                   const std::vector<T> &max_delta_ts,
4318                                                                   std::function<bool(taylor_adaptive_batch_impl &)> cb)
4319 {
4320     using std::abs;
4321     using std::isnan;
4322 
4323     // Helper to detect if an input value is nonfinite.
4324     auto is_nf = [](const T &t) {
4325         using std::isfinite;
4326         return !isfinite(t);
4327     };
4328 
4329     if (grid.empty()) {
4330         throw std::invalid_argument(
4331             "Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if the time grid is empty");
4332     }
4333 
4334     // Check that the grid size is a multiple of m_batch_size.
4335     if (grid.size() % m_batch_size != 0u) {
4336         throw std::invalid_argument(
4337             "Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
4338             "the grid has a size of {}, which is not a multiple of the batch size ({})"_format(grid.size(),
4339                                                                                                m_batch_size));
4340     }
4341 
4342     // Check the current time coordinates.
4343     if (std::any_of(m_time_hi.begin(), m_time_hi.end(), is_nf)
4344         || std::any_of(m_time_lo.begin(), m_time_lo.end(), is_nf)) {
4345         throw std::invalid_argument("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if "
4346                                     "the current time is not finite");
4347     }
4348 
4349     // Check max_delta_ts.
4350     if (max_delta_ts.size() != m_batch_size) {
4351         throw std::invalid_argument(
4352             "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is {}, "
4353             "but the number of specified timesteps is {}"_format(m_batch_size, max_delta_ts.size()));
4354     }
4355     for (const auto &dt : max_delta_ts) {
4356         if (isnan(dt)) {
4357             throw std::invalid_argument("A nan max_delta_t was passed to the propagate_grid() function of an adaptive "
4358                                         "Taylor integrator in batch mode");
4359         }
4360         if (dt <= 0) {
4361             throw std::invalid_argument("A non-positive max_delta_t was passed to the propagate_grid() function of an "
4362                                         "adaptive Taylor integrator in batch mode");
4363         }
4364     }
4365 
4366     // The number of grid points.
4367     const auto n_grid_points = grid.size() / m_batch_size;
4368 
4369     // Pointer to the grid data.
4370     const auto *const grid_ptr = grid.data();
4371 
4372     // Check the input grid points.
4373     constexpr auto nf_err_msg
4374         = "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode";
4375     constexpr auto ig_err_msg = "A non-monotonic time grid was passed to propagate_grid() in an adaptive "
4376                                 "Taylor integrator in batch mode";
4377 
4378     // Check the first point.
4379     if (std::any_of(grid_ptr, grid_ptr + m_batch_size, is_nf)) {
4380         throw std::invalid_argument(nf_err_msg);
4381     }
4382     if (n_grid_points > 1u) {
4383         // Establish the direction of the grid from
4384         // the first two batches of points.
4385         if (std::any_of(grid_ptr + m_batch_size, grid_ptr + m_batch_size + m_batch_size, is_nf)) {
4386             throw std::invalid_argument(nf_err_msg);
4387         }
4388         if (grid_ptr[m_batch_size] == grid_ptr[0]) {
4389             throw std::invalid_argument(ig_err_msg);
4390         }
4391 
4392         const auto grid_direction = grid_ptr[m_batch_size] > grid_ptr[0];
4393         for (std::uint32_t i = 1; i < m_batch_size; ++i) {
4394             if ((grid_ptr[m_batch_size + i] > grid_ptr[i]) != grid_direction) {
4395                 throw std::invalid_argument(ig_err_msg);
4396             }
4397         }
4398 
4399         // Check that the remaining points are finite and that
4400         // they are ordered monotonically.
4401         for (decltype(grid.size()) i = 2; i < n_grid_points; ++i) {
4402             if (std::any_of(grid_ptr + i * m_batch_size, grid_ptr + (i + 1u) * m_batch_size, is_nf)) {
4403                 throw std::invalid_argument(nf_err_msg);
4404             }
4405 
4406             if (std::any_of(
4407                     grid_ptr + i * m_batch_size, grid_ptr + (i + 1u) * m_batch_size,
4408                     [this, grid_direction](const T &t) { return (t > *(&t - m_batch_size)) != grid_direction; })) {
4409                 throw std::invalid_argument(ig_err_msg);
4410             }
4411         }
4412     }
4413 
4414     // Pre-allocate the return value.
4415     std::vector<T> retval;
4416     // LCOV_EXCL_START
4417     if (get_dim() > std::numeric_limits<decltype(retval.size())>::max() / grid.size()) {
4418         throw std::overflow_error("Overflow detected in the creation of the return value of propagate_grid() in an "
4419                                   "adaptive Taylor integrator in batch mode");
4420     }
4421     // LCOV_EXCL_STOP
4422     // NOTE: fill with NaNs, so that the missing entries
4423     // are signalled with NaN if we exit early.
4424     retval.resize(grid.size() * get_dim(), std::numeric_limits<T>::quiet_NaN());
4425 
4426     // Propagate the system up to the first batch of grid points.
4427     // NOTE: this will *not* write the TCs, but because we know that
4428     // the grid is strictly monotonic, we are sure we will take at least
4429     // one TC-writing timestep below before trying to use the dense output.
4430     std::vector<T> pgrid_tmp;
4431     pgrid_tmp.resize(boost::numeric_cast<decltype(pgrid_tmp.size())>(m_batch_size));
4432     std::copy(grid_ptr, grid_ptr + m_batch_size, pgrid_tmp.begin());
4433     // NOTE: use the same max_steps for the initial propagation,
4434     // and don't pass the callback.
4435     propagate_until(pgrid_tmp, kw::max_delta_t = max_delta_ts, kw::max_steps = max_steps);
4436 
4437     // Check the result of the integration.
4438     if (std::any_of(m_prop_res.begin(), m_prop_res.end(), [](const auto &t) {
4439             // Check if the outcome is not time_limit and it is not a continuing
4440             // terminal event. This means that a non-finite state was
4441             // encountered, or a stopping terminal event triggered, or the step
4442             // limit was hit.
4443             const auto oc = std::get<0>(t);
4444             return oc != taylor_outcome::time_limit && oc < taylor_outcome{0};
4445         })) {
4446         // NOTE: for consistency with the scalar implementation,
4447         // keep the outcomes from propagate_until() but we reset
4448         // min/max h and the step counter.
4449         for (auto &[_, min_h, max_h, ts_count] : m_prop_res) {
4450             min_h = std::numeric_limits<T>::infinity();
4451             max_h = 0;
4452             ts_count = 0;
4453         }
4454 
4455         return retval;
4456     }
4457 
4458     // Add the first result to retval.
4459     std::copy(m_state.begin(), m_state.end(), retval.begin());
4460 
4461     // Init the remaining times and directions.
4462     for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4463         m_rem_time[i] = grid_ptr[(n_grid_points - 1u) * m_batch_size + i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
4464 
4465         // Check it.
4466         if (!isfinite(m_rem_time[i])) {
4467             throw std::invalid_argument("The final time passed to the propagate_grid() function of an adaptive Taylor "
4468                                         "integrator in batch mode results in an overflow condition");
4469         }
4470 
4471         m_t_dir[i] = (m_rem_time[i] >= T(0));
4472     }
4473 
4474     // Reset the counters and the min/max abs(h) vectors.
4475     std::size_t iter_counter = 0;
4476     for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4477         m_ts_count[i] = 0;
4478         m_min_abs_h[i] = std::numeric_limits<T>::infinity();
4479         m_max_abs_h[i] = 0;
4480     }
4481 
4482     // NOTE: in general, an integration timestep will cover a different number
4483     // of grid points for each batch element. We thus need to track the grid
4484     // index separately for each batch element. We will start with index
4485     // 1 for all batch elements, since all batch elements have been propagated to
4486     // index 0 already.
4487     std::vector<decltype(grid.size())> cur_grid_idx(
4488         boost::numeric_cast<typename std::vector<decltype(grid.size())>::size_type>(m_batch_size), 1);
4489 
4490     // Vectors to keep track of the time range of the last taken timestep.
4491     std::vector<dfloat<T>> t0(boost::numeric_cast<typename std::vector<dfloat<T>>::size_type>(m_batch_size)), t1(t0);
4492 
4493     // Vector of flags to keep track of the batch elements
4494     // we can compute dense output for.
4495     std::vector<unsigned> dflags(boost::numeric_cast<std::vector<unsigned>::size_type>(m_batch_size));
4496 
4497     // NOTE: loop until we have processed all grid points
4498     // for all batch elements.
4499     auto cont_cond = [n_grid_points, &cur_grid_idx]() {
4500         return std::any_of(cur_grid_idx.begin(), cur_grid_idx.end(),
4501                            [n_grid_points](auto idx) { return idx < n_grid_points; });
4502     };
4503 
4504     while (cont_cond()) {
4505         // Establish the time ranges of the last
4506         // taken timestep.
4507         // NOTE: t0 < t1.
4508         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4509             const dfloat<T> cur_time(m_time_hi[i], m_time_lo[i]), cmp = cur_time - m_last_h[i];
4510 
4511             t0[i] = std::min(cur_time, cmp);
4512             t1[i] = std::max(cur_time, cmp);
4513         }
4514 
4515         // Reset dflags.
4516         std::fill(dflags.begin(), dflags.end(), 1u);
4517 
4518         // Compute the state of the system via dense output for as many grid
4519         // points as possible, i.e., as long as the grid times
4520         // fall within the validity range for the dense output of at least
4521         // one batch element.
4522         while (true) {
4523             // Establish and count for which batch elements we
4524             // can still compute dense output.
4525             std::uint32_t counter = 0;
4526             for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4527                 // Fetch the grid index for the current batch element.
4528                 const auto gidx = cur_grid_idx[i];
4529 
4530                 if (dflags[i] && gidx < n_grid_points) {
4531                     // The current batch element has not been eliminated
4532                     // yet from the candidate list and it still has grid
4533                     // points available. Determine if the current grid point
4534                     // falls within the validity domain for the dense output.
4535                     // NOTE: if we are at the last timestep for this batch
4536                     // element, force processing of all remaining grid points.
4537                     // We do this to avoid numerical issues when deciding if
4538                     // he last grid point falls within the range of validity
4539                     // of the dense output.
4540                     const auto idx = gidx * m_batch_size + i;
4541                     const auto d_avail
4542                         = (grid_ptr[idx] >= t0[i] && grid_ptr[idx] <= t1[i]) || (m_rem_time[i] == dfloat<T>(T(0)));
4543                     dflags[i] = d_avail;
4544                     counter += d_avail;
4545 
4546                     // Copy over the grid point to pgrid_tmp regardless
4547                     // of whether d_avail is true or false.
4548                     pgrid_tmp[i] = grid_ptr[idx];
4549                 } else {
4550                     // Either the batch element had already been eliminated
4551                     // previously, or there are no more grid points available.
4552                     // Make sure the batch element is marked as eliminated.
4553                     dflags[i] = 0;
4554                 }
4555             }
4556 
4557             if (counter == 0u) {
4558                 // Cannot use dense output on any of the batch elements,
4559                 // need to take another step.
4560                 break;
4561             }
4562 
4563             // Compute the dense output.
4564             update_d_output(pgrid_tmp);
4565 
4566             // Add the results to retval and bump up the values in cur_grid_idx.
4567             for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4568                 if (dflags[i] != 0u) {
4569                     const auto gidx = cur_grid_idx[i];
4570 
4571                     for (std::uint32_t j = 0; j < m_dim; ++j) {
4572                         retval[gidx * m_batch_size * m_dim + j * m_batch_size + i] = m_d_out[j * m_batch_size + i];
4573                     }
4574 
4575                     assert(cur_grid_idx[i] < n_grid_points); // LCOV_EXCL_LINE
4576                     ++cur_grid_idx[i];
4577                 }
4578             }
4579 
4580             // Check if we exhausted all grid points for all batch elements.
4581             if (!cont_cond()) {
4582                 break;
4583             }
4584         }
4585 
4586         // Check if we exhausted all grid points for all batch elements.
4587         if (!cont_cond()) {
4588             break;
4589         }
4590 
4591         // Take the next step, making sure to write the Taylor coefficients
4592         // and to cap the timestep size so that we don't go past the
4593         // last grid point and we don't use a timestep exceeding max_delta_t.
4594         // NOTE: m_rem_time is guaranteed to be finite: we check it explicitly above
4595         // and we keep on decreasing its magnitude at the following iterations.
4596         // If some non-finite state/time is generated in
4597         // the step function, the integration will be stopped.
4598         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4599             // Max delta_t for the current batch element.
4600             const auto max_delta_t = max_delta_ts[i];
4601 
4602             // Compute the step limit for the current batch element.
4603             assert((m_rem_time[i] >= T(0)) == m_t_dir[i] || m_rem_time[i] == T(0)); // LCOV_EXCL_LINE
4604             const auto dt_limit = m_t_dir[i] ? std::min(dfloat<T>(max_delta_t), m_rem_time[i])
4605                                              : std::max(dfloat<T>(-max_delta_t), m_rem_time[i]);
4606 
4607             pgrid_tmp[i] = static_cast<T>(dt_limit);
4608         }
4609         step_impl(pgrid_tmp, true);
4610 
4611         // Check the result of the integration.
4612         if (std::any_of(m_step_res.begin(), m_step_res.end(), [](const auto &t) {
4613                 // Something went wrong in the propagation of the timestep, or we reached
4614                 // a stopping terminal event.
4615                 const auto oc = std::get<0>(t);
4616                 return oc != taylor_outcome::success && oc != taylor_outcome::time_limit && oc < taylor_outcome{0};
4617             })) {
4618             // Setup m_prop_res before exiting.
4619             for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4620                 const auto [oc, h] = m_step_res[i];
4621                 if (oc > taylor_outcome::success) {
4622                     // In case of a stopping terminal event, we still want to update
4623                     // the step counter.
4624                     m_ts_count[i] += static_cast<std::size_t>(h != 0);
4625                 }
4626 
4627                 m_prop_res[i] = std::tuple{oc, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4628             }
4629 
4630             return retval;
4631         }
4632 
4633         // Update the number of iterations.
4634         ++iter_counter;
4635 
4636         // Update m_rem_time, the local step counters, min_h/max_h.
4637         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4638             const auto [res, h] = m_step_res[i];
4639 
4640             // NOTE: the local step counters increase only if we integrated
4641             // for a nonzero time.
4642             m_ts_count[i] += static_cast<std::size_t>(h != 0);
4643 
4644             // Update the remaining time.
4645             // NOTE: if static_cast<T>(m_rem_time[i]) was used as a timestep,
4646             // it means that we hit the time limit. Force rem_time to zero
4647             // to signal this, so that zero-length steps will be taken
4648             // for all remaining iterations, thus always triggering the
4649             // time_limit outcome. A zero m_rem_time[i]
4650             // will also force the processing of all remaining grid points.
4651             // NOTE: if m_rem_time[i] was previously set to zero, it
4652             // will end up being repeatedly set to zero here. This
4653             // should be harmless.
4654             if (h == static_cast<T>(m_rem_time[i])) {
4655                 assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
4656                 m_rem_time[i] = dfloat<T>(T(0));
4657             } else {
4658                 m_rem_time[i]
4659                     = grid_ptr[(n_grid_points - 1u) * m_batch_size + i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
4660             }
4661 
4662             // Update min_h/max_h, but only if the outcome is success (otherwise
4663             // the step was artificially clamped either by a time limit or
4664             // by a continuing terminal event).
4665             if (res == taylor_outcome::success) {
4666                 const auto abs_h = abs(h);
4667                 m_min_abs_h[i] = std::min(m_min_abs_h[i], abs_h);
4668                 m_max_abs_h[i] = std::max(m_max_abs_h[i], abs_h);
4669             }
4670         }
4671 
4672         // Step successful: invoke the callback, if needed.
4673         if (cb && !cb(*this)) {
4674             // Setup m_prop_res before exiting. We set all outcomes
4675             // to cb_stop regardless of the timestep outcome.
4676             for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4677                 m_prop_res[i] = std::tuple{taylor_outcome::cb_stop, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4678             }
4679 
4680             return retval;
4681         }
4682 
4683         // Check the iteration limit.
4684         // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
4685         // then this condition will never trigger as by this point we are
4686         // sure iter_counter is at least 1.
4687         if (iter_counter == max_steps) {
4688             // We reached the max_steps limit: set the outcome for all batch elements
4689             // to step_limit.
4690             // NOTE: this is the same logic adopted when the integration is stopped
4691             // by the callback (see above).
4692             for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4693                 m_prop_res[i] = std::tuple{taylor_outcome::step_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4694             }
4695 
4696             return retval;
4697         }
4698     }
4699 
4700     // Everything went fine, set all outcomes to time_limit.
4701     for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4702         m_prop_res[i] = std::tuple{taylor_outcome::time_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4703     }
4704 
4705     return retval;
4706 }
4707 
4708 template <typename T>
get_llvm_state() const4709 const llvm_state &taylor_adaptive_batch_impl<T>::get_llvm_state() const
4710 {
4711     return m_llvm;
4712 }
4713 
4714 template <typename T>
get_decomposition() const4715 const taylor_dc_t &taylor_adaptive_batch_impl<T>::get_decomposition() const
4716 {
4717     return m_dc;
4718 }
4719 
4720 template <typename T>
get_order() const4721 std::uint32_t taylor_adaptive_batch_impl<T>::get_order() const
4722 {
4723     return m_order;
4724 }
4725 
4726 template <typename T>
get_tol() const4727 T taylor_adaptive_batch_impl<T>::get_tol() const
4728 {
4729     return m_tol;
4730 }
4731 
4732 template <typename T>
get_high_accuracy() const4733 bool taylor_adaptive_batch_impl<T>::get_high_accuracy() const
4734 {
4735     return m_high_accuracy;
4736 }
4737 
4738 template <typename T>
get_compact_mode() const4739 bool taylor_adaptive_batch_impl<T>::get_compact_mode() const
4740 {
4741     return m_compact_mode;
4742 }
4743 
4744 template <typename T>
get_batch_size() const4745 std::uint32_t taylor_adaptive_batch_impl<T>::get_batch_size() const
4746 {
4747     return m_batch_size;
4748 }
4749 
4750 template <typename T>
get_dim() const4751 std::uint32_t taylor_adaptive_batch_impl<T>::get_dim() const
4752 {
4753     return m_dim;
4754 }
4755 
4756 template <typename T>
update_d_output(const std::vector<T> & time,bool rel_time)4757 const std::vector<T> &taylor_adaptive_batch_impl<T>::update_d_output(const std::vector<T> &time, bool rel_time)
4758 {
4759     // Check the dimensionality of time.
4760     if (time.size() != m_batch_size) {
4761         throw std::invalid_argument(
4762             "Invalid number of time coordinates specified for the dense output in a Taylor integrator in batch "
4763             "mode: the batch size is {}, but the number of time coordinates is {}"_format(m_batch_size, time.size()));
4764     }
4765 
4766     // NOTE: "time" needs to be translated
4767     // because m_d_out_f expects a time coordinate
4768     // with respect to the starting time t0 of
4769     // the *previous* timestep.
4770     if (rel_time) {
4771         // Time coordinate relative to the current time.
4772         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4773             m_d_out_time[i] = m_last_h[i] + time[i];
4774         }
4775     } else {
4776         // Absolute time coordinate.
4777         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4778             m_d_out_time[i] = static_cast<T>(time[i] - (dfloat<T>(m_time_hi[i], m_time_lo[i]) - m_last_h[i]));
4779         }
4780     }
4781 
4782     m_d_out_f(m_d_out.data(), m_tc.data(), m_d_out_time.data());
4783 
4784     return m_d_out;
4785 }
4786 
4787 template <typename T>
reset_cooldowns()4788 void taylor_adaptive_batch_impl<T>::reset_cooldowns()
4789 {
4790     for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4791         reset_cooldowns(i);
4792     }
4793 }
4794 
4795 template <typename T>
reset_cooldowns(std::uint32_t i)4796 void taylor_adaptive_batch_impl<T>::reset_cooldowns(std::uint32_t i)
4797 {
4798     if (!m_ed_data) {
4799         throw std::invalid_argument("No events were defined for this integrator");
4800     }
4801 
4802     if (i >= m_batch_size) {
4803         throw std::invalid_argument(
4804             "Cannot reset the cooldowns at batch index {}: the batch size for this integrator is only {}"_format(
4805                 i, m_batch_size));
4806     }
4807 
4808     for (auto &cd : m_ed_data->m_te_cooldowns[i]) {
4809         cd.reset();
4810     }
4811 }
4812 
// Explicit instantiation of the batch implementation classes.

// Double precision.
template class taylor_adaptive_batch_impl<double>;

// Constructor finalisation from a system of equations expressed as a vector of expressions.
template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<double>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<double>, std::uint32_t, std::vector<double>, double, bool, bool,
    std::vector<double>, std::vector<t_event_t>, std::vector<nt_event_t>);

// Constructor finalisation from a system of equations expressed as (variable, expression) pairs.
template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<double>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<double>, std::uint32_t, std::vector<double>,
    double, bool, bool, std::vector<double>, std::vector<t_event_t>, std::vector<nt_event_t>);

// Extended (long double) precision.
template class taylor_adaptive_batch_impl<long double>;

template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<long double>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<long double>, std::uint32_t, std::vector<long double>, long double,
    bool, bool, std::vector<long double>, std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<long double>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<long double>, std::uint32_t,
    std::vector<long double>, long double, bool, bool, std::vector<long double>, std::vector<t_event_t>,
    std::vector<nt_event_t>);

// Quadruple precision, available only if mp++ was built with quadmath support.
#if defined(HEYOKA_HAVE_REAL128)

template class taylor_adaptive_batch_impl<mppp::real128>;

template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<mppp::real128>, std::uint32_t, std::vector<mppp::real128>,
    mppp::real128, bool, bool, std::vector<mppp::real128>, std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<mppp::real128>, std::uint32_t,
    std::vector<mppp::real128>, mppp::real128, bool, bool, std::vector<mppp::real128>, std::vector<t_event_t>,
    std::vector<nt_event_t>);

#endif
4849 
4850 } // namespace detail
4851 
4852 namespace detail
4853 {
4854 
4855 namespace
4856 {
4857 
// NOTE: in compact mode, care must be taken when adding multiple jet functions to the same llvm state
// with the same floating-point type, batch size and number of u variables. The potential issue there
// is that when the first jet is added, the compact mode AD functions are created and then optimised.
// The optimisation pass might alter the functions in a way that makes them incompatible with subsequent
// uses in the second jet (e.g., an argument might be removed from the signature because it is a
// compile-time constant). A workaround to avoid issues is to set the optimisation level to zero
// in the state, add the 2 jets and then run a single optimisation pass.
// NOTE: document this eventually.
//
// Add to the llvm_state s a function with the given name for the computation of the jet of
// Taylor derivatives, up to the given order, of the system of equations sys, for the
// floating-point type T and the given batch size. high_accuracy and compact_mode select the
// corresponding code generation strategies. sv_funcs is a (possibly empty) list of extra
// functions of the state variables whose Taylor series will also be computed and written to
// the output. Returns the Taylor decomposition of sys. Throws std::invalid_argument if s is
// already compiled, if order or batch_size is zero or if the function cannot be created, and
// std::overflow_error if the output array cannot be indexed with 32-bit integers.
template <typename T, typename U>
auto taylor_add_jet_impl(llvm_state &s, const std::string &name, const U &sys, std::uint32_t order,
                         std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
                         const std::vector<expression> &sv_funcs)
{
    if (s.is_compiled()) {
        throw std::invalid_argument("A function for the computation of the jet of Taylor derivatives cannot be added "
                                    "to an llvm_state after compilation");
    }

    if (order == 0u) {
        throw std::invalid_argument("The order of a Taylor jet cannot be zero");
    }

    if (batch_size == 0u) {
        throw std::invalid_argument("The batch size of a Taylor jet cannot be zero");
    }

#if defined(HEYOKA_ARCH_PPC)
    if constexpr (std::is_same_v<T, long double>) {
        throw not_implemented_error("'long double' computations are not supported on PowerPC");
    }
#endif

    auto &builder = s.builder();

    // Record the number of equations/variables.
    const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());

    // Record the number of sv_funcs before consuming it.
    const auto n_sv_funcs = boost::numeric_cast<std::uint32_t>(sv_funcs.size());

    // Decompose the system of equations.
    // NOTE: don't use structured bindings due to the
    // usual issues with lambdas.
    const auto td_res = taylor_decompose(sys, sv_funcs);
    const auto &dc = td_res.first;
    const auto &sv_funcs_dc = td_res.second;

    assert(sv_funcs_dc.size() == n_sv_funcs); // LCOV_EXCL_LINE

    // Compute the number of u variables.
    assert(dc.size() > n_eq); // LCOV_EXCL_LINE
    const auto n_uvars = boost::numeric_cast<std::uint32_t>(dc.size() - n_eq);

    // Prepare the function prototype. The first argument is a float pointer to the in/out array,
    // the second argument a const float pointer to the pars, the third argument
    // a float pointer to the time. These arrays cannot overlap.
    auto *fp_t = to_llvm_type<T>(s.context());
    std::vector<llvm::Type *> fargs(3, llvm::PointerType::getUnqual(fp_t));
    // The function does not return anything.
    auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
    assert(ft != nullptr); // LCOV_EXCL_LINE
    // Now create the function.
    auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
    if (f == nullptr) {
        throw std::invalid_argument(
            "Unable to create a function for the computation of the jet of Taylor derivatives with name '{}'"_format(
                name));
    }

    // Set the names/attributes of the function arguments.
    auto *in_out = f->args().begin();
    in_out->setName("in_out");
    in_out->addAttr(llvm::Attribute::NoCapture);
    in_out->addAttr(llvm::Attribute::NoAlias);

    auto *par_ptr = in_out + 1;
    par_ptr->setName("par_ptr");
    par_ptr->addAttr(llvm::Attribute::NoCapture);
    par_ptr->addAttr(llvm::Attribute::NoAlias);
    par_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto *time_ptr = par_ptr + 1;
    time_ptr->setName("time_ptr");
    time_ptr->addAttr(llvm::Attribute::NoCapture);
    time_ptr->addAttr(llvm::Attribute::NoAlias);
    time_ptr->addAttr(llvm::Attribute::ReadOnly);

    // Create a new basic block to start insertion into.
    auto *bb = llvm::BasicBlock::Create(s.context(), "entry", f);
    assert(bb != nullptr); // LCOV_EXCL_LINE
    builder.SetInsertPoint(bb);

    // Compute the jet of derivatives.
    auto diff_variant = taylor_compute_jet<T>(s, in_out, par_ptr, time_ptr, dc, sv_funcs_dc, n_eq, n_uvars, order,
                                              batch_size, compact_mode, high_accuracy);

    // Write the derivatives to in_out.
    // NOTE: overflow checking. We need to be able to index into the jet array
    // (size (n_eq + n_sv_funcs) * (order + 1) * batch_size)
    // using uint32_t.
    // LCOV_EXCL_START
    if (order == std::numeric_limits<std::uint32_t>::max()
        || (order + 1u) > std::numeric_limits<std::uint32_t>::max() / batch_size
        || n_eq > std::numeric_limits<std::uint32_t>::max() - n_sv_funcs
        || n_eq + n_sv_funcs > std::numeric_limits<std::uint32_t>::max() / ((order + 1u) * batch_size)) {
        throw std::overflow_error("An overflow condition was detected while adding a Taylor jet");
    }
    // LCOV_EXCL_STOP

    if (compact_mode) {
        // In compact mode the derivatives are stored in a single LLVM array
        // indexed at runtime, so the copy to in_out is done with IR loops.
        auto diff_arr = std::get<llvm::Value *>(diff_variant);

        // Create a global read-only array containing the values in sv_funcs_dc, if any
        // (otherwise, svf_ptr will be null).
        auto svf_ptr = taylor_c_make_sv_funcs_arr(s, sv_funcs_dc);

        // Write the order 0 of the sv_funcs, if needed.
        // NOTE: the order-0 derivatives of the state variables are already
        // contained in in_out (they are the input values), so only the
        // sv_funcs entries need to be written here.
        if (svf_ptr != nullptr) {
            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_sv_funcs), [&](llvm::Value *arr_idx) {
                // Fetch the u var index from svf_ptr.
                assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
                auto cur_idx = builder.CreateLoad(builder.getInt32Ty(),
                                                  builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));

                // Load the derivative value from diff_arr.
                auto diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), cur_idx);

                // Compute the index in the output pointer.
                auto out_idx = builder.CreateMul(builder.CreateAdd(builder.getInt32(n_eq), arr_idx),
                                                 builder.getInt32(batch_size));

                // Store into in_out.
                assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                store_vector_to_memory(builder, builder.CreateInBoundsGEP(fp_t, in_out, out_idx), diff_val);
            });
        }

        // Write the other orders.
        llvm_loop_u32(
            s, builder.getInt32(1), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
            [&](llvm::Value *cur_order) {
                llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_idx) {
                    // Load the derivative value from diff_arr.
                    auto diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_idx);

                    // Compute the index in the output pointer.
                    auto out_idx = builder.CreateAdd(
                        builder.CreateMul(builder.getInt32((n_eq + n_sv_funcs) * batch_size), cur_order),
                        builder.CreateMul(cur_idx, builder.getInt32(batch_size)));

                    // Store into in_out.
                    assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                    store_vector_to_memory(builder, builder.CreateInBoundsGEP(fp_t, in_out, out_idx), diff_val);
                });

                if (svf_ptr != nullptr) {
                    llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_sv_funcs), [&](llvm::Value *arr_idx) {
                        // Fetch the u var index from svf_ptr.
                        assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
                        auto cur_idx = builder.CreateLoad(
                            builder.getInt32Ty(), builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));

                        // Load the derivative value from diff_arr.
                        auto diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_idx);

                        // Compute the index in the output pointer.
                        auto out_idx = builder.CreateAdd(
                            builder.CreateMul(builder.getInt32((n_eq + n_sv_funcs) * batch_size), cur_order),
                            builder.CreateMul(builder.CreateAdd(builder.getInt32(n_eq), arr_idx),
                                              builder.getInt32(batch_size)));

                        // Store into in_out.
                        assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                        store_vector_to_memory(builder, builder.CreateInBoundsGEP(fp_t, in_out, out_idx), diff_val);
                    });
                }
            });
    } else {
        // In non-compact mode the derivatives are available as individual
        // LLVM values, so the copy to in_out is unrolled at compile time.
        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);

        // Write the order 0 of the sv_funcs.
        for (std::uint32_t j = 0; j < n_sv_funcs; ++j) {
            // Index in the jet of derivatives.
            // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
            // state variables and sv_funcs (not all u vars), hence the indexing is
            // n_eq + j.
            const auto arr_idx = n_eq + j;
            assert(arr_idx < diff_arr.size()); // LCOV_EXCL_LINE
            const auto val = diff_arr[arr_idx];

            // Index in the output array.
            const auto out_idx = (n_eq + j) * batch_size;

            assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
            auto *out_ptr
                = builder.CreateInBoundsGEP(fp_t, in_out, builder.getInt32(static_cast<std::uint32_t>(out_idx)));
            store_vector_to_memory(builder, out_ptr, val);
        }

        for (decltype(diff_arr.size()) cur_order = 1; cur_order <= order; ++cur_order) {
            for (std::uint32_t j = 0; j < n_eq; ++j) {
                // Index in the jet of derivatives.
                // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
                // state variables and sv_funcs (not all u vars), hence the indexing is
                // cur_order * (n_eq + n_sv_funcs) + j.
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + j;
                assert(arr_idx < diff_arr.size()); // LCOV_EXCL_LINE
                const auto val = diff_arr[arr_idx];

                // Index in the output array.
                const auto out_idx = (n_eq + n_sv_funcs) * batch_size * cur_order + j * batch_size;

                assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                auto *out_ptr
                    = builder.CreateInBoundsGEP(fp_t, in_out, builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }

            // Write the sv_funcs derivatives for the current order.
            for (std::uint32_t j = 0; j < n_sv_funcs; ++j) {
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + n_eq + j;
                assert(arr_idx < diff_arr.size()); // LCOV_EXCL_LINE
                const auto val = diff_arr[arr_idx];

                const auto out_idx = (n_eq + n_sv_funcs) * batch_size * cur_order + (n_eq + j) * batch_size;

                assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                auto *out_ptr
                    = builder.CreateInBoundsGEP(fp_t, in_out, builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }
        }
    }

    // Finish off the function.
    builder.CreateRetVoid();

    // Verify it.
    s.verify_function(f);

    // Run the optimisation pass.
    s.optimise();

    return dc;
}
5102 
5103 } // namespace
5104 
5105 } // namespace detail
5106 
taylor_add_jet_dbl(llvm_state & s,const std::string & name,const std::vector<expression> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5107 taylor_dc_t taylor_add_jet_dbl(llvm_state &s, const std::string &name, const std::vector<expression> &sys,
5108                                std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5109                                const std::vector<expression> &sv_funcs)
5110 {
5111     return detail::taylor_add_jet_impl<double>(s, name, sys, order, batch_size, high_accuracy, compact_mode, sv_funcs);
5112 }
5113 
taylor_add_jet_ldbl(llvm_state & s,const std::string & name,const std::vector<expression> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5114 taylor_dc_t taylor_add_jet_ldbl(llvm_state &s, const std::string &name, const std::vector<expression> &sys,
5115                                 std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5116                                 const std::vector<expression> &sv_funcs)
5117 {
5118     return detail::taylor_add_jet_impl<long double>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5119                                                     sv_funcs);
5120 }
5121 
5122 #if defined(HEYOKA_HAVE_REAL128)
5123 
taylor_add_jet_f128(llvm_state & s,const std::string & name,const std::vector<expression> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5124 taylor_dc_t taylor_add_jet_f128(llvm_state &s, const std::string &name, const std::vector<expression> &sys,
5125                                 std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5126                                 const std::vector<expression> &sv_funcs)
5127 {
5128     return detail::taylor_add_jet_impl<mppp::real128>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5129                                                       sv_funcs);
5130 }
5131 
5132 #endif
5133 
taylor_add_jet_dbl(llvm_state & s,const std::string & name,const std::vector<std::pair<expression,expression>> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5134 taylor_dc_t taylor_add_jet_dbl(llvm_state &s, const std::string &name,
5135                                const std::vector<std::pair<expression, expression>> &sys, std::uint32_t order,
5136                                std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5137                                const std::vector<expression> &sv_funcs)
5138 {
5139     return detail::taylor_add_jet_impl<double>(s, name, sys, order, batch_size, high_accuracy, compact_mode, sv_funcs);
5140 }
5141 
taylor_add_jet_ldbl(llvm_state & s,const std::string & name,const std::vector<std::pair<expression,expression>> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5142 taylor_dc_t taylor_add_jet_ldbl(llvm_state &s, const std::string &name,
5143                                 const std::vector<std::pair<expression, expression>> &sys, std::uint32_t order,
5144                                 std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5145                                 const std::vector<expression> &sv_funcs)
5146 {
5147     return detail::taylor_add_jet_impl<long double>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5148                                                     sv_funcs);
5149 }
5150 
5151 #if defined(HEYOKA_HAVE_REAL128)
5152 
taylor_add_jet_f128(llvm_state & s,const std::string & name,const std::vector<std::pair<expression,expression>> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5153 taylor_dc_t taylor_add_jet_f128(llvm_state &s, const std::string &name,
5154                                 const std::vector<std::pair<expression, expression>> &sys, std::uint32_t order,
5155                                 std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5156                                 const std::vector<expression> &sv_funcs)
5157 {
5158     return detail::taylor_add_jet_impl<mppp::real128>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5159                                                       sv_funcs);
5160 }
5161 
5162 #endif
5163 
5164 } // namespace heyoka
5165