1 // Copyright 2020, 2021 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
2 //
3 // This file is part of the heyoka library.
4 //
5 // This Source Code Form is subject to the terms of the Mozilla
6 // Public License v. 2.0. If a copy of the MPL was not distributed
7 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
9 #include <heyoka/config.hpp>
10
11 #include <algorithm>
12 #include <array>
13 #include <cassert>
14 #include <chrono> // NOTE: needed for the spdlog stopwatch.
15 #include <cmath>
16 #include <cstddef>
17 #include <cstdint>
18 #include <exception>
19 #include <functional>
20 #include <limits>
21 #include <map>
22 #include <optional>
23 #include <stdexcept>
24 #include <string>
25 #include <tuple>
26 #include <type_traits>
27 #include <typeinfo>
28 #include <unordered_map>
29 #include <utility>
30 #include <variant>
31 #include <vector>
32
33 #include <boost/numeric/conversion/cast.hpp>
34
35 #include <fmt/format.h>
36 #include <fmt/ostream.h>
37
38 #include <spdlog/stopwatch.h>
39
40 #include <llvm/IR/Attributes.h>
41 #include <llvm/IR/BasicBlock.h>
42 #include <llvm/IR/Constant.h>
43 #include <llvm/IR/Constants.h>
44 #include <llvm/IR/DerivedTypes.h>
45 #include <llvm/IR/Function.h>
46 #include <llvm/IR/GlobalVariable.h>
47 #include <llvm/IR/IRBuilder.h>
48 #include <llvm/IR/LLVMContext.h>
49 #include <llvm/IR/Type.h>
50 #include <llvm/IR/Value.h>
51 #include <llvm/Support/Casting.h>
52
53 #if defined(HEYOKA_HAVE_REAL128)
54
55 #include <mp++/real128.hpp>
56
57 #endif
58
59 #include <heyoka/detail/event_detection.hpp>
60 #include <heyoka/detail/llvm_fwd.hpp>
61 #include <heyoka/detail/llvm_helpers.hpp>
62 #include <heyoka/detail/llvm_vector_type.hpp>
63 #include <heyoka/detail/logging_impl.hpp>
64 #include <heyoka/detail/sleef.hpp>
65 #include <heyoka/detail/string_conv.hpp>
66 #include <heyoka/detail/type_traits.hpp>
67 #include <heyoka/detail/visibility.hpp>
68 #include <heyoka/exceptions.hpp>
69 #include <heyoka/expression.hpp>
70 #include <heyoka/func.hpp>
71 #include <heyoka/llvm_state.hpp>
72 #include <heyoka/number.hpp>
73 #include <heyoka/param.hpp>
74 #include <heyoka/s11n.hpp>
75 #include <heyoka/taylor.hpp>
76 #include <heyoka/variable.hpp>
77
78 #if defined(_MSC_VER) && !defined(__clang__)
79
80 // NOTE: MSVC has issues with the other "using"
81 // statement form.
82 using namespace fmt::literals;
83
84 #else
85
86 using fmt::literals::operator""_format;
87
88 #endif
89
90 namespace heyoka
91 {
92
93 namespace detail
94 {
95
96 namespace
97 {
98
99 // RAII helper to temporarily set the opt level to 0 in an llvm_state.
100 struct opt_disabler {
101 llvm_state &m_s;
102 unsigned m_orig_opt_level;
103
opt_disablerheyoka::detail::__anon89701e160111::opt_disabler104 explicit opt_disabler(llvm_state &s) : m_s(s), m_orig_opt_level(s.opt_level())
105 {
106 // Disable optimisations.
107 m_s.opt_level() = 0;
108 }
109
110 opt_disabler(const opt_disabler &) = delete;
111 opt_disabler(opt_disabler &&) noexcept = delete;
112 opt_disabler &operator=(const opt_disabler &) = delete;
113 opt_disabler &operator=(opt_disabler &&) noexcept = delete;
114
~opt_disablerheyoka::detail::__anon89701e160111::opt_disabler115 ~opt_disabler()
116 {
117 // Restore the original optimisation level.
118 m_s.opt_level() = m_orig_opt_level;
119 }
120 };
121
// Helper to determine the optimal Taylor order for a given tolerance,
// following Jorba's prescription.
template <typename T>
std::uint32_t taylor_order_from_tol(T tol)
{
    using std::ceil;
    using std::isfinite;
    using std::log;

    // Jorba's formula for the optimal order.
    auto ord = ceil(-log(tol) / 2 + 1);

    // LCOV_EXCL_START
    if (!isfinite(ord)) {
        throw std::invalid_argument(
            "The computation of the Taylor order in an adaptive Taylor stepper produced a non-finite value");
    }
    // LCOV_EXCL_STOP

    // Clamp the order from below to 2.
    ord = std::max(T(2), ord);

    // Make sure the result is representable as a std::uint32_t.
    // NOTE: compare in double precision, in which the
    // max of std::uint32_t is exactly representable.
    // LCOV_EXCL_START
    if (ord > static_cast<double>(std::numeric_limits<std::uint32_t>::max())) {
        throw std::overflow_error("The computation of the Taylor order in an adaptive Taylor stepper resulted "
                                  "in an overflow condition");
    }
    // LCOV_EXCL_STOP

    return static_cast<std::uint32_t>(ord);
}
152
153 // Helper to compute max(x_v, abs(y_v)) in the Taylor stepper implementation.
taylor_step_maxabs(llvm_state & s,llvm::Value * x_v,llvm::Value * y_v)154 llvm::Value *taylor_step_maxabs(llvm_state &s, llvm::Value *x_v, llvm::Value *y_v)
155 {
156 return llvm_max(s, x_v, llvm_abs(s, y_v));
157 }
158
159 // Helper to compute min(x_v, abs(y_v)) in the Taylor stepper implementation.
taylor_step_minabs(llvm_state & s,llvm::Value * x_v,llvm::Value * y_v)160 llvm::Value *taylor_step_minabs(llvm_state &s, llvm::Value *x_v, llvm::Value *y_v)
161 {
162 return llvm_min(s, x_v, llvm_abs(s, y_v));
163 }
164
165 // Helper to compute pow(x_v, y_v) in the Taylor stepper implementation.
taylor_step_pow(llvm_state & s,llvm::Value * x_v,llvm::Value * y_v)166 llvm::Value *taylor_step_pow(llvm_state &s, llvm::Value *x_v, llvm::Value *y_v)
167 {
168 #if defined(HEYOKA_HAVE_REAL128)
169 // Determine the scalar type of the vector arguments.
170 auto *x_t = x_v->getType()->getScalarType();
171
172 if (x_t == llvm::Type::getFP128Ty(s.context())) {
173 return call_extern_vec(s, x_v, y_v, "powq");
174 } else {
175 #endif
176 // If we are operating on SIMD vectors, try to see if we have a sleef
177 // function available for pow().
178 if (auto *vec_t = llvm::dyn_cast<llvm_vector_type>(x_v->getType())) {
179 // NOTE: if sfn ends up empty, we will be falling through
180 // below and use the LLVM intrinsic instead.
181 if (const auto sfn = sleef_function_name(s.context(), "pow", vec_t->getElementType(),
182 boost::numeric_cast<std::uint32_t>(vec_t->getNumElements()));
183 !sfn.empty()) {
184 return llvm_invoke_external(
185 s, sfn, vec_t, {x_v, y_v},
186 // NOTE: in theory we may add ReadNone here as well,
187 // but for some reason, at least up to LLVM 10,
188 // this causes strange codegen issues. Revisit
189 // in the future.
190 {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn});
191 }
192 }
193
194 return llvm_invoke_intrinsic(s, "llvm.pow", {x_v->getType()}, {x_v, y_v});
195 #if defined(HEYOKA_HAVE_REAL128)
196 }
197 #endif
198 }
199
200 // Create a global read-only array containing the values in sv_funcs_dc, if any
201 // (otherwise, the return value will be null). This is for use in the adaptive steppers
202 // when employing compact mode.
taylor_c_make_sv_funcs_arr(llvm_state & s,const std::vector<std::uint32_t> & sv_funcs_dc)203 llvm::Value *taylor_c_make_sv_funcs_arr(llvm_state &s, const std::vector<std::uint32_t> &sv_funcs_dc)
204 {
205 auto &builder = s.builder();
206
207 if (sv_funcs_dc.empty()) {
208 return nullptr;
209 } else {
210 auto *arr_type
211 = llvm::ArrayType::get(builder.getInt32Ty(), boost::numeric_cast<std::uint64_t>(sv_funcs_dc.size()));
212 std::vector<llvm::Constant *> sv_funcs_dc_const;
213 sv_funcs_dc_const.reserve(sv_funcs_dc.size());
214 for (auto idx : sv_funcs_dc) {
215 sv_funcs_dc_const.emplace_back(builder.getInt32(idx));
216 }
217 auto *sv_funcs_dc_arr = llvm::ConstantArray::get(arr_type, sv_funcs_dc_const);
218 auto *g_sv_funcs_dc = new llvm::GlobalVariable(s.module(), sv_funcs_dc_arr->getType(), true,
219 llvm::GlobalVariable::InternalLinkage, sv_funcs_dc_arr);
220
221 // Get out a pointer to the beginning of the array.
222 assert(llvm_depr_GEP_type_check(g_sv_funcs_dc, arr_type)); // LCOV_EXCL_LINE
223 return builder.CreateInBoundsGEP(arr_type, g_sv_funcs_dc, {builder.getInt32(0), builder.getInt32(0)});
224 }
225 }
226
227 // Helper to generate the LLVM code to determine the timestep in an adaptive Taylor integrator,
228 // following Jorba's prescription. diff_variant is the output of taylor_compute_jet(), and it contains
229 // the jet of derivatives for the state variables and the sv_funcs. h_ptr is a pointer containing
230 // the clamping values for the timesteps. svf_ptr is a pointer to the first element of an LLVM array containing the
231 // values in sv_funcs_dc. If max_abs_state_ptr is not nullptr, the computed norm infinity of the
232 // state vector (including sv_funcs, if any) will be written into it.
233 template <typename T>
taylor_determine_h(llvm_state & s,const std::variant<llvm::Value *,std::vector<llvm::Value * >> & diff_variant,const std::vector<std::uint32_t> & sv_funcs_dc,llvm::Value * svf_ptr,llvm::Value * h_ptr,std::uint32_t n_eq,std::uint32_t n_uvars,std::uint32_t order,std::uint32_t batch_size,llvm::Value * max_abs_state_ptr)234 llvm::Value *taylor_determine_h(llvm_state &s,
235 const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_variant,
236 const std::vector<std::uint32_t> &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *h_ptr,
237 std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order,
238 std::uint32_t batch_size, llvm::Value *max_abs_state_ptr)
239 {
240 assert(batch_size != 0u);
241 #if !defined(NDEBUG)
242 if (diff_variant.index() == 0u) {
243 // Compact mode.
244 assert(sv_funcs_dc.empty() == !svf_ptr);
245 } else {
246 // Non-compact mode.
247 assert(svf_ptr == nullptr);
248 }
249 #endif
250
251 using std::exp;
252
253 auto &builder = s.builder();
254 auto &context = s.context();
255
256 llvm::Value *max_abs_state = nullptr, *max_abs_diff_o = nullptr, *max_abs_diff_om1 = nullptr;
257
258 if (diff_variant.index() == 0u) {
259 // Compact mode.
260 auto *diff_arr = std::get<llvm::Value *>(diff_variant);
261
262 // These will end up containing the norm infinity of the state vector + sv_funcs and the
263 // norm infinity of the derivatives at orders order and order - 1.
264 auto vec_t = to_llvm_vector_type<T>(context, batch_size);
265 max_abs_state = builder.CreateAlloca(vec_t);
266 max_abs_diff_o = builder.CreateAlloca(vec_t);
267 max_abs_diff_om1 = builder.CreateAlloca(vec_t);
268
269 // Initialise with the abs(derivatives) of the first state variable at orders 0, 'order' and 'order - 1'.
270 builder.CreateStore(
271 llvm_abs(s, taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), builder.getInt32(0))),
272 max_abs_state);
273 builder.CreateStore(
274 llvm_abs(s, taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), builder.getInt32(0))),
275 max_abs_diff_o);
276 builder.CreateStore(
277 llvm_abs(s, taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order - 1u), builder.getInt32(0))),
278 max_abs_diff_om1);
279
280 // Iterate over the variables to compute the norm infinities.
281 llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(n_eq), [&](llvm::Value *cur_idx) {
282 builder.CreateStore(
283 taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_state),
284 taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), cur_idx)),
285 max_abs_state);
286 builder.CreateStore(
287 taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_o),
288 taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), cur_idx)),
289 max_abs_diff_o);
290 builder.CreateStore(
291 taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_om1),
292 taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order - 1u), cur_idx)),
293 max_abs_diff_om1);
294 });
295
296 if (svf_ptr != nullptr) {
297 // Consider also the functions of state variables for
298 // the computation of the timestep.
299 llvm_loop_u32(
300 s, builder.getInt32(0), builder.getInt32(boost::numeric_cast<std::uint32_t>(sv_funcs_dc.size())),
301 [&](llvm::Value *arr_idx) {
302 // Fetch the index value from the array.
303 assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
304 auto cur_idx = builder.CreateLoad(
305 builder.getInt32Ty(), builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));
306
307 builder.CreateStore(
308 taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_state),
309 taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), cur_idx)),
310 max_abs_state);
311 builder.CreateStore(
312 taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_o),
313 taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), cur_idx)),
314 max_abs_diff_o);
315 builder.CreateStore(taylor_step_maxabs(s, builder.CreateLoad(vec_t, max_abs_diff_om1),
316 taylor_c_load_diff(s, diff_arr, n_uvars,
317 builder.getInt32(order - 1u), cur_idx)),
318 max_abs_diff_om1);
319 });
320 }
321
322 // Load the values for later use.
323 max_abs_state = builder.CreateLoad(vec_t, max_abs_state);
324 max_abs_diff_o = builder.CreateLoad(vec_t, max_abs_diff_o);
325 max_abs_diff_om1 = builder.CreateLoad(vec_t, max_abs_diff_om1);
326 } else {
327 // Non-compact mode.
328 const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);
329
330 const auto n_sv_funcs = static_cast<std::uint32_t>(sv_funcs_dc.size());
331
332 // Compute the norm infinity of the state vector and the norm infinity of the derivatives
333 // at orders order and order - 1. We first create vectors of absolute values and then
334 // compute their maxima.
335 std::vector<llvm::Value *> v_max_abs_state, v_max_abs_diff_o, v_max_abs_diff_om1;
336
337 // NOTE: iterate up to n_eq + n_sv_funcs in order to
338 // consider also the functions of state variables for
339 // the computation of the timestep.
340 for (std::uint32_t i = 0; i < n_eq + n_sv_funcs; ++i) {
341 v_max_abs_state.push_back(llvm_abs(s, diff_arr[i]));
342 // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
343 // state variables and sv funcs (not all u vars), hence the indexing is
344 // order * (n_eq + n_sv_funcs).
345 v_max_abs_diff_o.push_back(llvm_abs(s, diff_arr[order * (n_eq + n_sv_funcs) + i]));
346 v_max_abs_diff_om1.push_back(llvm_abs(s, diff_arr[(order - 1u) * (n_eq + n_sv_funcs) + i]));
347 }
348
349 // Find the maxima via pairwise reduction.
350 auto reducer = [&s](llvm::Value *a, llvm::Value *b) -> llvm::Value * { return llvm_max(s, a, b); };
351 max_abs_state = pairwise_reduce(v_max_abs_state, reducer);
352 max_abs_diff_o = pairwise_reduce(v_max_abs_diff_o, reducer);
353 max_abs_diff_om1 = pairwise_reduce(v_max_abs_diff_om1, reducer);
354 }
355
356 // Store max_abs_state, if requested.
357 if (max_abs_state_ptr != nullptr) {
358 store_vector_to_memory(builder, max_abs_state_ptr, max_abs_state);
359 }
360
361 // Determine if we are in absolute or relative tolerance mode.
362 auto abs_or_rel
363 = builder.CreateFCmpOLE(max_abs_state, vector_splat(builder, codegen<T>(s, number{1.}), batch_size));
364
365 // Estimate rho at orders order - 1 and order.
366 auto num_rho
367 = builder.CreateSelect(abs_or_rel, vector_splat(builder, codegen<T>(s, number{1.}), batch_size), max_abs_state);
368 auto rho_o = taylor_step_pow(s, builder.CreateFDiv(num_rho, max_abs_diff_o),
369 vector_splat(builder, codegen<T>(s, number{T(1) / order}), batch_size));
370 auto rho_om1 = taylor_step_pow(s, builder.CreateFDiv(num_rho, max_abs_diff_om1),
371 vector_splat(builder, codegen<T>(s, number{T(1) / (order - 1u)}), batch_size));
372
373 // Take the minimum.
374 auto rho_m = llvm_min(s, rho_o, rho_om1);
375
376 // Compute the scaling + safety factor.
377 const auto rhofac = exp((T(-7) / T(10)) / (order - 1u)) / (exp(T(1)) * exp(T(1)));
378
379 // Determine the step size in absolute value.
380 auto h = builder.CreateFMul(rho_m, vector_splat(builder, codegen<T>(s, number{rhofac}), batch_size));
381
382 // Ensure that the step size does not exceed the limit in absolute value.
383 auto *max_h_vec = load_vector_from_memory(builder, h_ptr, batch_size);
384 h = taylor_step_minabs(s, h, max_h_vec);
385
386 // Handle backwards propagation.
387 auto backward = builder.CreateFCmpOLT(max_h_vec, vector_splat(builder, codegen<T>(s, number{0.}), batch_size));
388 auto h_fac = builder.CreateSelect(backward, vector_splat(builder, codegen<T>(s, number{-1.}), batch_size),
389 vector_splat(builder, codegen<T>(s, number{1.}), batch_size));
390 h = builder.CreateFMul(h_fac, h);
391
392 return h;
393 }
394
395 // Store the value val as the derivative of order 'order' of the u variable u_idx
396 // into the array of Taylor derivatives diff_arr. n_uvars is the total number of u variables.
taylor_c_store_diff(llvm_state & s,llvm::Value * diff_arr,std::uint32_t n_uvars,llvm::Value * order,llvm::Value * u_idx,llvm::Value * val)397 void taylor_c_store_diff(llvm_state &s, llvm::Value *diff_arr, std::uint32_t n_uvars, llvm::Value *order,
398 llvm::Value *u_idx, llvm::Value *val)
399 {
400 auto &builder = s.builder();
401
402 // NOTE: overflow check has already been done to ensure that the
403 // total size of diff_arr fits in a 32-bit unsigned integer.
404 assert(llvm_depr_GEP_type_check(diff_arr, pointee_type(diff_arr))); // LCOV_EXCL_LINE
405 auto *ptr
406 = builder.CreateInBoundsGEP(pointee_type(diff_arr), diff_arr,
407 builder.CreateAdd(builder.CreateMul(order, builder.getInt32(n_uvars)), u_idx));
408
409 builder.CreateStore(val, ptr);
410 }
411
412 // Compute the derivative of order "order" of a state variable.
413 // ex is the formula for the first-order derivative of the state variable (which
414 // is either a u variable or a number/param), n_uvars the number of variables in
415 // the decomposition, arr the array containing the derivatives of all u variables
416 // up to order - 1.
417 template <typename T>
taylor_compute_sv_diff(llvm_state & s,const expression & ex,const std::vector<llvm::Value * > & arr,llvm::Value * par_ptr,std::uint32_t n_uvars,std::uint32_t order,std::uint32_t batch_size)418 llvm::Value *taylor_compute_sv_diff(llvm_state &s, const expression &ex, const std::vector<llvm::Value *> &arr,
419 llvm::Value *par_ptr, std::uint32_t n_uvars, std::uint32_t order,
420 std::uint32_t batch_size)
421 {
422 assert(order > 0u);
423
424 auto &builder = s.builder();
425
426 return std::visit(
427 [&](const auto &v) -> llvm::Value * {
428 using type = uncvref_t<decltype(v)>;
429
430 if constexpr (std::is_same_v<type, variable>) {
431 // Extract the index of the u variable in the expression
432 // of the first-order derivative.
433 const auto u_idx = uname_to_index(v.name());
434
435 // Fetch from arr the derivative
436 // of order 'order - 1' of the u variable at u_idx. The index is:
437 // (order - 1) * n_uvars + u_idx.
438 auto ret = taylor_fetch_diff(arr, u_idx, order - 1u, n_uvars);
439
440 // We have to divide the derivative by order
441 // to get the normalised derivative of the state variable.
442 return builder.CreateFDiv(
443 ret, vector_splat(builder, codegen<T>(s, number(static_cast<T>(order))), batch_size));
444 } else if constexpr (std::is_same_v<type, number> || std::is_same_v<type, param>) {
445 // The first-order derivative is a constant.
446 // If the first-order derivative is being requested,
447 // do the codegen for the constant itself, otherwise
448 // return 0. No need for normalization as the only
449 // nonzero value that can be produced here is the first-order
450 // derivative.
451 if (order == 1u) {
452 return taylor_codegen_numparam<T>(s, v, par_ptr, batch_size);
453 } else {
454 return vector_splat(builder, codegen<T>(s, number{0.}), batch_size);
455 }
456 } else {
457 assert(false);
458 return nullptr;
459 }
460 },
461 ex.value());
462 }
463
464 // Function to split the central part of the decomposition (i.e., the definitions of the u variables
465 // that do not represent state variables) into parallelisable segments. Within a segment,
466 // the definition of a u variable does not depend on any u variable defined within that segment.
467 // NOTE: the hidden deps are not considered as dependencies.
468 // NOTE: the segments in the return value will contain shallow copies of the
469 // expressions in dc.
taylor_segment_dc(const taylor_dc_t & dc,std::uint32_t n_eq)470 std::vector<taylor_dc_t> taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t n_eq)
471 {
472 // Log runtime in trace mode.
473 spdlog::stopwatch sw;
474
475 // Helper that takes in input the definition ex of a u variable, and returns
476 // in output the list of indices of the u variables on which ex depends.
477 auto udef_args_indices = [](const expression &ex) -> std::vector<std::uint32_t> {
478 return std::visit(
479 [](const auto &v) -> std::vector<std::uint32_t> {
480 using type = detail::uncvref_t<decltype(v)>;
481
482 if constexpr (std::is_same_v<type, func>) {
483 std::vector<std::uint32_t> retval;
484
485 for (const auto &arg : v.args()) {
486 std::visit(
487 [&retval](const auto &x) {
488 using tp = detail::uncvref_t<decltype(x)>;
489
490 if constexpr (std::is_same_v<tp, variable>) {
491 retval.push_back(uname_to_index(x.name()));
492 } else if constexpr (!std::is_same_v<tp, number> && !std::is_same_v<tp, param>) {
493 throw std::invalid_argument(
494 "Invalid argument encountered in an element of a Taylor decomposition: the "
495 "argument is not a variable or a number/param");
496 }
497 },
498 arg.value());
499 }
500
501 return retval;
502 } else {
503 throw std::invalid_argument("Invalid expression encountered in a Taylor decomposition: the "
504 "expression is not a function");
505 }
506 },
507 ex.value());
508 };
509
510 // Init the return value.
511 std::vector<taylor_dc_t> s_dc;
512
513 // cur_limit_idx is initially the index of the first
514 // u variable which is not a state variable.
515 auto cur_limit_idx = n_eq;
516 for (std::uint32_t i = n_eq; i < dc.size() - n_eq; ++i) {
517 // NOTE: at the very first iteration of this for loop,
518 // no block has been created yet. Do it now.
519 if (i == n_eq) {
520 assert(s_dc.empty());
521 s_dc.emplace_back();
522 } else {
523 assert(!s_dc.empty());
524 }
525
526 const auto &[ex, deps] = dc[i];
527
528 // Determine the u indices on which ex depends.
529 const auto u_indices = udef_args_indices(ex);
530
531 if (std::any_of(u_indices.begin(), u_indices.end(),
532 [cur_limit_idx](auto idx) { return idx >= cur_limit_idx; })) {
533 // The current expression depends on one or more variables
534 // within the current block. Start a new block and
535 // update cur_limit_idx with the start index of the new block.
536 s_dc.emplace_back();
537 cur_limit_idx = i;
538 }
539
540 // Append ex to the current block.
541 s_dc.back().emplace_back(ex, deps);
542 }
543
544 #if !defined(NDEBUG)
545 // Verify s_dc.
546
547 decltype(dc.size()) counter = 0;
548 for (const auto &s : s_dc) {
549 // No segment can be empty.
550 assert(!s.empty());
551
552 for (const auto &[ex, _] : s) {
553 // All the indices in the definitions of the
554 // u variables in the current block must be
555 // less than counter + n_eq (which is the starting
556 // index of the block).
557 const auto u_indices = udef_args_indices(ex);
558 assert(std::all_of(u_indices.begin(), u_indices.end(),
559 [idx_limit = counter + n_eq](auto idx) { return idx < idx_limit; }));
560 }
561
562 // Update the counter.
563 counter += s.size();
564 }
565
566 assert(counter == dc.size() - n_eq * 2u);
567 #endif
568
569 get_logger()->debug("Taylor N of segments: {}", s_dc.size());
570 get_logger()->trace("Taylor segment runtime: {}", sw);
571
572 return s_dc;
573 }
574
575 // Small helper to compute the size of a global array.
taylor_c_gl_arr_size(llvm::Value * v)576 std::uint32_t taylor_c_gl_arr_size(llvm::Value *v)
577 {
578 assert(llvm::isa<llvm::GlobalVariable>(v));
579
580 return boost::numeric_cast<std::uint32_t>(
581 llvm::cast<llvm::ArrayType>(llvm::cast<llvm::PointerType>(v->getType())->getElementType())->getNumElements());
582 }
583
584 // Helper to construct the global arrays needed for the computation of the
585 // derivatives of the state variables in compact mode. The first part of the
586 // return value is a set of 6 arrays:
587 // - the indices of the state variables whose time derivative is a u variable, paired to
588 // - the indices of the u variables appearing in the derivatives, and
589 // - the indices of the state variables whose time derivative is a constant, paired to
590 // - the values of said constants, and
591 // - the indices of the state variables whose time derivative is a param, paired to
592 // - the indices of the params.
593 // The second part of the return value is a boolean flag that will be true if
594 // the time derivatives of all state variables are u variables, false otherwise.
595 template <typename T>
596 std::pair<std::array<llvm::GlobalVariable *, 6>, bool>
taylor_c_make_sv_diff_globals(llvm_state & s,const taylor_dc_t & dc,std::uint32_t n_uvars)597 taylor_c_make_sv_diff_globals(llvm_state &s, const taylor_dc_t &dc, std::uint32_t n_uvars)
598 {
599 auto &context = s.context();
600 auto &builder = s.builder();
601 auto &module = s.module();
602
603 // Build iteratively the output values as vectors of constants.
604 std::vector<llvm::Constant *> var_indices, vars, num_indices, nums, par_indices, pars;
605
606 // Keep track of how many time derivatives
607 // of the state variables are u variables.
608 std::uint32_t n_der_vars = 0;
609
610 // NOTE: the derivatives of the state variables are at the end of the decomposition.
611 for (auto i = n_uvars; i < boost::numeric_cast<std::uint32_t>(dc.size()); ++i) {
612 std::visit(
613 [&](const auto &v) {
614 using type = uncvref_t<decltype(v)>;
615
616 if constexpr (std::is_same_v<type, variable>) {
617 ++n_der_vars;
618 // NOTE: remove from i the n_uvars offset to get the
619 // true index of the state variable.
620 var_indices.push_back(builder.getInt32(i - n_uvars));
621 vars.push_back(builder.getInt32(uname_to_index(v.name())));
622 } else if constexpr (std::is_same_v<type, number>) {
623 num_indices.push_back(builder.getInt32(i - n_uvars));
624 nums.push_back(llvm::cast<llvm::Constant>(codegen<T>(s, v)));
625 } else if constexpr (std::is_same_v<type, param>) {
626 par_indices.push_back(builder.getInt32(i - n_uvars));
627 pars.push_back(builder.getInt32(v.idx()));
628 } else {
629 assert(false);
630 }
631 },
632 dc[i].first.value());
633 }
634
635 // Flag to signal that the time derivatives of all state variables are u variables.
636 assert(dc.size() >= n_uvars);
637 const auto all_der_vars = (n_der_vars == (dc.size() - n_uvars));
638
639 assert(var_indices.size() == vars.size());
640 assert(num_indices.size() == nums.size());
641 assert(par_indices.size() == pars.size());
642
643 // Turn the vectors into global read-only LLVM arrays.
644
645 // Variables.
646 auto *var_arr_type
647 = llvm::ArrayType::get(llvm::Type::getInt32Ty(context), boost::numeric_cast<std::uint64_t>(var_indices.size()));
648
649 auto *var_indices_arr = llvm::ConstantArray::get(var_arr_type, var_indices);
650 auto *g_var_indices = new llvm::GlobalVariable(module, var_indices_arr->getType(), true,
651 llvm::GlobalVariable::InternalLinkage, var_indices_arr);
652
653 auto *vars_arr = llvm::ConstantArray::get(var_arr_type, vars);
654 auto *g_vars
655 = new llvm::GlobalVariable(module, vars_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, vars_arr);
656
657 // Numbers.
658 auto *num_indices_arr_type
659 = llvm::ArrayType::get(llvm::Type::getInt32Ty(context), boost::numeric_cast<std::uint64_t>(num_indices.size()));
660 auto *num_indices_arr = llvm::ConstantArray::get(num_indices_arr_type, num_indices);
661 auto *g_num_indices = new llvm::GlobalVariable(module, num_indices_arr->getType(), true,
662 llvm::GlobalVariable::InternalLinkage, num_indices_arr);
663
664 auto nums_arr_type
665 = llvm::ArrayType::get(to_llvm_type<T>(context), boost::numeric_cast<std::uint64_t>(nums.size()));
666 auto nums_arr = llvm::ConstantArray::get(nums_arr_type, nums);
667 // NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
668 auto *g_nums
669 = new llvm::GlobalVariable(module, nums_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, nums_arr);
670
671 // Params.
672 auto *par_arr_type
673 = llvm::ArrayType::get(llvm::Type::getInt32Ty(context), boost::numeric_cast<std::uint64_t>(par_indices.size()));
674
675 auto *par_indices_arr = llvm::ConstantArray::get(par_arr_type, par_indices);
676 auto *g_par_indices = new llvm::GlobalVariable(module, par_indices_arr->getType(), true,
677 llvm::GlobalVariable::InternalLinkage, par_indices_arr);
678
679 auto *pars_arr = llvm::ConstantArray::get(par_arr_type, pars);
680 auto *g_pars
681 = new llvm::GlobalVariable(module, pars_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, pars_arr);
682
683 return std::pair{std::array{g_var_indices, g_vars, g_num_indices, g_nums, g_par_indices, g_pars}, all_der_vars};
684 }
685
686 // Helper to compute and store the derivatives of the state variables in compact mode at order 'order'.
687 // svd_gl is the return value of taylor_c_make_sv_diff_globals(), which contains
688 // the indices/constants necessary for the computation.
taylor_c_compute_sv_diffs(llvm_state & s,const std::pair<std::array<llvm::GlobalVariable *,6>,bool> & svd_gl,llvm::Value * diff_arr,llvm::Value * par_ptr,std::uint32_t n_uvars,llvm::Value * order,std::uint32_t batch_size)689 void taylor_c_compute_sv_diffs(llvm_state &s, const std::pair<std::array<llvm::GlobalVariable *, 6>, bool> &svd_gl,
690 llvm::Value *diff_arr, llvm::Value *par_ptr, std::uint32_t n_uvars, llvm::Value *order,
691 std::uint32_t batch_size)
692 {
693 assert(batch_size > 0u);
694
695 // Fetch the global arrays and
696 // the all_der_vars flag.
697 const auto &sv_diff_gl = svd_gl.first;
698 const auto all_der_vars = svd_gl.second;
699
700 auto &builder = s.builder();
701
702 // Recover the number of state variables whose derivatives are given
703 // by u variables, numbers and params.
704 const auto n_vars = taylor_c_gl_arr_size(sv_diff_gl[0]);
705 const auto n_nums = taylor_c_gl_arr_size(sv_diff_gl[2]);
706 const auto n_pars = taylor_c_gl_arr_size(sv_diff_gl[4]);
707
708 // Fetch the type stored in the array of derivatives.
709 auto *fp_vec_t = pointee_type(diff_arr);
710
711 // Handle the u variables definitions.
712 llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_vars), [&](llvm::Value *cur_idx) {
713 // Fetch the index of the state variable.
714 // NOTE: if the time derivatives of all state variables are u variables, there's
715 // no need to lookup the index in the global array (which will just contain
716 // the values in the [0, n_vars] range).
717 auto *sv_idx = all_der_vars
718 ? cur_idx
719 : builder.CreateLoad(builder.getInt32Ty(),
720 builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[0]), sv_diff_gl[0],
721 {builder.getInt32(0), cur_idx}));
722
723 // Fetch the index of the u variable.
724 auto *u_idx = builder.CreateLoad(
725 builder.getInt32Ty(),
726 builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[1]), sv_diff_gl[1], {builder.getInt32(0), cur_idx}));
727
728 // Fetch from diff_arr the derivative of order 'order - 1' of the u variable u_idx.
729 auto *ret = taylor_c_load_diff(s, diff_arr, n_uvars, builder.CreateSub(order, builder.getInt32(1)), u_idx);
730
731 // We have to divide the derivative by 'order' in order
732 // to get the normalised derivative of the state variable.
733 ret = builder.CreateFDiv(
734 ret, vector_splat(builder, builder.CreateUIToFP(order, fp_vec_t->getScalarType()), batch_size));
735
736 // Store the derivative.
737 taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx, ret);
738 });
739
740 // Handle the number definitions.
741 llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_nums), [&](llvm::Value *cur_idx) {
742 // Fetch the index of the state variable.
743 auto *sv_idx = builder.CreateLoad(
744 builder.getInt32Ty(),
745 builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[2]), sv_diff_gl[2], {builder.getInt32(0), cur_idx}));
746
747 // Fetch the constant.
748 auto *num = builder.CreateLoad(
749 fp_vec_t->getScalarType(),
750 builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[3]), sv_diff_gl[3], {builder.getInt32(0), cur_idx}));
751
752 // If the first-order derivative is being requested,
753 // do the codegen for the constant itself, otherwise
754 // return 0. No need for normalization as the only
755 // nonzero value that can be produced here is the first-order
756 // derivative.
757 auto *cmp_cond = builder.CreateICmpEQ(order, builder.getInt32(1));
758 auto ret = builder.CreateSelect(cmp_cond, vector_splat(builder, num, batch_size),
759 llvm::ConstantFP::get(fp_vec_t, 0.));
760
761 // Store the derivative.
762 taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx, ret);
763 });
764
765 // Handle the param definitions.
766 llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_pars), [&](llvm::Value *cur_idx) {
767 // Fetch the index of the state variable.
768 auto *sv_idx = builder.CreateLoad(
769 builder.getInt32Ty(),
770 builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[4]), sv_diff_gl[4], {builder.getInt32(0), cur_idx}));
771
772 // Fetch the index of the param.
773 auto *par_idx = builder.CreateLoad(
774 builder.getInt32Ty(),
775 builder.CreateInBoundsGEP(pointee_type(sv_diff_gl[5]), sv_diff_gl[5], {builder.getInt32(0), cur_idx}));
776
777 // If the first-order derivative is being requested,
778 // do the codegen for the constant itself, otherwise
779 // return 0. No need for normalization as the only
780 // nonzero value that can be produced here is the first-order
781 // derivative.
782 llvm_if_then_else(
783 s, builder.CreateICmpEQ(order, builder.getInt32(1)),
784 [&]() {
785 // Derivative of order 1. Fetch the value from par_ptr.
786 // NOTE: param{0} is unused, its only purpose is type tagging.
787 taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx,
788 taylor_c_diff_numparam_codegen(s, param{0}, par_idx, par_ptr, batch_size));
789 },
790 [&]() {
791 // Derivative of order > 1, return 0.
792 taylor_c_store_diff(s, diff_arr, n_uvars, order, sv_idx, llvm::ConstantFP::get(fp_vec_t, 0.));
793 });
794 });
795 }
796
797 // Helper to convert the arguments of the definition of a u variable
798 // into a vector of variants. u variables will be converted to their indices,
799 // numbers will be unchanged, parameters will be converted to their indices.
800 // The hidden deps will also be converted to indices.
taylor_udef_to_variants(const expression & ex,const std::vector<std::uint32_t> & deps)801 auto taylor_udef_to_variants(const expression &ex, const std::vector<std::uint32_t> &deps)
802 {
803 return std::visit(
804 [&deps](const auto &v) -> std::vector<std::variant<std::uint32_t, number>> {
805 using type = detail::uncvref_t<decltype(v)>;
806
807 if constexpr (std::is_same_v<type, func>) {
808 std::vector<std::variant<std::uint32_t, number>> retval;
809
810 for (const auto &arg : v.args()) {
811 std::visit(
812 [&retval](const auto &x) {
813 using tp = detail::uncvref_t<decltype(x)>;
814
815 if constexpr (std::is_same_v<tp, variable>) {
816 retval.emplace_back(uname_to_index(x.name()));
817 } else if constexpr (std::is_same_v<tp, number>) {
818 retval.emplace_back(x);
819 } else if constexpr (std::is_same_v<tp, param>) {
820 retval.emplace_back(x.idx());
821 } else {
822 throw std::invalid_argument(
823 "Invalid argument encountered in an element of a Taylor decomposition: the "
824 "argument is not a variable or a number");
825 }
826 },
827 arg.value());
828 }
829
830 // Handle the hidden deps.
831 for (auto idx : deps) {
832 retval.emplace_back(idx);
833 }
834
835 return retval;
836 } else {
837 throw std::invalid_argument("Invalid expression encountered in a Taylor decomposition: the "
838 "expression is not a function");
839 }
840 },
841 ex.value());
842 }
843
844 // Helper to convert a vector of variants into a variant of vectors.
845 // All elements of v must be of the same type, and v cannot be empty.
846 template <typename... T>
taylor_c_vv_transpose(const std::vector<std::variant<T...>> & v)847 auto taylor_c_vv_transpose(const std::vector<std::variant<T...>> &v)
848 {
849 assert(!v.empty());
850
851 // Init the return value based on the type
852 // of the first element of v.
853 auto retval = std::visit(
854 [size = v.size()](const auto &x) {
855 using type = detail::uncvref_t<decltype(x)>;
856
857 std::vector<type> tmp;
858 tmp.reserve(boost::numeric_cast<decltype(tmp.size())>(size));
859 tmp.push_back(x);
860
861 return std::variant<std::vector<T>...>(std::move(tmp));
862 },
863 v[0]);
864
865 // Append the other values from v.
866 for (decltype(v.size()) i = 1; i < v.size(); ++i) {
867 std::visit(
868 [&retval](const auto &x) {
869 std::visit(
870 [&x](auto &vv) {
871 // The value type of retval.
872 using scal_t = typename detail::uncvref_t<decltype(vv)>::value_type;
873
874 // The type of the current element of v.
875 using x_t = detail::uncvref_t<decltype(x)>;
876
877 if constexpr (std::is_same_v<scal_t, x_t>) {
878 vv.push_back(x);
879 } else {
880 throw std::invalid_argument(
881 "Inconsistent types detected while building the transposed sets of "
882 "arguments for the Taylor derivative functions in compact mode");
883 }
884 },
885 retval);
886 },
887 v[i]);
888 }
889
890 return retval;
891 }
892
893 // Helper to check if a vector of indices consists of consecutive values:
894 // [n, n + 1, n + 2, ...]
895 // NOTE: requires a non-empty vector.
bool is_consecutive(const std::vector<std::uint32_t> &v)
{
    assert(!v.empty());

    // The vector is consecutive iff no adjacent pair (a, b) violates b == a + 1.
    // NOTE: the b <= a check avoids potential negative overflow in b - a.
    const auto breaks_run
        = [](std::uint32_t a, std::uint32_t b) { return b <= a || b - a != 1u; };

    return std::adjacent_find(v.begin(), v.end(), breaks_run) == v.end();
}
910
911 // Functions the create the arguments generators for the functions that compute
912 // the Taylor derivatives in compact mode. The generators are created from vectors
913 // of either u var indices (taylor_c_make_arg_gen_vidx()) or floating-point constants
914 // (taylor_c_make_arg_gen_vc()).
// Create the arguments generator for a vector of u variable indices ind.
// The returned std::function maps a call index (an LLVM 32-bit integer value)
// to the u index to be passed as a function argument at that call.
// Several special structures in ind are detected in order to avoid, where
// possible, the creation of a global array of indices:
// - all indices identical,
// - consecutive indices,
// - blocks of n identical indices whose per-block values are consecutive.
std::function<llvm::Value *(llvm::Value *)> taylor_c_make_arg_gen_vidx(llvm_state &s,
                                                                       const std::vector<std::uint32_t> &ind)
{
    assert(!ind.empty());

    auto &builder = s.builder();

    // Check if all indices in ind are the same.
    if (std::all_of(ind.begin() + 1, ind.end(), [&ind](const auto &n) { return n == ind[0]; })) {
        // If all indices are the same, don't construct an array, just always return
        // the same value.
        return [num = builder.getInt32(ind[0])](llvm::Value *) -> llvm::Value * { return num; };
    }

    // If ind consists of consecutive indices, we can replace
    // the index array with a simple offset computation.
    if (is_consecutive(ind)) {
        return [&builder, start_idx = builder.getInt32(ind[0])](llvm::Value *cur_call_idx) -> llvm::Value * {
            return builder.CreateAdd(start_idx, cur_call_idx);
        };
    }

    // Check if ind consists of a repeated pattern like [a, a, a, b, b, b, c, c, c, ...],
    // that is, [a X n, b X n, c X n, ...], such that [a, b, c, ...] are consecutive numbers.
    if (ind.size() > 1u) {
        // Determine the candidate number of repetitions by counting
        // the length of the initial run of identical values.
        decltype(ind.size()) n_reps = 1;
        for (decltype(ind.size()) i = 1; i < ind.size(); ++i) {
            if (ind[i] == ind[i - 1u]) {
                ++n_reps;
            } else {
                break;
            }
        }

        if (n_reps > 1u && (ind.size() % n_reps) == 0u) {
            // There is an initial number of repetitions
            // and the vector size is a multiple of that.
            // See if the repetitions continue, and keep
            // track of the repeated indices.
            std::vector<std::uint32_t> rep_indices{ind[0]};

            bool rep_flag = true;

            // Iterate over the blocks of repetitions.
            for (decltype(ind.size()) rep_idx = 1; rep_idx < ind.size() / n_reps; ++rep_idx) {
                // Check that all values within the current block are identical.
                for (decltype(ind.size()) i = 1; i < n_reps; ++i) {
                    const auto cur_idx = rep_idx * n_reps + i;

                    if (ind[cur_idx] != ind[cur_idx - 1u]) {
                        rep_flag = false;
                        break;
                    }
                }

                if (rep_flag) {
                    // Record the value repeated within the current block.
                    rep_indices.push_back(ind[rep_idx * n_reps]);
                } else {
                    break;
                }
            }

            if (rep_flag && is_consecutive(rep_indices)) {
                // The pattern is [a X n, b X n, c X n, ...] and [a, b, c, ...]
                // are consecutive numbers. The m-th value in the array can thus
                // be computed as a + floor(m / n).

#if !defined(NDEBUG)
                // Double-check the result in debug mode.
                std::vector<std::uint32_t> checker;
                for (decltype(ind.size()) i = 0; i < ind.size(); ++i) {
                    checker.push_back(boost::numeric_cast<std::uint32_t>(ind[0] + i / n_reps));
                }
                assert(checker == ind);
#endif

                return [&builder, start_idx = builder.getInt32(rep_indices[0]),
                        n_reps = builder.getInt32(boost::numeric_cast<std::uint32_t>(n_reps))](
                           llvm::Value *cur_call_idx) -> llvm::Value * {
                    return builder.CreateAdd(start_idx, builder.CreateUDiv(cur_call_idx, n_reps));
                };
            }
        }
    }

    // No special structure detected: fall back to a read-only global
    // array of indices indexed by the call index.
    auto &md = s.module();

    // Generate the array of indices as llvm constants.
    std::vector<llvm::Constant *> tmp_c_vec;
    tmp_c_vec.reserve(ind.size());
    for (const auto &val : ind) {
        tmp_c_vec.push_back(builder.getInt32(val));
    }

    // Create the array type.
    auto *arr_type = llvm::ArrayType::get(tmp_c_vec[0]->getType(), boost::numeric_cast<std::uint64_t>(ind.size()));
    assert(arr_type != nullptr);

    // Create the constant array as a global read-only variable.
    auto *const_arr = llvm::ConstantArray::get(arr_type, tmp_c_vec);
    assert(const_arr != nullptr);
    // NOTE: naked new here is fine, gvar will be registered in the module
    // object and cleaned up when the module is destroyed.
    auto *gvar
        = new llvm::GlobalVariable(md, const_arr->getType(), true, llvm::GlobalVariable::InternalLinkage, const_arr);

    // Return the generator.
    return [gvar, arr_type, &builder](llvm::Value *cur_call_idx) -> llvm::Value * {
        assert(llvm_depr_GEP_type_check(gvar, arr_type)); // LCOV_EXCL_LINE
        return builder.CreateLoad(builder.getInt32Ty(),
                                  builder.CreateInBoundsGEP(arr_type, gvar, {builder.getInt32(0), cur_call_idx}));
    };
}
1028
1029 template <typename T>
taylor_c_make_arg_gen_vc(llvm_state & s,const std::vector<number> & vc)1030 std::function<llvm::Value *(llvm::Value *)> taylor_c_make_arg_gen_vc(llvm_state &s, const std::vector<number> &vc)
1031 {
1032 assert(!vc.empty());
1033
1034 // Check if all the numbers are the same.
1035 // NOTE: the comparison operator of number will consider two numbers of different
1036 // type but equal value to be equal.
1037 if (std::all_of(vc.begin() + 1, vc.end(), [&vc](const auto &n) { return n == vc[0]; })) {
1038 // If all constants are the same, don't construct an array, just always return
1039 // the same value.
1040 return [num = codegen<T>(s, vc[0])](llvm::Value *) -> llvm::Value * { return num; };
1041 }
1042
1043 // Generate the array of constants as llvm constants.
1044 std::vector<llvm::Constant *> tmp_c_vec;
1045 tmp_c_vec.reserve(vc.size());
1046 for (const auto &val : vc) {
1047 tmp_c_vec.push_back(llvm::cast<llvm::Constant>(codegen<T>(s, val)));
1048 }
1049
1050 // Create the array type.
1051 auto *arr_type = llvm::ArrayType::get(tmp_c_vec[0]->getType(), boost::numeric_cast<std::uint64_t>(vc.size()));
1052 assert(arr_type != nullptr);
1053
1054 // Create the constant array as a global read-only variable.
1055 auto *const_arr = llvm::ConstantArray::get(arr_type, tmp_c_vec);
1056 assert(const_arr != nullptr);
1057 // NOTE: naked new here is fine, gvar will be registered in the module
1058 // object and cleaned up when the module is destroyed.
1059 auto *gvar = new llvm::GlobalVariable(s.module(), const_arr->getType(), true, llvm::GlobalVariable::InternalLinkage,
1060 const_arr);
1061
1062 // Return the generator.
1063 return [gvar, arr_type, &s](llvm::Value *cur_call_idx) -> llvm::Value * {
1064 auto &builder = s.builder();
1065
1066 assert(llvm_depr_GEP_type_check(gvar, arr_type)); // LCOV_EXCL_LINE
1067 return builder.CreateLoad(arr_type->getArrayElementType(),
1068 builder.CreateInBoundsGEP(arr_type, gvar, {builder.getInt32(0), cur_call_idx}));
1069 };
1070 }
1071
1072 // Comparision operator for LLVM functions based on their names.
1073 struct llvm_func_name_compare {
operator ()heyoka::detail::__anon89701e160111::llvm_func_name_compare1074 bool operator()(const llvm::Function *f0, const llvm::Function *f1) const
1075 {
1076 return f0->getName() < f1->getName();
1077 }
1078 };
1079
1080 // For each segment in s_dc, this function will return a dict mapping an LLVM function
1081 // f for the computation of a Taylor derivative to a size and a vector of std::functions. For example, one entry
1082 // in the return value will read something like:
1083 // {f : (2, [g_0, g_1, g_2])}
1084 // The meaning in this example is that the arity of f is 3 and it will be called with 2 different
1085 // sets of arguments. The g_i functions are expected to be called with input argument j in [0, 1]
1086 // to yield the value of the i-th function argument for f at the j-th invocation.
template <typename T>
auto taylor_build_function_maps(llvm_state &s, const std::vector<taylor_dc_t> &s_dc, std::uint32_t n_eq,
                                std::uint32_t n_uvars, std::uint32_t batch_size, bool high_accuracy)
{
    // Log runtime in trace mode.
    spdlog::stopwatch sw;

    // Init the return value.
    // NOTE: use maps with name-based comparison for the functions. This ensures that the order in which these
    // functions are invoked in taylor_compute_jet_compact_mode() is always the same. If we used directly pointer
    // comparisons instead, the order could vary across different executions and different platforms. The name
    // mangling we do when creating the function names should ensure that there are no possible name collisions.
    std::vector<
        std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
                 llvm_func_name_compare>>
        retval;

    // Variable to keep track of the u variable
    // on whose definition we are operating.
    // NOTE: the first n_eq u variables are the state variables,
    // hence the counter starts at n_eq.
    auto cur_u_idx = n_eq;
    for (const auto &seg : s_dc) {
        // This structure maps an LLVM function to sets of arguments
        // with which the function is to be called. For instance, if function
        // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
        // will contain {f : [[a, b, c], [d, e, f]]}.
        // After construction, we have verified that for each function
        // in the map the sets of arguments have all the same size.
        std::unordered_map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;

        for (const auto &ex : seg) {
            // Get the function for the computation of the derivative.
            auto func = taylor_c_diff_func<T>(s, ex.first, n_uvars, batch_size, high_accuracy);

            // Insert the function into tmp_map.
            const auto [it, is_new_func] = tmp_map.try_emplace(func);

            assert(is_new_func || !it->second.empty());

            // Convert the variables/constants in the current dc
            // element into a set of indices/constants.
            const auto cdiff_args = taylor_udef_to_variants(ex.first, ex.second);

            // NOTE: each argument set stored in tmp_map also contains the
            // u index prepended below, hence the "- 1u" in the arity check.
            if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) {
                throw std::invalid_argument(
                    "Inconsistent arity detected in a Taylor derivative function in compact "
                    "mode: the same function is being called with both {} and {} arguments"_format(
                        it->second.back().size() - 1u, cdiff_args.size()));
            }

            // Add the new set of arguments.
            it->second.emplace_back();
            // Add the idx of the u variable.
            it->second.back().emplace_back(cur_u_idx);
            // Add the actual function arguments.
            it->second.back().insert(it->second.back().end(), cdiff_args.begin(), cdiff_args.end());

            ++cur_u_idx;
        }

        // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
        // to {f : [[a, d], [b, e], [c, f]]}.
        std::unordered_map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
            tmp_map_transpose;
        for (const auto &[func, vv] : tmp_map) {
            assert(!vv.empty());

            // Add the function.
            const auto [it, ins_status] = tmp_map_transpose.try_emplace(func);
            assert(ins_status);

            const auto n_calls = vv.size();
            const auto n_args = vv[0].size();
            // NOTE: n_args must be at least 1 because the u idx
            // is prepended to the actual function arguments in
            // the tmp_map entries.
            assert(n_args >= 1u);

            for (decltype(vv[0].size()) i = 0; i < n_args; ++i) {
                // Build the vector of values corresponding
                // to the current argument index.
                std::vector<std::variant<std::uint32_t, number>> tmp_c_vec;
                for (decltype(vv.size()) j = 0; j < n_calls; ++j) {
                    tmp_c_vec.push_back(vv[j][i]);
                }

                // Turn tmp_c_vec (a vector of variants) into a variant
                // of vectors, and insert the result.
                it->second.push_back(taylor_c_vv_transpose(tmp_c_vec));
            }
        }

        // Add a new entry in retval for the current segment.
        retval.emplace_back();
        auto &a_map = retval.back();

        for (const auto &[func, vv] : tmp_map_transpose) {
            assert(!vv.empty());

            // Add the function.
            const auto [it, ins_status] = a_map.try_emplace(func);
            assert(ins_status);

            // Set the number of calls for this function.
            it->second.first
                = std::visit([](const auto &x) { return boost::numeric_cast<std::uint32_t>(x.size()); }, vv[0]);
            assert(it->second.first > 0u);

            // Create the g functions for each argument.
            for (const auto &v : vv) {
                it->second.second.push_back(std::visit(
                    [&s](const auto &x) {
                        using type = detail::uncvref_t<decltype(x)>;

                        if constexpr (std::is_same_v<type, std::vector<std::uint32_t>>) {
                            // Vector of u indices: build an index-based generator.
                            return taylor_c_make_arg_gen_vidx(s, x);
                        } else {
                            // Vector of constants: build a constant-based generator.
                            return taylor_c_make_arg_gen_vc<T>(s, x);
                        }
                    },
                    v));
            }
        }
    }

    get_logger()->trace("Taylor build function maps runtime: {}", sw);

    return retval;
}
1215
1216 // Helper for the computation of a jet of derivatives in compact mode,
1217 // used in taylor_compute_jet() below.
template <typename T>
llvm::Value *taylor_compute_jet_compact_mode(llvm_state &s, llvm::Value *order0, llvm::Value *par_ptr,
                                             llvm::Value *time_ptr, const taylor_dc_t &dc,
                                             const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq,
                                             std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
                                             bool high_accuracy)
{
    auto &builder = s.builder();

    // Split dc into segments.
    const auto s_dc = taylor_segment_dc(dc, n_eq);

    // Generate the function maps.
    const auto f_maps = taylor_build_function_maps<T>(s, s_dc, n_eq, n_uvars, batch_size, high_accuracy);

    // Log the runtime of IR construction in trace mode.
    spdlog::stopwatch sw;

    // Generate the global arrays for the computation of the derivatives
    // of the state variables.
    const auto svd_gl = taylor_c_make_sv_diff_globals<T>(s, dc, n_uvars);

    // Determine the maximum u variable index appearing in sv_funcs_dc, or zero
    // if sv_funcs_dc is empty.
    const auto max_svf_idx
        = sv_funcs_dc.empty() ? std::uint32_t(0) : *std::max_element(sv_funcs_dc.begin(), sv_funcs_dc.end());

    // Prepare the array that will contain the jet of derivatives.
    // We will be storing all the derivatives of the u variables
    // up to order 'order - 1', the derivatives of order
    // 'order' of the state variables and the derivatives
    // of order 'order' of the sv_funcs.
    // NOTE: the array size is specified as a 64-bit integer in the
    // LLVM API.
    // NOTE: fp_type is the original, scalar floating-point type.
    // It will be turned into a vector type (if necessary) by
    // make_vector_type() below.
    // NOTE: if sv_funcs_dc is empty, or if all its indices are not greater
    // than the indices of the state variables, then we don't need additional
    // slots after the sv derivatives. If we need additional slots, allocate
    // another full column of derivatives, as it is complicated at this stage
    // to know exactly how many slots we will need.
    auto *fp_type = llvm::cast<llvm::PointerType>(order0->getType())->getElementType();
    auto *fp_vec_type = make_vector_type(fp_type, batch_size);
    auto *array_type
        = llvm::ArrayType::get(fp_vec_type, (max_svf_idx < n_eq) ? (n_uvars * order + n_eq) : (n_uvars * (order + 1u)));

    // Make the global array and fetch a pointer to its first element.
    // NOTE: we use a global array rather than a local one here because
    // its size can grow quite large, which can lead to stack overflow issues.
    // This has of course consequences in terms of thread safety, which
    // we will have to document.
    auto diff_arr_gvar = make_global_zero_array(s.module(), array_type);
    assert(llvm_depr_GEP_type_check(diff_arr_gvar, array_type)); // LCOV_EXCL_LINE
    auto *diff_arr = builder.CreateInBoundsGEP(array_type, diff_arr_gvar, {builder.getInt32(0), builder.getInt32(0)});

    // Copy over the order-0 derivatives of the state variables.
    // NOTE: overflow checking is already done in the parent function.
    llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
        // Fetch the pointer from order0.
        assert(llvm_depr_GEP_type_check(order0, fp_type)); // LCOV_EXCL_LINE
        auto *ptr
            = builder.CreateInBoundsGEP(fp_type, order0, builder.CreateMul(cur_var_idx, builder.getInt32(batch_size)));

        // Load as a vector.
        auto *vec = load_vector_from_memory(builder, ptr, batch_size);

        // Store into diff_arr.
        assert(llvm_depr_GEP_type_check(diff_arr, fp_vec_type)); // LCOV_EXCL_LINE
        builder.CreateStore(vec, builder.CreateInBoundsGEP(fp_vec_type, diff_arr, cur_var_idx));
    });

    // Helper to compute and store the derivatives of order cur_order
    // of the u variables which are not state variables.
    auto compute_u_diffs = [&](llvm::Value *cur_order) {
        for (const auto &map : f_maps) {
            for (const auto &p : map) {
                // The LLVM function for the computation of the
                // derivative in compact mode.
                const auto &func = p.first;

                // The number of func calls.
                const auto ncalls = p.second.first;

                // The generators for the arguments of func.
                const auto &gens = p.second.second;

                assert(ncalls > 0u);
                assert(!gens.empty());
                assert(std::all_of(gens.begin(), gens.end(), [](const auto &f) { return static_cast<bool>(f); }));

                // Loop over the number of calls.
                llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
                    // Create the u variable index from the first generator.
                    auto u_idx = gens[0](cur_call_idx);

                    // Initialise the vector of arguments with which func must be called. The following
                    // initial arguments are always present:
                    // - current Taylor order,
                    // - u index of the variable,
                    // - array of derivatives,
                    // - pointer to the param values,
                    // - pointer to the time value(s).
                    std::vector<llvm::Value *> args{cur_order, u_idx, diff_arr, par_ptr, time_ptr};

                    // Create the other arguments via the generators.
                    for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
                        args.push_back(gens[i](cur_call_idx));
                    }

                    // Calculate the derivative and store the result.
                    taylor_c_store_diff(s, diff_arr, n_uvars, cur_order, u_idx, builder.CreateCall(func, args));
                });
            }
        }
    };

    // Compute the order-0 derivatives (i.e., the initial values)
    // for all u variables which are not state variables.
    compute_u_diffs(builder.getInt32(0));

    // Compute all derivatives up to order 'order - 1'.
    llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(order), [&](llvm::Value *cur_order) {
        // State variables first.
        taylor_c_compute_sv_diffs(s, svd_gl, diff_arr, par_ptr, n_uvars, cur_order, batch_size);

        // The other u variables.
        compute_u_diffs(cur_order);
    });

    // Compute the last-order derivatives for the state variables.
    taylor_c_compute_sv_diffs(s, svd_gl, diff_arr, par_ptr, n_uvars, builder.getInt32(order), batch_size);

    // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs
    // correspond to u variables in the decomposition, we will have to compute the
    // last-order derivatives of the u variables until we are sure all sv_funcs derivatives
    // have been properly computed.

    // Monitor the starting index of the current
    // segment while iterating on f_maps.
    auto cur_start_u_idx = n_eq;

    // NOTE: this is a slight repetition of compute_u_diffs() with minor modifications.
    for (const auto &map : f_maps) {
        if (cur_start_u_idx > max_svf_idx) {
            // We computed all the necessary derivatives, break out.
            // NOTE: if we did not have sv_funcs to begin with,
            // max_svf_idx is zero and we exit at the first iteration
            // without doing anything. If all sv funcs alias state variables,
            // then max_svf_idx < n_eq and thus we also exit immediately
            // at the first iteration.
            break;
        }

        for (const auto &p : map) {
            const auto &func = p.first;
            const auto ncalls = p.second.first;
            const auto &gens = p.second.second;

            assert(ncalls > 0u);
            assert(!gens.empty());
            assert(std::all_of(gens.begin(), gens.end(), [](const auto &f) { return static_cast<bool>(f); }));

            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
                auto u_idx = gens[0](cur_call_idx);

                // NOTE: same argument layout as in compute_u_diffs(), but with
                // the Taylor order fixed to the final order.
                std::vector<llvm::Value *> args{builder.getInt32(order), u_idx, diff_arr, par_ptr, time_ptr};

                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
                    args.push_back(gens[i](cur_call_idx));
                }

                taylor_c_store_diff(s, diff_arr, n_uvars, builder.getInt32(order), u_idx,
                                    builder.CreateCall(func, args));
            });

            // Update cur_start_u_idx taking advantage of the fact
            // that each block in a segment processes the derivatives
            // of exactly ncalls u variables.
            cur_start_u_idx += ncalls;
        }
    }

    get_logger()->trace("Taylor IR creation compact mode runtime: {}", sw);

    // Return the array of derivatives of the u variables.
    return diff_arr;
}
1406
1407 // Given an input pointer 'in', load the first n * batch_size values in it as n vectors
1408 // with size batch_size. If batch_size is 1, the values will be loaded as scalars.
taylor_load_values(llvm_state & s,llvm::Value * in,std::uint32_t n,std::uint32_t batch_size)1409 auto taylor_load_values(llvm_state &s, llvm::Value *in, std::uint32_t n, std::uint32_t batch_size)
1410 {
1411 assert(batch_size > 0u);
1412
1413 auto &builder = s.builder();
1414
1415 std::vector<llvm::Value *> retval;
1416 for (std::uint32_t i = 0; i < n; ++i) {
1417 // Fetch the pointer from in.
1418 // NOTE: overflow checking is done in the parent function.
1419 assert(llvm_depr_GEP_type_check(in, pointee_type(in))); // LCOV_EXCL_LINE
1420 auto *ptr = builder.CreateInBoundsGEP(pointee_type(in), in, builder.getInt32(i * batch_size));
1421
1422 // Load the value in vector mode.
1423 retval.push_back(load_vector_from_memory(builder, ptr, batch_size));
1424 }
1425
1426 return retval;
1427 }
1428
1429 // Small helper to deduce the number of parameters
1430 // present in the Taylor decomposition of an ODE system.
1431 // NOTE: this will also include the functions of state variables,
1432 // as they are part of the decomposition.
1433 // NOTE: the first few entries in the decomposition are the mapping
1434 // u variables -> state variables. These never contain any param
1435 // by construction.
template <typename T>
std::uint32_t n_pars_in_dc(const T &dc)
{
    // The result is the largest param size over all
    // expressions in the decomposition.
    std::uint32_t n_pars = 0;

    for (const auto &cur : dc) {
        const auto cur_size = get_param_size(cur.first);
        if (cur_size > n_pars) {
            n_pars = cur_size;
        }
    }

    return n_pars;
}
1447
1448 // Helper function to compute the jet of Taylor derivatives up to a given order. n_eq
1449 // is the number of equations/variables in the ODE sys, dc its Taylor decomposition,
1450 // n_uvars the total number of u variables in the decomposition.
1451 // order is the max derivative order desired, batch_size the batch size.
1452 // order0 is a pointer to an array of (at least) n_eq * batch_size scalar elements
1453 // containing the derivatives of order 0. par_ptr is a pointer to an array containing
1454 // the numerical values of the parameters, time_ptr a pointer to the time value(s).
1455 // sv_funcs are the indices, in the decomposition, of the functions of state
1456 // variables.
1457 //
1458 // The return value is a variant containing either:
1459 // - in compact mode, the array containing the derivatives of all u variables,
1460 // - otherwise, the jet of derivatives of the state variables and sv_funcs
1461 // up to order 'order'.
1462 template <typename T>
// Helper function to compute the jet of Taylor derivatives up to a given order.
//
// order0 is a pointer to an array of (at least) n_eq * batch_size scalar elements
// containing the order-0 values of the state variables. par_ptr and time_ptr point
// to the parameter values and the time value(s). dc is the Taylor decomposition of
// the ODE system, sv_funcs_dc the indices (within dc) of the functions of state
// variables whose derivatives must also be computed up to the maximum order.
// n_eq is the number of equations/state variables, n_uvars the total number of
// u variables in dc, order the maximum derivative order, batch_size the SIMD
// batch size.
//
// The return value is a variant containing either:
// - in compact mode, a single LLVM value (pointer to the in-memory array holding
//   the derivatives of all u variables), or
// - otherwise, the unrolled jet of derivatives of the state variables and
//   sv funcs, stored order-by-order.
std::variant<llvm::Value *, std::vector<llvm::Value *>>
taylor_compute_jet(llvm_state &s, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
                   const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq,
                   std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool compact_mode,
                   bool high_accuracy)
{
    assert(batch_size > 0u);
    assert(n_eq > 0u);
    assert(order > 0u);

    // Make sure we can represent n_uvars * (order + 1) as a 32-bit
    // unsigned integer. This is the maximum total number of derivatives we will have to compute
    // and store, with the +1 taking into account the extra slots that might be needed by sv_funcs_dc.
    // If sv_funcs_dc is empty, we need only n_uvars * order + n_eq derivatives.
    // LCOV_EXCL_START
    if (order == std::numeric_limits<std::uint32_t>::max()
        || n_uvars > std::numeric_limits<std::uint32_t>::max() / (order + 1u)) {
        throw std::overflow_error(
            "An overflow condition was detected in the computation of a jet of Taylor derivatives");
    }

    // We also need to be able to index up to n_eq * batch_size in order0.
    if (n_eq > std::numeric_limits<std::uint32_t>::max() / batch_size) {
        throw std::overflow_error(
            "An overflow condition was detected in the computation of a jet of Taylor derivatives");
    }
    // LCOV_EXCL_STOP

    if (compact_mode) {
        // In compact mode, let's ensure that we can index into par_ptr using std::uint32_t.
        // NOTE: in default mode the check is done inside taylor_codegen_numparam_par().

        // Deduce the size of the param array from the expressions in the decomposition.
        const auto param_size = n_pars_in_dc(dc);
        // LCOV_EXCL_START
        if (param_size > std::numeric_limits<std::uint32_t>::max() / batch_size) {
            throw std::overflow_error(
                "An overflow condition was detected in the computation of a jet of Taylor derivatives in compact mode");
        }
        // LCOV_EXCL_STOP

        return taylor_compute_jet_compact_mode<T>(s, order0, par_ptr, time_ptr, dc, sv_funcs_dc, n_eq, n_uvars, order,
                                                  batch_size, high_accuracy);
    } else {
        // Log the runtime of IR construction in trace mode.
        spdlog::stopwatch sw;

        // Init the derivatives array with the order 0 of the state variables.
        auto diff_arr = taylor_load_values(s, order0, n_eq, batch_size);

        // Compute the order-0 derivatives of the other u variables.
        for (auto i = n_eq; i < n_uvars; ++i) {
            diff_arr.push_back(taylor_diff<T>(s, dc[i].first, dc[i].second, diff_arr, par_ptr, time_ptr, n_uvars, 0, i,
                                              batch_size, high_accuracy));
        }

        // Compute the derivatives order by order, starting from 1 to order excluded.
        // We will compute the highest derivatives of the state variables separately
        // in the last step.
        for (std::uint32_t cur_order = 1; cur_order < order; ++cur_order) {
            // Begin with the state variables.
            // NOTE: the derivatives of the state variables
            // are at the end of the decomposition vector.
            for (auto i = n_uvars; i < boost::numeric_cast<std::uint32_t>(dc.size()); ++i) {
                diff_arr.push_back(
                    taylor_compute_sv_diff<T>(s, dc[i].first, diff_arr, par_ptr, n_uvars, cur_order, batch_size));
            }

            // Now the other u variables.
            for (auto i = n_eq; i < n_uvars; ++i) {
                diff_arr.push_back(taylor_diff<T>(s, dc[i].first, dc[i].second, diff_arr, par_ptr, time_ptr, n_uvars,
                                                  cur_order, i, batch_size, high_accuracy));
            }
        }

        // Compute the last-order derivatives for the state variables.
        for (auto i = n_uvars; i < boost::numeric_cast<std::uint32_t>(dc.size()); ++i) {
            diff_arr.push_back(
                taylor_compute_sv_diff<T>(s, dc[i].first, diff_arr, par_ptr, n_uvars, order, batch_size));
        }

        // If there are sv funcs, we need to compute their last-order derivatives too:
        // we will need to compute the derivatives of the u variables up to
        // the maximum index in sv_funcs_dc.
        const auto max_svf_idx
            = sv_funcs_dc.empty() ? std::uint32_t(0) : *std::max_element(sv_funcs_dc.begin(), sv_funcs_dc.end());

        // NOTE: if there are no sv_funcs, max_svf_idx is set to zero
        // above, thus we never enter the loop.
        // NOTE: <= because max_svf_idx is an index, not a size.
        for (std::uint32_t i = n_eq; i <= max_svf_idx; ++i) {
            diff_arr.push_back(taylor_diff<T>(s, dc[i].first, dc[i].second, diff_arr, par_ptr, time_ptr, n_uvars, order,
                                              i, batch_size, high_accuracy));
        }

#if !defined(NDEBUG)
        if (sv_funcs_dc.empty()) {
            assert(diff_arr.size() == static_cast<decltype(diff_arr.size())>(n_uvars) * order + n_eq);
        } else {
            // NOTE: we use std::max<std::uint32_t>(n_eq, max_svf_idx + 1u) here because
            // the sv funcs could all be aliases of the state variables themselves,
            // in which case in the previous loop we ended up appending nothing.
            assert(diff_arr.size()
                   == static_cast<decltype(diff_arr.size())>(n_uvars) * order
                          + std::max<std::uint32_t>(n_eq, max_svf_idx + 1u));
        }
#endif

        // Extract the derivatives of the state variables and sv_funcs from diff_arr.
        // The output layout is order-major: for each order o in [0, order], first the
        // n_eq state variables, then the sv funcs.
        std::vector<llvm::Value *> retval;
        for (std::uint32_t o = 0; o <= order; ++o) {
            for (std::uint32_t var_idx = 0; var_idx < n_eq; ++var_idx) {
                retval.push_back(taylor_fetch_diff(diff_arr, var_idx, o, n_uvars));
            }
            for (auto idx : sv_funcs_dc) {
                retval.push_back(taylor_fetch_diff(diff_arr, idx, o, n_uvars));
            }
        }

        get_logger()->trace("Taylor IR creation default mode runtime: {}", sw);

        return retval;
    }
}
1587
1588 // Helper to generate the LLVM code to store the Taylor coefficients of the state variables and
1589 // the sv funcs into an external array. The Taylor polynomials are stored in row-major order,
1590 // first the state variables and after the sv funcs. For use in the adaptive timestepper implementations.
void taylor_write_tc(llvm_state &s, const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_variant,
                     const std::vector<std::uint32_t> &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *tc_ptr,
                     std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size)
{
    // diff_variant holds either a pointer to the in-memory derivatives array
    // (compact mode, variant index 0) or the unrolled list of derivative values
    // (default mode). svf_ptr, used only in compact mode, points to the global
    // array of sv func indices (or is null if there are no sv funcs).
    //
    // Output layout in tc_ptr: for variable/sv func slot j and order k, the
    // (batch of) Taylor coefficient(s) is written starting at
    // (order + 1) * batch_size * j + k * batch_size.
    assert(batch_size != 0u);
#if !defined(NDEBUG)
    if (diff_variant.index() == 0u) {
        // Compact mode.
        assert(sv_funcs_dc.empty() == !svf_ptr);
    } else {
        // Non-compact mode.
        assert(svf_ptr == nullptr);
    }
#endif

    auto &builder = s.builder();

    // Convert to std::uint32_t for overflow checking and use below.
    const auto n_sv_funcs = boost::numeric_cast<std::uint32_t>(sv_funcs_dc.size());

    // Overflow checking: ensure we can index into
    // tc_ptr using std::uint32_t.
    // NOTE: this is the same check done in taylor_add_jet_impl().
    // LCOV_EXCL_START
    if (order == std::numeric_limits<std::uint32_t>::max()
        || (order + 1u) > std::numeric_limits<std::uint32_t>::max() / batch_size
        || n_eq > std::numeric_limits<std::uint32_t>::max() - n_sv_funcs
        || n_eq + n_sv_funcs > std::numeric_limits<std::uint32_t>::max() / ((order + 1u) * batch_size)) {
        throw std::overflow_error("An overflow condition was detected while generating the code for writing the Taylor "
                                  "polynomials of an ODE system into the output array");
    }
    // LCOV_EXCL_STOP

    if (diff_variant.index() == 0u) {
        // Compact mode.

        auto *diff_arr = std::get<llvm::Value *>(diff_variant);

        // Write out the Taylor coefficients for the state variables.
        llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var) {
            llvm_loop_u32(s, builder.getInt32(0), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
                          [&](llvm::Value *cur_order) {
                              // Load the value of the derivative from diff_arr.
                              auto *diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_var);

                              // Compute the index in the output pointer.
                              auto *out_idx = builder.CreateAdd(
                                  builder.CreateMul(builder.getInt32((order + 1u) * batch_size), cur_var),
                                  builder.CreateMul(cur_order, builder.getInt32(batch_size)));

                              // Store into tc_ptr.
                              // LCOV_EXCL_START
                              assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr)));
                              // LCOV_EXCL_STOP
                              store_vector_to_memory(
                                  builder, builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr, out_idx), diff_val);
                          });
        });

        // Write out the Taylor coefficients for the sv funcs, if necessary.
        if (svf_ptr != nullptr) {
            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_sv_funcs), [&](llvm::Value *arr_idx) {
                // Fetch the u var index from svf_ptr.
                assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
                auto *cur_idx = builder.CreateLoad(builder.getInt32Ty(),
                                                   builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));

                llvm_loop_u32(
                    s, builder.getInt32(0), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
                    [&](llvm::Value *cur_order) {
                        // Load the derivative value from diff_arr.
                        auto *diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_idx);

                        // Compute the index in the output pointer.
                        // NOTE: the sv funcs are written after the n_eq state variables,
                        // hence the n_eq + arr_idx slot index.
                        auto *out_idx
                            = builder.CreateAdd(builder.CreateMul(builder.getInt32((order + 1u) * batch_size),
                                                                  builder.CreateAdd(builder.getInt32(n_eq), arr_idx)),
                                                builder.CreateMul(cur_order, builder.getInt32(batch_size)));

                        // Store into tc_ptr.
                        // LCOV_EXCL_START
                        assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr)));
                        // LCOV_EXCL_STOP
                        store_vector_to_memory(
                            builder, builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr, out_idx), diff_val);
                    });
            });
        }
    } else {
        // Non-compact mode.

        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);

        for (std::uint32_t j = 0; j < n_eq; ++j) {
            for (decltype(diff_arr.size()) cur_order = 0; cur_order <= order; ++cur_order) {
                // Index in the jet of derivatives.
                // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
                // state variables and sv_variable (not all u vars), hence the indexing
                // is cur_order * (n_eq + n_sv_funcs) + j.
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + j;
                assert(arr_idx < diff_arr.size());
                auto *const val = diff_arr[arr_idx];

                // Index in tc_ptr.
                const auto out_idx = (order + 1u) * batch_size * j + cur_order * batch_size;

                // Write to tc_ptr.
                assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr))); // LCOV_EXCL_LINE
                auto *out_ptr = builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr,
                                                          builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }
        }

        for (std::uint32_t j = 0; j < n_sv_funcs; ++j) {
            for (decltype(diff_arr.size()) cur_order = 0; cur_order <= order; ++cur_order) {
                // Index in the jet of derivatives.
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + n_eq + j;
                assert(arr_idx < diff_arr.size());
                auto *const val = diff_arr[arr_idx];

                // Index in tc_ptr.
                // NOTE: the sv funcs are written after the n_eq state variables.
                const auto out_idx = (order + 1u) * batch_size * (n_eq + j) + cur_order * batch_size;

                // Write to tc_ptr.
                assert(llvm_depr_GEP_type_check(tc_ptr, pointee_type(tc_ptr))); // LCOV_EXCL_LINE
                auto *out_ptr = builder.CreateInBoundsGEP(pointee_type(tc_ptr), tc_ptr,
                                                          builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }
        }
    }
}
1724
1725 // Add to s an adaptive timestepper function with support for events. This timestepper will *not*
1726 // propagate the state of the system. Instead, its output will be the jet of derivatives
1727 // of all state variables and event equations, and the deduced timestep value(s).
1728 template <typename T, typename U>
taylor_add_adaptive_step_with_events(llvm_state & s,const std::string & name,const U & sys,T tol,std::uint32_t batch_size,bool,bool compact_mode,const std::vector<expression> & evs,bool high_accuracy)1729 auto taylor_add_adaptive_step_with_events(llvm_state &s, const std::string &name, const U &sys, T tol,
1730 std::uint32_t batch_size, bool, bool compact_mode,
1731 const std::vector<expression> &evs, bool high_accuracy)
1732 {
1733 using std::isfinite;
1734
1735 assert(!s.is_compiled());
1736 assert(batch_size != 0u);
1737 assert(isfinite(tol) && tol > 0);
1738
1739 // Determine the order from the tolerance.
1740 const auto order = taylor_order_from_tol(tol);
1741
1742 // Record the number of equations/variables.
1743 const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());
1744
1745 // Decompose the system of equations.
1746 auto [dc, ev_dc] = taylor_decompose(sys, evs);
1747
1748 // Compute the number of u variables.
1749 assert(dc.size() > n_eq);
1750 const auto n_uvars = boost::numeric_cast<std::uint32_t>(dc.size() - n_eq);
1751
1752 auto &builder = s.builder();
1753 auto &context = s.context();
1754
1755 // Prepare the function prototype. The arguments are:
1756 // - pointer to the output jet of derivative (write only),
1757 // - pointer to the current state vector (read only),
1758 // - pointer to the parameters (read only),
1759 // - pointer to the time value(s) (read only),
1760 // - pointer to the array of max timesteps (read & write),
1761 // - pointer to the max_abs_state output variable (write only).
1762 // These pointers cannot overlap.
1763 std::vector<llvm::Type *> fargs(6, llvm::PointerType::getUnqual(to_llvm_type<T>(context)));
1764 // The function does not return anything.
1765 auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
1766 assert(ft != nullptr);
1767 // Now create the function.
1768 auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
1769 // LCOV_EXCL_START
1770 if (f == nullptr) {
1771 throw std::invalid_argument(
1772 "Unable to create a function for an adaptive Taylor stepper with name '{}'"_format(name));
1773 }
1774 // LCOV_EXCL_STOP
1775
1776 // Set the names/attributes of the function arguments.
1777 auto *jet_ptr = f->args().begin();
1778 jet_ptr->setName("jet_ptr");
1779 jet_ptr->addAttr(llvm::Attribute::NoCapture);
1780 jet_ptr->addAttr(llvm::Attribute::NoAlias);
1781 jet_ptr->addAttr(llvm::Attribute::WriteOnly);
1782
1783 auto *state_ptr = jet_ptr + 1;
1784 state_ptr->setName("state_ptr");
1785 state_ptr->addAttr(llvm::Attribute::NoCapture);
1786 state_ptr->addAttr(llvm::Attribute::NoAlias);
1787 state_ptr->addAttr(llvm::Attribute::ReadOnly);
1788
1789 auto *par_ptr = state_ptr + 1;
1790 par_ptr->setName("par_ptr");
1791 par_ptr->addAttr(llvm::Attribute::NoCapture);
1792 par_ptr->addAttr(llvm::Attribute::NoAlias);
1793 par_ptr->addAttr(llvm::Attribute::ReadOnly);
1794
1795 auto *time_ptr = par_ptr + 1;
1796 time_ptr->setName("time_ptr");
1797 time_ptr->addAttr(llvm::Attribute::NoCapture);
1798 time_ptr->addAttr(llvm::Attribute::NoAlias);
1799 time_ptr->addAttr(llvm::Attribute::ReadOnly);
1800
1801 auto *h_ptr = time_ptr + 1;
1802 h_ptr->setName("h_ptr");
1803 h_ptr->addAttr(llvm::Attribute::NoCapture);
1804 h_ptr->addAttr(llvm::Attribute::NoAlias);
1805
1806 auto *max_abs_state_ptr = h_ptr + 1;
1807 max_abs_state_ptr->setName("max_abs_state_ptr");
1808 max_abs_state_ptr->addAttr(llvm::Attribute::NoCapture);
1809 max_abs_state_ptr->addAttr(llvm::Attribute::NoAlias);
1810 max_abs_state_ptr->addAttr(llvm::Attribute::WriteOnly);
1811
1812 // Create a new basic block to start insertion into.
1813 auto *bb = llvm::BasicBlock::Create(context, "entry", f);
1814 assert(bb != nullptr); // LCOV_EXCL_LINE
1815 builder.SetInsertPoint(bb);
1816
1817 // Create a global read-only array containing the values in ev_dc, if there
1818 // are any and we are in compact mode (otherwise, svf_ptr will be null).
1819 auto *svf_ptr = compact_mode ? taylor_c_make_sv_funcs_arr(s, ev_dc) : nullptr;
1820
1821 // Compute the jet of derivatives at the given order.
1822 auto diff_variant = taylor_compute_jet<T>(s, state_ptr, par_ptr, time_ptr, dc, ev_dc, n_eq, n_uvars, order,
1823 batch_size, compact_mode, high_accuracy);
1824
1825 // Determine the integration timestep.
1826 auto h = taylor_determine_h<T>(s, diff_variant, ev_dc, svf_ptr, h_ptr, n_eq, n_uvars, order, batch_size,
1827 max_abs_state_ptr);
1828
1829 // Store h to memory.
1830 store_vector_to_memory(builder, h_ptr, h);
1831
1832 // Copy the jet of derivatives to jet_ptr.
1833 taylor_write_tc(s, diff_variant, ev_dc, svf_ptr, jet_ptr, n_eq, n_uvars, order, batch_size);
1834
1835 // Create the return value.
1836 builder.CreateRetVoid();
1837
1838 // Verify the function.
1839 s.verify_function(f);
1840
1841 // Run the optimisation pass.
1842 // NOTE: this does nothing currently, as the optimisation
1843 // level is set to zero from the outside.
1844 s.optimise();
1845
1846 return std::tuple{std::move(dc), order};
1847 }
1848
1849 // Run the Horner scheme to propagate an ODE state via the evaluation of the Taylor polynomials.
1850 // diff_var contains either the derivatives for all u variables (in compact mode) or only
1851 // for the state variables (non-compact mode). The evaluation point (i.e., the timestep)
1852 // is h. The evaluation is run in parallel over the polynomials of all the state
1853 // variables.
1854 std::variant<llvm::Value *, std::vector<llvm::Value *>>
taylor_run_multihorner(llvm_state & s,const std::variant<llvm::Value *,std::vector<llvm::Value * >> & diff_var,llvm::Value * h,std::uint32_t n_eq,std::uint32_t n_uvars,std::uint32_t order,std::uint32_t,bool compact_mode)1855 taylor_run_multihorner(llvm_state &s, const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_var,
1856 llvm::Value *h, std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t,
1857 bool compact_mode)
1858 {
1859 auto &builder = s.builder();
1860
1861 if (compact_mode) {
1862 // Compact mode.
1863 auto *diff_arr = std::get<llvm::Value *>(diff_var);
1864
1865 // Create the array storing the results of the evaluation.
1866 auto *fp_vec_t = pointee_type(diff_arr);
1867 auto *array_type = llvm::ArrayType::get(fp_vec_t, n_eq);
1868 auto *array_inst = builder.CreateAlloca(array_type);
1869 assert(llvm_depr_GEP_type_check(array_inst, array_type)); // LCOV_EXCL_LINE
1870 auto *res_arr = builder.CreateInBoundsGEP(array_type, array_inst, {builder.getInt32(0), builder.getInt32(0)});
1871
1872 // Init the return value, filling it with the values of the
1873 // coefficients of the highest-degree monomial in each polynomial.
1874 llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
1875 // Load the value from diff_arr and store it in res_arr.
1876 assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
1877 builder.CreateStore(taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(order), cur_var_idx),
1878 builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx));
1879 });
1880
1881 // Run the evaluation.
1882 llvm_loop_u32(
1883 s, builder.getInt32(1), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
1884 [&](llvm::Value *cur_order) {
1885 llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
1886 // Load the current poly coeff from diff_arr.
1887 // NOTE: we are loading the coefficients backwards wrt the order, hence
1888 // we specify order - cur_order.
1889 auto *cf = taylor_c_load_diff(s, diff_arr, n_uvars,
1890 builder.CreateSub(builder.getInt32(order), cur_order), cur_var_idx);
1891
1892 // Accumulate in res_arr.
1893 assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
1894 auto *res_ptr = builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx);
1895 builder.CreateStore(
1896 builder.CreateFAdd(cf, builder.CreateFMul(builder.CreateLoad(fp_vec_t, res_ptr), h)), res_ptr);
1897 });
1898 });
1899
1900 return res_arr;
1901 } else {
1902 // Non-compact mode.
1903 const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_var);
1904
1905 // Init the return value, filling it with the values of the
1906 // coefficients of the highest-degree monomial in each polynomial.
1907 std::vector<llvm::Value *> res_arr;
1908 for (std::uint32_t i = 0; i < n_eq; ++i) {
1909 res_arr.push_back(diff_arr[(n_eq * order) + i]);
1910 }
1911
1912 // Run the Horner scheme simultaneously for all polynomials.
1913 for (std::uint32_t i = 1; i <= order; ++i) {
1914 for (std::uint32_t j = 0; j < n_eq; ++j) {
1915 res_arr[j] = builder.CreateFAdd(diff_arr[(order - i) * n_eq + j], builder.CreateFMul(res_arr[j], h));
1916 }
1917 }
1918
1919 return res_arr;
1920 }
1921 }
1922
1923 // Same as taylor_run_multihorner(), but instead of the Horner scheme this implementation uses
1924 // a compensated summation over the naive evaluation of monomials.
std::variant<llvm::Value *, std::vector<llvm::Value *>>
taylor_run_ceval(llvm_state &s, const std::variant<llvm::Value *, std::vector<llvm::Value *>> &diff_var, llvm::Value *h,
                 std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, bool, bool compact_mode)
{
    // NOTE: the compensation scheme below is Kahan summation: for each term tmp,
    //   y = tmp - comp; t = res + y; comp = (t - res) - y; res = t.
    // The exact FAdd/FSub ordering is essential and must not be rearranged.
    auto &builder = s.builder();

    if (compact_mode) {
        // Compact mode.
        auto *diff_arr = std::get<llvm::Value *>(diff_var);

        // Create the arrays storing the results of the evaluation and the running compensations.
        auto *fp_vec_t = pointee_type(diff_arr);
        auto *array_type = llvm::ArrayType::get(fp_vec_t, n_eq);
        auto *res_arr_inst = builder.CreateAlloca(array_type);
        auto *comp_arr_inst = builder.CreateAlloca(array_type);
        // LCOV_EXCL_START
        assert(llvm_depr_GEP_type_check(res_arr_inst, array_type));
        assert(llvm_depr_GEP_type_check(comp_arr_inst, array_type));
        // LCOV_EXCL_STOP
        auto *res_arr = builder.CreateInBoundsGEP(array_type, res_arr_inst, {builder.getInt32(0), builder.getInt32(0)});
        auto *comp_arr
            = builder.CreateInBoundsGEP(array_type, comp_arr_inst, {builder.getInt32(0), builder.getInt32(0)});

        // Init res_arr with the order-0 coefficients, and the running
        // compensations with zero.
        llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
            // Load the value from diff_arr.
            assert(llvm_depr_GEP_type_check(diff_arr, fp_vec_t)); // LCOV_EXCL_LINE
            auto *val = builder.CreateLoad(fp_vec_t, builder.CreateInBoundsGEP(fp_vec_t, diff_arr, cur_var_idx));

            // Store it in res_arr.
            assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
            builder.CreateStore(val, builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx));

            // Zero-init the element in comp_arr.
            assert(llvm_depr_GEP_type_check(comp_arr, fp_vec_t)); // LCOV_EXCL_LINE
            builder.CreateStore(llvm::ConstantFP::get(fp_vec_t, 0.),
                                builder.CreateInBoundsGEP(fp_vec_t, comp_arr, cur_var_idx));
        });

        // Init the running updater for the powers of h.
        auto *cur_h = builder.CreateAlloca(h->getType());
        builder.CreateStore(h, cur_h);

        // Run the evaluation.
        llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(order + 1u), [&](llvm::Value *cur_order) {
            // Load the current power of h.
            auto *cur_h_val = builder.CreateLoad(fp_vec_t, cur_h);

            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
                // Evaluate the current monomial: cf * h**cur_order.
                auto *cf = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_var_idx);
                auto *tmp = builder.CreateFMul(cf, cur_h_val);

                // Compute the quantities for the compensation.
                assert(llvm_depr_GEP_type_check(comp_arr, fp_vec_t)); // LCOV_EXCL_LINE
                auto *comp_ptr = builder.CreateInBoundsGEP(fp_vec_t, comp_arr, cur_var_idx);
                assert(llvm_depr_GEP_type_check(res_arr, fp_vec_t)); // LCOV_EXCL_LINE
                auto *res_ptr = builder.CreateInBoundsGEP(fp_vec_t, res_arr, cur_var_idx);
                auto *y = builder.CreateFSub(tmp, builder.CreateLoad(fp_vec_t, comp_ptr));
                auto *cur_res = builder.CreateLoad(fp_vec_t, res_ptr);
                auto *t = builder.CreateFAdd(cur_res, y);

                // Update the compensation and the return value.
                builder.CreateStore(builder.CreateFSub(builder.CreateFSub(t, cur_res), y), comp_ptr);
                builder.CreateStore(t, res_ptr);
            });

            // Update the value of h (next power).
            builder.CreateStore(builder.CreateFMul(cur_h_val, h), cur_h);
        });

        return res_arr;
    } else {
        // Non-compact mode.
        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_var);

        // Init the return values with the order-0 monomials, and the running
        // compensations with zero.
        std::vector<llvm::Value *> res_arr, comp_arr;
        for (std::uint32_t i = 0; i < n_eq; ++i) {
            res_arr.push_back(diff_arr[i]);
            comp_arr.push_back(llvm::ConstantFP::get(diff_arr[i]->getType(), 0.));
        }

        // Evaluate and sum.
        auto *cur_h = h;
        for (std::uint32_t i = 1; i <= order; ++i) {
            for (std::uint32_t j = 0; j < n_eq; ++j) {
                // Evaluate the current monomial: cf * h**i.
                auto *tmp = builder.CreateFMul(diff_arr[i * n_eq + j], cur_h);

                // Compute the quantities for the compensation.
                auto *y = builder.CreateFSub(tmp, comp_arr[j]);
                auto *t = builder.CreateFAdd(res_arr[j], y);

                // Update the compensation and the return value.
                comp_arr[j] = builder.CreateFSub(builder.CreateFSub(t, res_arr[j]), y);
                res_arr[j] = t;
            }

            // Update the power of h.
            cur_h = builder.CreateFMul(cur_h, h);
        }

        return res_arr;
    }
}
2033
2034 // NOTE: in compact mode, care must be taken when adding multiple stepper functions to the same llvm state
2035 // with the same floating-point type, batch size and number of u variables. The potential issue there
2036 // is that when the first stepper is added, the compact mode AD functions are created and then optimised.
2037 // The optimisation pass might alter the functions in a way that makes them incompatible with subsequent
2038 // uses in the second stepper (e.g., an argument might be removed from the signature because it is a
2039 // compile-time constant). A workaround to avoid issues is to set the optimisation level to zero
2040 // in the state, add the 2 steppers and then run a single optimisation pass. This is what we do
2041 // in the integrators' ctors.
2042 // NOTE: document this eventually.
// Add to s an adaptive timestepper function named 'name' for the system sys, with
// tolerance tol and batch size batch_size. Unlike the with-events variant, this
// stepper *does* propagate the state of the system in-place, and optionally writes
// the Taylor coefficients to an output array. Returns the Taylor decomposition
// and the deduced Taylor order.
template <typename T, typename U>
auto taylor_add_adaptive_step(llvm_state &s, const std::string &name, const U &sys, T tol, std::uint32_t batch_size,
                              bool high_accuracy, bool compact_mode)
{
    using std::isfinite;

    assert(!s.is_compiled());
    assert(batch_size > 0u);
    assert(isfinite(tol) && tol > 0);

    // Determine the order from the tolerance.
    const auto order = taylor_order_from_tol(tol);

    // Record the number of equations/variables.
    const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());

    // Decompose the system of equations.
    // NOTE: no sv_funcs needed for this stepper.
    auto [dc, sv_funcs_dc] = taylor_decompose(sys, {});

    assert(sv_funcs_dc.empty());

    // Compute the number of u variables.
    assert(dc.size() > n_eq);
    const auto n_uvars = boost::numeric_cast<std::uint32_t>(dc.size() - n_eq);

    auto &builder = s.builder();
    auto &context = s.context();

    // Prepare the function prototype. The arguments are:
    // - pointer to the current state vector (read & write),
    // - pointer to the parameters (read only),
    // - pointer to the time value(s) (read only),
    // - pointer to the array of max timesteps (read & write),
    // - pointer to the Taylor coefficients output (write only).
    // These pointers cannot overlap.
    auto *fp_t = to_llvm_type<T>(context);
    auto *fp_vec_t = make_vector_type(fp_t, batch_size);
    std::vector<llvm::Type *> fargs(5, llvm::PointerType::getUnqual(fp_t));
    // The function does not return anything.
    auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
    assert(ft != nullptr);
    // Now create the function.
    auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
    if (f == nullptr) {
        throw std::invalid_argument(
            "Unable to create a function for an adaptive Taylor stepper with name '{}'"_format(name));
    }

    // Set the names/attributes of the function arguments.
    auto *state_ptr = f->args().begin();
    state_ptr->setName("state_ptr");
    state_ptr->addAttr(llvm::Attribute::NoCapture);
    state_ptr->addAttr(llvm::Attribute::NoAlias);

    auto *par_ptr = state_ptr + 1;
    par_ptr->setName("par_ptr");
    par_ptr->addAttr(llvm::Attribute::NoCapture);
    par_ptr->addAttr(llvm::Attribute::NoAlias);
    par_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto *time_ptr = par_ptr + 1;
    time_ptr->setName("time_ptr");
    time_ptr->addAttr(llvm::Attribute::NoCapture);
    time_ptr->addAttr(llvm::Attribute::NoAlias);
    time_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto *h_ptr = time_ptr + 1;
    h_ptr->setName("h_ptr");
    h_ptr->addAttr(llvm::Attribute::NoCapture);
    h_ptr->addAttr(llvm::Attribute::NoAlias);

    auto *tc_ptr = h_ptr + 1;
    tc_ptr->setName("tc_ptr");
    tc_ptr->addAttr(llvm::Attribute::NoCapture);
    tc_ptr->addAttr(llvm::Attribute::NoAlias);
    tc_ptr->addAttr(llvm::Attribute::WriteOnly);

    // Create a new basic block to start insertion into.
    auto *bb = llvm::BasicBlock::Create(context, "entry", f);
    assert(bb != nullptr);
    builder.SetInsertPoint(bb);

    // Compute the jet of derivatives at the given order.
    auto diff_variant = taylor_compute_jet<T>(s, state_ptr, par_ptr, time_ptr, dc, {}, n_eq, n_uvars, order, batch_size,
                                              compact_mode, high_accuracy);

    // Determine the integration timestep.
    auto h = taylor_determine_h<T>(s, diff_variant, sv_funcs_dc, nullptr, h_ptr, n_eq, n_uvars, order, batch_size,
                                   nullptr);

    // Evaluate the Taylor polynomials, producing the updated state of the system.
    // High-accuracy mode uses compensated summation, otherwise a Horner scheme is used.
    auto new_state_var
        = high_accuracy ? taylor_run_ceval(s, diff_variant, h, n_eq, n_uvars, order, high_accuracy, compact_mode)
                        : taylor_run_multihorner(s, diff_variant, h, n_eq, n_uvars, order, batch_size, compact_mode);

    // Store the new state.
    // NOTE: no need to perform overflow check on n_eq * batch_size,
    // as in taylor_compute_jet() we already checked.
    if (compact_mode) {
        auto new_state = std::get<llvm::Value *>(new_state_var);

        llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
            assert(llvm_depr_GEP_type_check(new_state, fp_vec_t)); // LCOV_EXCL_LINE
            auto val = builder.CreateLoad(fp_vec_t, builder.CreateInBoundsGEP(fp_vec_t, new_state, cur_var_idx));
            assert(llvm_depr_GEP_type_check(state_ptr, fp_t)); // LCOV_EXCL_LINE
            store_vector_to_memory(builder,
                                   builder.CreateInBoundsGEP(
                                       fp_t, state_ptr, builder.CreateMul(cur_var_idx, builder.getInt32(batch_size))),
                                   val);
        });
    } else {
        const auto &new_state = std::get<std::vector<llvm::Value *>>(new_state_var);

        assert(new_state.size() == n_eq);

        for (std::uint32_t var_idx = 0; var_idx < n_eq; ++var_idx) {
            assert(llvm_depr_GEP_type_check(state_ptr, fp_t)); // LCOV_EXCL_LINE
            store_vector_to_memory(builder,
                                   builder.CreateInBoundsGEP(fp_t, state_ptr, builder.getInt32(var_idx * batch_size)),
                                   new_state[var_idx]);
        }
    }

    // Store the timesteps that were used.
    store_vector_to_memory(builder, h_ptr, h);

    // Write the Taylor coefficients, if requested (i.e., if tc_ptr is not null
    // at runtime).
    auto nptr = llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(to_llvm_type<T>(s.context())));
    llvm_if_then_else(
        s, builder.CreateICmpNE(tc_ptr, nptr),
        [&]() {
            // tc_ptr is not null: copy the Taylor coefficients
            // for the state variables.
            taylor_write_tc(s, diff_variant, {}, nullptr, tc_ptr, n_eq, n_uvars, order, batch_size);
        },
        [&]() {
            // Taylor coefficients were not requested,
            // don't do anything in this branch.
        });

    // Create the return value.
    builder.CreateRetVoid();

    // Verify the function.
    s.verify_function(f);

    // Run the optimisation pass.
    // NOTE: this does nothing currently, as the optimisation
    // level is set to zero from the outside.
    s.optimise();

    return std::tuple{std::move(dc), order};
}
2197
2198 } // namespace
2199
// Finalise the construction of the integrator. Called by the public
// constructors after kwargs parsing: validates the input, builds and
// jit-compiles the LLVM stepper and dense output functions, and sets up
// the internal buffers and (if events are present) the event machinery.
//
// sys is the ODE system, state/time the initial conditions, tol the
// integration tolerance, pars the runtime parameter values, tes/ntes
// the terminal/non-terminal events.
//
// Throws std::invalid_argument on malformed input, std::overflow_error
// on overflow in the Taylor coefficients buffer sizing.
template <typename T>
template <typename U>
void taylor_adaptive_impl<T>::finalise_ctor_impl(const U &sys, std::vector<T> state, T time, T tol, bool high_accuracy,
                                                 bool compact_mode, std::vector<T> pars, std::vector<t_event_t> tes,
                                                 std::vector<nt_event_t> ntes)
{
#if defined(HEYOKA_ARCH_PPC)
    if constexpr (std::is_same_v<T, long double>) {
        throw not_implemented_error("'long double' computations are not supported on PowerPC");
    }
#endif

    using std::isfinite;

    // Assign the data members.
    m_state = std::move(state);
    m_time = dfloat<T>(time);
    m_pars = std::move(pars);
    m_high_accuracy = high_accuracy;
    m_compact_mode = compact_mode;

    // Check input params.
    if (std::any_of(m_state.begin(), m_state.end(), [](const auto &x) { return !isfinite(x); })) {
        throw std::invalid_argument(
            "A non-finite value was detected in the initial state of an adaptive Taylor integrator");
    }

    if (m_state.size() != sys.size()) {
        throw std::invalid_argument(
            "Inconsistent sizes detected in the initialization of an adaptive Taylor "
            "integrator: the state vector has a dimension of {}, while the number of equations is {}"_format(
                m_state.size(), sys.size()));
    }

    if (!isfinite(m_time)) {
        throw std::invalid_argument(
            "Cannot initialise an adaptive Taylor integrator with a non-finite initial time of {}"_format(
                static_cast<T>(m_time)));
    }

    if (!isfinite(tol) || tol <= 0) {
        throw std::invalid_argument(
            "The tolerance in an adaptive Taylor integrator must be finite and positive, but it is {} instead"_format(
                tol));
    }

    // Store the tolerance.
    m_tol = tol;

    // Store the dimension of the system.
    m_dim = boost::numeric_cast<std::uint32_t>(sys.size());

    // Do we have events?
    const auto with_events = !tes.empty() || !ntes.empty();

    // Temporarily disable optimisations in s, so that
    // we don't optimise twice when adding the step
    // and then the d_out.
    std::optional<opt_disabler> od(m_llvm);

    // Add the stepper function.
    if (with_events) {
        std::vector<expression> ee;
        // NOTE: no need for deep copies of the expressions: ee is never mutated
        // and we will be deep-copying it anyway when we do the decomposition.
        for (const auto &ev : tes) {
            ee.push_back(ev.get_expression());
        }
        for (const auto &ev : ntes) {
            ee.push_back(ev.get_expression());
        }

        // NOTE(review): high_accuracy is passed twice in this call — confirm
        // against the signature of taylor_add_adaptive_step_with_events().
        std::tie(m_dc, m_order) = taylor_add_adaptive_step_with_events<T>(m_llvm, "step_e", sys, tol, 1, high_accuracy,
                                                                          compact_mode, ee, high_accuracy);
    } else {
        std::tie(m_dc, m_order) = taylor_add_adaptive_step<T>(m_llvm, "step", sys, tol, 1, high_accuracy, compact_mode);
    }

    // Fix m_pars' size, if necessary.
    // Missing parameter values are zero-filled by resize();
    // an excess of values is an error.
    const auto npars = n_pars_in_dc(m_dc);
    if (m_pars.size() < npars) {
        m_pars.resize(boost::numeric_cast<decltype(m_pars.size())>(npars));
    } else if (m_pars.size() > npars) {
        throw std::invalid_argument(
            "Excessive number of parameter values passed to the constructor of an adaptive "
            "Taylor integrator: {} parameter values were passed, but the ODE system contains only {} parameters"_format(
                m_pars.size(), npars));
    }

    // Log runtimes in trace mode.
    spdlog::stopwatch sw;

    // Add the function for the computation of
    // the dense output.
    taylor_add_d_out_function<T>(m_llvm, m_dim, m_order, 1, high_accuracy);

    get_logger()->trace("Taylor dense output runtime: {}", sw);
    sw.reset();

    // Restore the original optimisation level in s.
    od.reset();

    // Run the optimisation pass manually.
    m_llvm.optimise();

    get_logger()->trace("Taylor global opt pass runtime: {}", sw);
    sw.reset();

    // Run the jit.
    m_llvm.compile();

    get_logger()->trace("Taylor LLVM compilation runtime: {}", sw);

    // Fetch the stepper.
    // NOTE: the jit lookups must happen after m_llvm.compile().
    if (with_events) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }

    // Fetch the function to compute the dense output.
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));

    // Setup the vector for the Taylor coefficients.
    // Each state variable has order + 1 Taylor coefficients.
    // LCOV_EXCL_START
    if (m_order == std::numeric_limits<std::uint32_t>::max()
        || m_state.size() > std::numeric_limits<decltype(m_tc.size())>::max() / (m_order + 1u)) {
        throw std::overflow_error("Overflow detected in the initialisation of an adaptive Taylor integrator: the order "
                                  "or the state size is too large");
    }
    // LCOV_EXCL_STOP

    m_tc.resize(m_state.size() * (m_order + 1u));

    // Setup the vector for the dense output.
    m_d_out.resize(m_state.size());

    // Init the event data structure if needed.
    // NOTE: this can be done in parallel with the rest of the constructor,
    // once we have m_order/m_dim and we are done using tes/ntes.
    if (with_events) {
        m_ed_data = std::make_unique<ed_data>(std::move(tes), std::move(ntes), m_order, m_dim);
    }
}
2344
// Default constructor: delegate to the main constructor with the trivial
// ODE x' = 0, initial state {0} and a coarse tolerance of 1e-1.
// NOTE(review): presumably a default-constructed integrator is meant to be
// a target for assignment/deserialisation — confirm intended usage.
template <typename T>
taylor_adaptive_impl<T>::taylor_adaptive_impl()
    : taylor_adaptive_impl({prime("x"_var) = 0_dbl}, {T(0)}, kw::tol = T(1e-1))
{
}
2350
2351 template <typename T>
taylor_adaptive_impl(const taylor_adaptive_impl & other)2352 taylor_adaptive_impl<T>::taylor_adaptive_impl(const taylor_adaptive_impl &other)
2353 : m_state(other.m_state), m_time(other.m_time), m_llvm(other.m_llvm), m_dim(other.m_dim), m_order(other.m_order),
2354 m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode),
2355 m_pars(other.m_pars), m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out),
2356 m_ed_data(other.m_ed_data ? std::make_unique<ed_data>(*other.m_ed_data) : nullptr)
2357 {
2358 // NOTE: make explicit deep copy of the decomposition.
2359 m_dc.reserve(other.m_dc.size());
2360 for (const auto &[ex, deps] : other.m_dc) {
2361 m_dc.emplace_back(copy(ex), deps);
2362 }
2363
2364 if (m_ed_data) {
2365 m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
2366 } else {
2367 m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
2368 }
2369 m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
2370 }
2371
// Move constructor: all members are movable, so the
// defaulted implementation is correct.
template <typename T>
taylor_adaptive_impl<T>::taylor_adaptive_impl(taylor_adaptive_impl &&) noexcept = default;
2374
2375 template <typename T>
operator =(const taylor_adaptive_impl & other)2376 taylor_adaptive_impl<T> &taylor_adaptive_impl<T>::operator=(const taylor_adaptive_impl &other)
2377 {
2378 if (this != &other) {
2379 *this = taylor_adaptive_impl(other);
2380 }
2381
2382 return *this;
2383 }
2384
// Move assignment: all members are movable, so the
// defaulted implementation is correct.
template <typename T>
taylor_adaptive_impl<T> &taylor_adaptive_impl<T>::operator=(taylor_adaptive_impl &&) noexcept = default;
2387
// Destructor: defaulted, all members clean up via RAII.
template <typename T>
taylor_adaptive_impl<T>::~taylor_adaptive_impl() = default;
2390
2391 // NOTE: the save/load patterns mimic the copy constructor logic.
2392 template <typename T>
2393 template <typename Archive>
save_impl(Archive & ar,unsigned) const2394 void taylor_adaptive_impl<T>::save_impl(Archive &ar, unsigned) const
2395 {
2396 ar << m_state;
2397 ar << m_time;
2398 ar << m_llvm;
2399 ar << m_dim;
2400 ar << m_dc;
2401 ar << m_order;
2402 ar << m_tol;
2403 ar << m_high_accuracy;
2404 ar << m_compact_mode;
2405 ar << m_pars;
2406 ar << m_tc;
2407 ar << m_last_h;
2408 ar << m_d_out;
2409 ar << m_ed_data;
2410 }
2411
2412 template <typename T>
2413 template <typename Archive>
load_impl(Archive & ar,unsigned version)2414 void taylor_adaptive_impl<T>::load_impl(Archive &ar, unsigned version)
2415 {
2416 // LCOV_EXCL_START
2417 if (version < static_cast<unsigned>(boost::serialization::version<taylor_adaptive_impl<T>>::type::value)) {
2418 throw std::invalid_argument("Unable to load a taylor_adaptive integrator: "
2419 "the archive version ({}) is too old"_format(version));
2420 }
2421 // LCOV_EXCL_STOP
2422
2423 ar >> m_state;
2424 ar >> m_time;
2425 ar >> m_llvm;
2426 ar >> m_dim;
2427 ar >> m_dc;
2428 ar >> m_order;
2429 ar >> m_tol;
2430 ar >> m_high_accuracy;
2431 ar >> m_compact_mode;
2432 ar >> m_pars;
2433 ar >> m_tc;
2434 ar >> m_last_h;
2435 ar >> m_d_out;
2436 ar >> m_ed_data;
2437
2438 // Recover the function pointers.
2439 if (m_ed_data) {
2440 m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
2441 } else {
2442 m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
2443 }
2444 m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
2445 }
2446
// Serialise this integrator into the binary archive ar.
// v is the serialisation version, forwarded to save_impl().
template <typename T>
void taylor_adaptive_impl<T>::save(boost::archive::binary_oarchive &ar, unsigned v) const
{
    save_impl(ar, v);
}
2452
// Deserialise this integrator from the binary archive ar.
// v is the serialisation version, forwarded to load_impl().
template <typename T>
void taylor_adaptive_impl<T>::load(boost::archive::binary_iarchive &ar, unsigned v)
{
    load_impl(ar, v);
}
2458
// Implementation detail to make a single integration timestep.
// The magnitude of the timestep is automatically deduced, but it will
// always be not greater than abs(max_delta_t). The propagation
// is done forward in time if max_delta_t >= 0, backwards in
// time otherwise.
//
// If wtc is true, the Taylor coefficients of the step are
// written into m_tc.
//
// The function will return a pair (std::tuple) containing
// a flag describing the outcome of the integration,
// and the integration timestep that was used.
template <typename T>
std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step_impl(T max_delta_t, bool wtc)
{
    using std::abs;
    using std::isfinite;

#if !defined(NDEBUG)
    // NOTE: this is the only precondition on max_delta_t.
    using std::isnan;
    assert(!isnan(max_delta_t)); // LCOV_EXCL_LINE
#endif

    // h is passed to the stepper as the max timestep and is
    // overwritten with the timestep that was actually taken.
    auto h = max_delta_t;

    // Variant index 0 is the event-less stepper, index 1 the
    // stepper with event detection (see the asserts below).
    if (m_step_f.index() == 0u) {
        assert(!m_ed_data); // LCOV_EXCL_LINE

        // Invoke the vanilla stepper.
        // NOTE: only the high component of the double-length time
        // is passed to (and updated by) the compiled stepper.
        std::get<0>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr);

        // Update the time.
        m_time += h;

        // Store the last timestep.
        m_last_h = h;

        // Check if the time or the state vector are non-finite at the
        // end of the timestep.
        if (!isfinite(m_time)
            || std::any_of(m_state.cbegin(), m_state.cend(), [](const auto &x) { return !isfinite(x); })) {
            return std::tuple{taylor_outcome::err_nf_state, h};
        }

        return std::tuple{h == max_delta_t ? taylor_outcome::time_limit : taylor_outcome::success, h};
    } else {
        assert(m_ed_data); // LCOV_EXCL_LINE

        auto &ed_data = *m_ed_data;

        // Invoke the stepper for event handling. We will record the norm infinity of the state vector +
        // event equations at the beginning of the timestep for later use.
        T max_abs_state;
        std::get<1>(m_step_f)(ed_data.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state);

        // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for
        // automatic cooldown deduction. If max_abs_state is not finite, set it to inf so that
        // in ed_data.detect_events() we skip event detection altogether.
        T g_eps;
        if (isfinite(max_abs_state)) {
            // Are we in absolute or relative error control mode?
            const auto abs_or_rel = max_abs_state < 1;

            // Estimate the size of the largest remainder in the Taylor
            // series of both the dynamical equations and the events.
            const auto max_r_size = abs_or_rel ? m_tol : (m_tol * max_abs_state);

            // NOTE: depending on m_tol, max_r_size is arbitrarily small, but the real
            // integration error cannot be too small due to floating-point truncation.
            // This is the case for instance if we use sub-epsilon integration tolerances
            // to achieve Brouwer's law. In such a case, we cap the value of g_eps,
            // using eps * max_abs_state as an estimation of the smallest number
            // that can be resolved with the current floating-point type.
            // NOTE: the if condition in the next line is equivalent, in relative
            // error control mode, to:
            // if (m_tol < std::numeric_limits<T>::epsilon())
            if (max_r_size < std::numeric_limits<T>::epsilon() * max_abs_state) {
                g_eps = std::numeric_limits<T>::epsilon() * max_abs_state;
            } else {
                g_eps = max_r_size;
            }
        } else {
            g_eps = std::numeric_limits<T>::infinity();
        }

        // Write unconditionally the tcs.
        std::copy(ed_data.m_ev_jet.data(), ed_data.m_ev_jet.data() + m_dim * (m_order + 1u), m_tc.data());

        // Do the event detection.
        ed_data.detect_events(h, m_order, m_dim, g_eps);

        // NOTE: before this point, we did not alter
        // any user-visible data in the integrator (just
        // temporary memory). From here until we start invoking
        // the callbacks, everything is noexcept, so we don't
        // risk leaving the integrator in a half-baked state.

        // Sort the events by time.
        // NOTE: the time coordinates in m_d_(n)tes is relative
        // to the beginning of the timestep. It will be negative
        // for backward integration, thus we compare using
        // abs() so that the first events are those which
        // happen closer to the beginning of the timestep.
        // NOTE: the checks inside ed_data.detect_events() ensure
        // that we can safely sort the events' times.
        auto cmp = [](const auto &ev0, const auto &ev1) { return abs(std::get<1>(ev0)) < abs(std::get<1>(ev1)); };
        std::sort(ed_data.m_d_tes.begin(), ed_data.m_d_tes.end(), cmp);
        std::sort(ed_data.m_d_ntes.begin(), ed_data.m_d_ntes.end(), cmp);

        // If we have terminal events we need
        // to update the value of h.
        // NOTE: the timestep is clamped to the trigger time of
        // the earliest terminal event.
        if (!ed_data.m_d_tes.empty()) {
            h = std::get<1>(ed_data.m_d_tes[0]);
        }

        // Update the state.
        // NOTE: the state is propagated to the (possibly clamped)
        // time m_time + h via the dense output function.
        m_d_out_f(m_state.data(), ed_data.m_ev_jet.data(), &h);

        // Update the time.
        m_time += h;

        // Store the last timestep.
        m_last_h = h;

        // Check if the time or the state vector are non-finite at the
        // end of the timestep.
        if (!isfinite(m_time)
            || std::any_of(m_state.cbegin(), m_state.cend(), [](const auto &x) { return !isfinite(x); })) {
            // Let's also reset the cooldown values, as at this point
            // they have become useless.
            reset_cooldowns();

            return std::tuple{taylor_outcome::err_nf_state, h};
        }

        // Update the cooldowns.
        for (auto &cd : ed_data.m_te_cooldowns) {
            if (cd) {
                // Check if the timestep we just took
                // brought this event outside the cooldown.
                auto tmp = cd->first + h;

                if (abs(tmp) >= cd->second) {
                    // We are now outside the cooldown period
                    // for this event, reset cd.
                    cd.reset();
                } else {
                    // Still in cooldown, update the
                    // time spent in cooldown.
                    cd->first = tmp;
                }
            }
        }

        // If we don't have terminal events, we will invoke the callbacks
        // of *all* the non-terminal events. Otherwise, we need to figure
        // out which non-terminal events do not happen because their time
        // coordinate is past the first terminal event.
        const auto ntes_end_it
            = ed_data.m_d_tes.empty()
                  ? ed_data.m_d_ntes.end()
                  : std::lower_bound(ed_data.m_d_ntes.begin(), ed_data.m_d_ntes.end(), h,
                                     [](const auto &ev, const auto &t) { return abs(std::get<1>(ev)) < abs(t); });

        // Invoke the callbacks of the non-terminal events, which are guaranteed
        // to happen before the first terminal event.
        // NOTE: the event time passed to the callback is reconstructed
        // as (time at the start of the step) + (event time within the step).
        for (auto it = ed_data.m_d_ntes.begin(); it != ntes_end_it; ++it) {
            const auto &t = *it;
            const auto &cb = ed_data.m_ntes[std::get<0>(t)].get_callback();
            assert(cb); // LCOV_EXCL_LINE
            cb(*this, static_cast<T>(m_time - m_last_h + std::get<1>(t)), std::get<2>(t));
        }

        // The return value of the first
        // terminal event's callback. It will be
        // unused if there are no terminal events.
        bool te_cb_ret = false;

        if (!ed_data.m_d_tes.empty()) {
            // Fetch the first terminal event.
            const auto te_idx = std::get<0>(ed_data.m_d_tes[0]);
            assert(te_idx < ed_data.m_tes.size()); // LCOV_EXCL_LINE
            const auto &te = ed_data.m_tes[te_idx];

            // Set the corresponding cooldown.
            if (te.get_cooldown() >= 0) {
                // Cooldown explicitly provided by the user, use it.
                ed_data.m_te_cooldowns[te_idx].emplace(0, te.get_cooldown());
            } else {
                // Deduce the cooldown automatically.
                // NOTE: if g_eps is not finite, we skipped event detection
                // altogether and thus we never end up here. If the derivative
                // of the event equation is not finite, the event is also skipped.
                ed_data.m_te_cooldowns[te_idx].emplace(0,
                                                       taylor_deduce_cooldown(g_eps, std::get<4>(ed_data.m_d_tes[0])));
            }

            // Invoke the callback of the first terminal event, if it has one.
            if (te.get_callback()) {
                te_cb_ret = te.get_callback()(*this, std::get<2>(ed_data.m_d_tes[0]), std::get<3>(ed_data.m_d_tes[0]));
            }
        }

        if (ed_data.m_d_tes.empty()) {
            // No terminal events detected, return success or time limit.
            return std::tuple{h == max_delta_t ? taylor_outcome::time_limit : taylor_outcome::success, h};
        } else {
            // Terminal event detected. Fetch its index.
            const auto ev_idx = static_cast<std::int64_t>(std::get<0>(ed_data.m_d_tes[0]));

            // NOTE: if te_cb_ret is true, it means that the terminal event has
            // a callback and its invocation returned true (meaning that the
            // integration should continue). Otherwise, either the terminal event
            // has no callback or its callback returned false, meaning that the
            // integration must stop.
            // The event index is encoded in the outcome: ev_idx for a
            // continuing event, -ev_idx - 1 for a stopping one.
            return std::tuple{taylor_outcome{te_cb_ret ? ev_idx : (-ev_idx - 1)}, h};
        }
    }
}
2676
2677 template <typename T>
step(bool wtc)2678 std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step(bool wtc)
2679 {
2680 // NOTE: time limit +inf means integration forward in time
2681 // and no time limit.
2682 return step_impl(std::numeric_limits<T>::infinity(), wtc);
2683 }
2684
2685 template <typename T>
step_backward(bool wtc)2686 std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step_backward(bool wtc)
2687 {
2688 return step_impl(-std::numeric_limits<T>::infinity(), wtc);
2689 }
2690
2691 template <typename T>
step(T max_delta_t,bool wtc)2692 std::tuple<taylor_outcome, T> taylor_adaptive_impl<T>::step(T max_delta_t, bool wtc)
2693 {
2694 using std::isnan;
2695
2696 if (isnan(max_delta_t)) {
2697 throw std::invalid_argument(
2698 "A NaN max_delta_t was passed to the step() function of an adaptive Taylor integrator");
2699 }
2700
2701 return step_impl(max_delta_t, wtc);
2702 }
2703
2704 // Reset all cooldowns for the terminal events.
2705 template <typename T>
reset_cooldowns()2706 void taylor_adaptive_impl<T>::reset_cooldowns()
2707 {
2708 if (!m_ed_data) {
2709 throw std::invalid_argument("No events were defined for this integrator");
2710 }
2711
2712 for (auto &cd : m_ed_data->m_te_cooldowns) {
2713 cd.reset();
2714 }
2715 }
2716
// Implementation of propagate_until(): integrate the system up to the
// epoch t (in double-length format), taking at most max_steps timesteps
// (0 meaning no limit), each clamped in magnitude by max_delta_t.
// cb, if present, is invoked after each successful step and stops the
// integration if it returns false. If wtc is true, the Taylor
// coefficients are written at each step; if with_c_out is true, a
// continuous_output object is assembled and returned.
//
// Returns (outcome, min |h|, max |h|, number of nonzero steps,
// optional continuous output).
template <typename T>
std::tuple<taylor_outcome, T, T, std::size_t, std::optional<continuous_output<T>>>
taylor_adaptive_impl<T>::propagate_until_impl(const dfloat<T> &t, std::size_t max_steps, T max_delta_t,
                                              std::function<bool(taylor_adaptive_impl &)> cb, bool wtc, bool with_c_out)
{
    using std::abs;
    using std::isfinite;
    using std::isnan;

    // Check the current time.
    if (!isfinite(m_time)) {
        throw std::invalid_argument("Cannot invoke the propagate_until() function of an adaptive Taylor integrator if "
                                    "the current time is not finite");
    }

    // Check the final time.
    if (!isfinite(t)) {
        throw std::invalid_argument(
            "A non-finite time was passed to the propagate_until() function of an adaptive Taylor integrator");
    }

    // Check max_delta_t.
    if (isnan(max_delta_t)) {
        throw std::invalid_argument(
            "A nan max_delta_t was passed to the propagate_until() function of an adaptive Taylor integrator");
    }
    if (max_delta_t <= 0) {
        throw std::invalid_argument(
            "A non-positive max_delta_t was passed to the propagate_until() function of an adaptive Taylor integrator");
    }

    // If with_c_out is true, we always need to write the Taylor coefficients.
    wtc = wtc || with_c_out;

    // These vectors are used in the construction of the continuous output.
    // If continuous output is not requested, they will remain empty.
    std::vector<T> c_out_tcs, c_out_times_hi, c_out_times_lo;
    if (with_c_out) {
        // Push in the starting time.
        c_out_times_hi.push_back(m_time.hi);
        c_out_times_lo.push_back(m_time.lo);
    }

    // Initial values for the counters
    // and the min/max abs of the integration
    // timesteps.
    // NOTE: iter_counter is for keeping track of the max_steps
    // limits, step_counter counts the number of timesteps performed
    // with a nonzero h. Most of the time these two quantities
    // will be identical, apart from corner cases.
    std::size_t iter_counter = 0, step_counter = 0;
    T min_h = std::numeric_limits<T>::infinity(), max_h(0);

    // Init the remaining time.
    auto rem_time = t - m_time;

    // Check it.
    if (!isfinite(rem_time)) {
        throw std::invalid_argument("The final time passed to the propagate_until() function of an adaptive Taylor "
                                    "integrator results in an overflow condition");
    }

    // Cache the integration direction.
    const auto t_dir = (rem_time >= T(0));

    // Helper to create the continuous output object.
    auto make_c_out = [&]() -> std::optional<continuous_output<T>> {
        if (with_c_out) {
            if (c_out_times_hi.size() < 2u) {
                // NOTE: this means that no successful steps
                // were taken.
                return {};
            }

            // Construct the return value.
            continuous_output<T> ret(m_llvm.make_similar());

            // Fill in the data.
            ret.m_tcs = std::move(c_out_tcs);
            ret.m_times_hi = std::move(c_out_times_hi);
            ret.m_times_lo = std::move(c_out_times_lo);

            // Prepare the output vector.
            ret.m_output.resize(boost::numeric_cast<decltype(ret.m_output.size())>(m_dim));

            // Add the continuous output function.
            ret.add_c_out_function(m_order, m_dim, m_high_accuracy);

            return std::optional{std::move(ret)};
        } else {
            return {};
        }
    };

    // Helper to update the continuous output data after a timestep.
    // In debug mode, it also checks that the time values are
    // monotonic in the integration direction.
    auto update_c_out = [&]() {
        if (with_c_out) {
#if !defined(NDEBUG)
            const dfloat<T> prev_time(c_out_times_hi.back(), c_out_times_lo.back());
#endif

            c_out_times_hi.push_back(m_time.hi);
            c_out_times_lo.push_back(m_time.lo);

#if !defined(NDEBUG)
            const dfloat<T> new_time(c_out_times_hi.back(), c_out_times_lo.back());
            assert(isfinite(new_time));
            if (t_dir) {
                assert(!(new_time < prev_time));
            } else {
                assert(!(new_time > prev_time));
            }
#endif

            c_out_tcs.insert(c_out_tcs.end(), m_tc.begin(), m_tc.end());
        }
    };

    while (true) {
        // Compute the max integration times for this timestep.
        // NOTE: rem_time is guaranteed to be finite: we check it explicitly above
        // and we keep on decreasing its magnitude at the following iterations.
        // If some non-finite state/time is generated in
        // the step function, the integration will be stopped.
        assert((rem_time >= T(0)) == t_dir); // LCOV_EXCL_LINE
        const auto dt_limit
            = t_dir ? std::min(dfloat<T>(max_delta_t), rem_time) : std::max(dfloat<T>(-max_delta_t), rem_time);
        // NOTE: if dt_limit is zero, step_impl() will always return time_limit.
        const auto [res, h] = step_impl(static_cast<T>(dt_limit), wtc);

        if (res != taylor_outcome::success && res != taylor_outcome::time_limit && res < taylor_outcome{0}) {
            // Something went wrong in the propagation of the timestep, or we reached
            // a stopping terminal event
            // NOTE(review): res > success here presumably distinguishes a stopping
            // terminal event from an error outcome - confirm against the
            // taylor_outcome enum values.
            if (res > taylor_outcome::success) {
                // In case of a stopping terminal event, we still want to update
                // the continuous output and the step counter.
                update_c_out();
                step_counter += static_cast<std::size_t>(h != 0);
            }

            return std::tuple{res, min_h, max_h, step_counter, make_c_out()};
        }

        // Update the continuous output data.
        update_c_out();

        // Update the number of iterations.
        ++iter_counter;

        // Update the number of steps.
        step_counter += static_cast<std::size_t>(h != 0);

        // Update min_h/max_h, but only if the outcome is success (otherwise
        // the step was artificially clamped either by a time limit or
        // by a terminal event).
        if (res == taylor_outcome::success) {
            const auto abs_h = abs(h);
            min_h = std::min(min_h, abs_h);
            max_h = std::max(max_h, abs_h);
        }

        // The step was successful, execute the callback if applicable.
        if (cb && !cb(*this)) {
            // Interruption via callback.
            return std::tuple{taylor_outcome::cb_stop, min_h, max_h, step_counter, make_c_out()};
        }

        // Break out if the final time is reached,
        // NOTE: here we check h == rem_time, instead of just
        // res == time_limit, because clamping via max_delta_t
        // could also result in time_limit.
        if (h == static_cast<T>(rem_time)) {
            assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
            return std::tuple{taylor_outcome::time_limit, min_h, max_h, step_counter, make_c_out()};
        }

        // Check the iteration limit.
        // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
        // then this condition will never trigger as by this point we are
        // sure iter_counter is at least 1.
        if (iter_counter == max_steps) {
            return std::tuple{taylor_outcome::step_limit, min_h, max_h, step_counter, make_c_out()};
        }

        // Update the remaining time.
        // NOTE: in principle, due to the fact that
        // dt_limit is computed in extended
        // precision but it is cast to normal precision
        // before being added to m_time in step_impl(),
        // there could be numerical inconsistencies
        // at the last timestep which could result in rem_time
        // not exactly zero or with a flipped sign.
        // For this function, this should
        // not matter because if static_cast<T>(rem_time) was used
        // as an integration step, we will have already exited
        // above before anything bad can
        // happen. If anything smaller than static_cast<T>(rem_time)
        // was used as timestep, rem_time should not change sign.
        rem_time = t - m_time;
    }
}
2918
2919 template <typename T>
2920 std::tuple<taylor_outcome, T, T, std::size_t, std::vector<T>>
propagate_grid_impl(const std::vector<T> & grid,std::size_t max_steps,T max_delta_t,std::function<bool (taylor_adaptive_impl &)> cb)2921 taylor_adaptive_impl<T>::propagate_grid_impl(const std::vector<T> &grid, std::size_t max_steps, T max_delta_t,
2922 std::function<bool(taylor_adaptive_impl &)> cb)
2923 {
2924 using std::abs;
2925 using std::isfinite;
2926 using std::isnan;
2927
2928 if (!isfinite(m_time)) {
2929 throw std::invalid_argument(
2930 "Cannot invoke propagate_grid() in an adaptive Taylor integrator if the current time is not finite");
2931 }
2932
2933 // Check max_delta_t.
2934 if (isnan(max_delta_t)) {
2935 throw std::invalid_argument(
2936 "A nan max_delta_t was passed to the propagate_grid() function of an adaptive Taylor integrator");
2937 }
2938 if (max_delta_t <= 0) {
2939 throw std::invalid_argument(
2940 "A non-positive max_delta_t was passed to the propagate_grid() function of an adaptive Taylor integrator");
2941 }
2942
2943 // Check the grid.
2944 if (grid.empty()) {
2945 throw std::invalid_argument(
2946 "Cannot invoke propagate_grid() in an adaptive Taylor integrator if the time grid is empty");
2947 }
2948
2949 constexpr auto nf_err_msg
2950 = "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator";
2951 constexpr auto ig_err_msg
2952 = "A non-monotonic time grid was passed to propagate_grid() in an adaptive Taylor integrator";
2953 // Check the first point.
2954 if (!isfinite(grid[0])) {
2955 throw std::invalid_argument(nf_err_msg);
2956 }
2957 if (grid.size() > 1u) {
2958 // Establish the direction of the grid from
2959 // the first two points.
2960 if (!isfinite(grid[1])) {
2961 throw std::invalid_argument(nf_err_msg);
2962 }
2963 if (grid[1] == grid[0]) {
2964 throw std::invalid_argument(ig_err_msg);
2965 }
2966 const auto grid_direction = grid[1] > grid[0];
2967
2968 // Check that the remaining points are finite and that
2969 // they are ordered monotonically.
2970 for (decltype(grid.size()) i = 2; i < grid.size(); ++i) {
2971 if (!isfinite(grid[i])) {
2972 throw std::invalid_argument(nf_err_msg);
2973 }
2974
2975 if ((grid[i] > grid[i - 1u]) != grid_direction) {
2976 throw std::invalid_argument(ig_err_msg);
2977 }
2978 }
2979 }
2980
2981 // Pre-allocate the return value.
2982 std::vector<T> retval;
2983 // LCOV_EXCL_START
2984 if (get_dim() > std::numeric_limits<decltype(retval.size())>::max() / grid.size()) {
2985 throw std::overflow_error("Overflow detected in the creation of the return value of propagate_grid() in an "
2986 "adaptive Taylor integrator");
2987 }
2988 // LCOV_EXCL_STOP
2989 retval.reserve(grid.size() * get_dim());
2990
2991 // Initial values for the counters
2992 // and the min/max abs of the integration
2993 // timesteps.
2994 // NOTE: iter_counter is for keeping track of the max_steps
2995 // limits, step_counter counts the number of timesteps performed
2996 // with a nonzero h. Most of the time these two quantities
2997 // will be identical, apart from corner cases.
2998 std::size_t iter_counter = 0, step_counter = 0;
2999 T min_h = std::numeric_limits<T>::infinity(), max_h(0);
3000
3001 // Propagate the system up to the first grid point.
3002 // NOTE: this may not be needed strictly speaking if
3003 // the time is already grid[0], but it will ensure that
3004 // m_last_h is properly updated.
3005 // NOTE: this will *not* write the TCs, but, because we
3006 // know that the grid is strictly monotonic, we know that we
3007 // will take at least 1 TC-writing timestep before starting
3008 // to use the dense output.
3009 // NOTE: use the same max_steps for the initial propagation,
3010 // and don't pass the callback.
3011 const auto oc = std::get<0>(propagate_until(grid[0], kw::max_delta_t = max_delta_t, kw::max_steps = max_steps));
3012
3013 if (oc != taylor_outcome::time_limit && oc < taylor_outcome{0}) {
3014 // The outcome is not time_limit and it is not a continuing
3015 // terminal event. This means that a non-finite state was
3016 // encountered, or a stopping terminal event triggered, or
3017 // the step limit was hit.
3018 return std::tuple{oc, min_h, max_h, step_counter, std::move(retval)};
3019 }
3020
3021 // Add the first result to retval.
3022 retval.insert(retval.end(), m_state.begin(), m_state.end());
3023
3024 // Init the remaining time.
3025 auto rem_time = grid.back() - m_time;
3026
3027 // Check it.
3028 if (!isfinite(rem_time)) {
3029 throw std::invalid_argument("The final time passed to the propagate_grid() function of an adaptive Taylor "
3030 "integrator results in an overflow condition");
3031 }
3032
3033 // Cache the integration direction.
3034 const auto t_dir = (rem_time >= T(0));
3035
3036 // Iterate over the remaining grid points.
3037 for (decltype(grid.size()) cur_grid_idx = 1; cur_grid_idx < grid.size();) {
3038 // Establish the time range of the last
3039 // taken timestep.
3040 // NOTE: t0 < t1.
3041 const auto t0 = std::min(m_time, m_time - m_last_h);
3042 const auto t1 = std::max(m_time, m_time - m_last_h);
3043
3044 // Compute the state of the system via dense output for as many grid
3045 // points as possible, i.e., as long as the grid times
3046 // fall within the validity range for the dense output.
3047 while (true) {
3048 // Fetch the current time target.
3049 const auto cur_tt = grid[cur_grid_idx];
3050
3051 // NOTE: we force processing of all remaining grid points
3052 // if we are at the last timestep. We do this in order to avoid
3053 // numerical issues when deciding if the last grid point
3054 // falls within the range of validity of the dense output.
3055 if ((cur_tt >= t0 && cur_tt <= t1) || (rem_time == dfloat<T>(T(0)))) {
3056 // The current time target falls within the range of
3057 // validity of the dense output. Compute the dense
3058 // output in cur_tt.
3059 update_d_output(cur_tt);
3060
3061 // Add the result to retval.
3062 retval.insert(retval.end(), m_d_out.begin(), m_d_out.end());
3063 } else {
3064 // Cannot use dense output on the current time target,
3065 // need to take another step.
3066 break;
3067 }
3068
3069 // Move to the next time target, or break out
3070 // if we have no more.
3071 if (++cur_grid_idx == grid.size()) {
3072 break;
3073 }
3074 }
3075
3076 if (cur_grid_idx == grid.size()) {
3077 // No more grid points, exit.
3078 break;
3079 }
3080
3081 // Take the next step, making sure to write the Taylor coefficients
3082 // and to cap the timestep size so that we don't go past the
3083 // last grid point and we don't use a timestep exceeding max_delta_t.
3084 // NOTE: rem_time is guaranteed to be finite: we check it explicitly above
3085 // and we keep on decreasing its magnitude at the following iterations.
3086 // If some non-finite state/time is generated in
3087 // the step function, the integration will be stopped.
3088 assert((rem_time >= T(0)) == t_dir); // LCOV_EXCL_LINE
3089 const auto dt_limit
3090 = t_dir ? std::min(dfloat<T>(max_delta_t), rem_time) : std::max(dfloat<T>(-max_delta_t), rem_time);
3091 const auto [res, h] = step_impl(static_cast<T>(dt_limit), true);
3092
3093 if (res != taylor_outcome::success && res != taylor_outcome::time_limit && res < taylor_outcome{0}) {
3094 // Something went wrong in the propagation of the timestep, or we reached
3095 // a stopping terminal event.
3096
3097 if (res > taylor_outcome::success) {
3098 // In case of a stopping terminal event, we still want to update
3099 // the step counter.
3100 step_counter += static_cast<std::size_t>(h != 0);
3101 }
3102
3103 return std::tuple{res, min_h, max_h, step_counter, std::move(retval)};
3104 }
3105
3106 // Update the number of iterations.
3107 ++iter_counter;
3108
3109 // Update the number of steps.
3110 step_counter += static_cast<std::size_t>(h != 0);
3111
3112 // Update min_h/max_h, but only if the outcome is success (otherwise
3113 // the step was artificially clamped either by a time limit or
3114 // by a terminal event).
3115 if (res == taylor_outcome::success) {
3116 const auto abs_h = abs(h);
3117 min_h = std::min(min_h, abs_h);
3118 max_h = std::max(max_h, abs_h);
3119 }
3120
3121 // Step successful: invoke the callback, if needed.
3122 if (cb && !cb(*this)) {
3123 // Interruption via callback.
3124 return std::tuple{taylor_outcome::cb_stop, min_h, max_h, step_counter, std::move(retval)};
3125 }
3126
3127 // Check the iteration limit.
3128 // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
3129 // then this condition will never trigger as by this point we are
3130 // sure iter_counter is at least 1.
3131 if (iter_counter == max_steps) {
3132 return std::tuple{taylor_outcome::step_limit, min_h, max_h, step_counter, std::move(retval)};
3133 }
3134
3135 // Update the remaining time.
3136 // NOTE: if static_cast<T>(rem_time) was used as a timestep,
3137 // it means that we hit the time limit. Force rem_time to zero
3138 // to signal this, avoiding inconsistencies with grid.back() - m_time
3139 // not going exactly to zero due to numerical issues. A zero rem_time
3140 // will also force the processing of all remaining grid points.
3141 if (h == static_cast<T>(rem_time)) {
3142 assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
3143 rem_time = dfloat<T>(T(0));
3144 } else {
3145 rem_time = grid.back() - m_time;
3146 }
3147 }
3148
3149 // Everything went well, return time_limit.
3150 return std::tuple{taylor_outcome::time_limit, min_h, max_h, step_counter, std::move(retval)};
3151 }
3152
template <typename T>
const llvm_state &taylor_adaptive_impl<T>::get_llvm_state() const
{
    // Read-only access to the internal llvm_state of the integrator.
    return m_llvm;
}
3158
template <typename T>
const taylor_dc_t &taylor_adaptive_impl<T>::get_decomposition() const
{
    // Read-only access to the Taylor decomposition of the ODE system.
    return m_dc;
}
3164
template <typename T>
std::uint32_t taylor_adaptive_impl<T>::get_order() const
{
    // The Taylor order of the integrator (fixed at construction time).
    return m_order;
}
3170
template <typename T>
T taylor_adaptive_impl<T>::get_tol() const
{
    // The integration tolerance (fixed at construction time).
    return m_tol;
}
3176
template <typename T>
bool taylor_adaptive_impl<T>::get_high_accuracy() const
{
    // Whether the integrator was constructed in high-accuracy mode.
    return m_high_accuracy;
}
3182
template <typename T>
bool taylor_adaptive_impl<T>::get_compact_mode() const
{
    // Whether the integrator was constructed in compact mode.
    return m_compact_mode;
}
3188
template <typename T>
std::uint32_t taylor_adaptive_impl<T>::get_dim() const
{
    // The dimension of the ODE system (number of state variables).
    return m_dim;
}
3194
3195 template <typename T>
update_d_output(T time,bool rel_time)3196 const std::vector<T> &taylor_adaptive_impl<T>::update_d_output(T time, bool rel_time)
3197 {
3198 // NOTE: "time" needs to be translated
3199 // because m_d_out_f expects a time coordinate
3200 // with respect to the starting time t0 of
3201 // the *previous* timestep.
3202 if (rel_time) {
3203 // Time coordinate relative to the current time.
3204 const auto h = m_last_h + time;
3205
3206 m_d_out_f(m_d_out.data(), m_tc.data(), &h);
3207 } else {
3208 // Absolute time coordinate.
3209 const auto h = time - (m_time - m_last_h);
3210
3211 m_d_out_f(m_d_out.data(), m_tc.data(), &h.hi);
3212 }
3213
3214 return m_d_out;
3215 }
3216
// Explicit instantiation of the implementation classes/functions.
// NOTE: on Windows apparently it is necessary to declare that
// these instantiations are meant to be dll-exported.

// Double-precision scalar integrator.
template class taylor_adaptive_impl<double>;

// finalise_ctor_impl() overloads for the two supported forms of the ODE
// definition: a vector of expressions, or a vector of (variable, expression) pairs.
template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<double>::finalise_ctor_impl(const std::vector<expression> &,
                                                                                 std::vector<double>, double, double,
                                                                                 bool, bool, std::vector<double>,
                                                                                 std::vector<t_event_t>,
                                                                                 std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void
taylor_adaptive_impl<double>::finalise_ctor_impl(const std::vector<std::pair<expression, expression>> &,
                                                 std::vector<double>, double, double, bool, bool, std::vector<double>,
                                                 std::vector<t_event_t>, std::vector<nt_event_t>);

// Extended-precision scalar integrator.
template class taylor_adaptive_impl<long double>;

template HEYOKA_DLL_PUBLIC void
taylor_adaptive_impl<long double>::finalise_ctor_impl(const std::vector<expression> &, std::vector<long double>,
                                                      long double, long double, bool, bool, std::vector<long double>,
                                                      std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<long double>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<long double>, long double, long double, bool,
    bool, std::vector<long double>, std::vector<t_event_t>, std::vector<nt_event_t>);

#if defined(HEYOKA_HAVE_REAL128)

// Quadruple-precision scalar integrator (only if mp++'s real128 is available).
template class taylor_adaptive_impl<mppp::real128>;

template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<mppp::real128>, mppp::real128, mppp::real128, bool, bool,
    std::vector<mppp::real128>, std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<mppp::real128>, mppp::real128, mppp::real128,
    bool, bool, std::vector<mppp::real128>, std::vector<t_event_t>, std::vector<nt_event_t>);

#endif
3257
3258 } // namespace detail
3259
3260 namespace detail
3261 {
3262
// NOTE: this implements the bulk of the construction of a batch integrator:
// validation of the input arguments, LLVM code generation of the stepper and
// of the dense output function, JIT compilation, and setup of the internal
// buffers. U is the type of the ODE system definition (either a vector of
// expressions or a vector of (variable, expression) pairs - see the explicit
// instantiations). state, time and pars are moved into the integrator.
// tes/ntes are the terminal and non-terminal events.
template <typename T>
template <typename U>
void taylor_adaptive_batch_impl<T>::finalise_ctor_impl(const U &sys, std::vector<T> state, std::uint32_t batch_size,
                                                       std::vector<T> time, T tol, bool high_accuracy,
                                                       bool compact_mode, std::vector<T> pars,
                                                       std::vector<t_event_t> tes, std::vector<nt_event_t> ntes)
{
#if defined(HEYOKA_ARCH_PPC)
    if constexpr (std::is_same_v<T, long double>) {
        throw not_implemented_error("'long double' computations are not supported on PowerPC");
    }
#endif

    using std::isfinite;

    // Init the data members.
    m_batch_size = batch_size;
    m_state = std::move(state);
    m_time_hi = std::move(time);
    m_time_lo.resize(m_time_hi.size());
    m_pars = std::move(pars);
    m_high_accuracy = high_accuracy;
    m_compact_mode = compact_mode;

    // Check input params.
    if (m_batch_size == 0u) {
        throw std::invalid_argument("The batch size in an adaptive Taylor integrator cannot be zero");
    }

    if (std::any_of(m_state.begin(), m_state.end(), [](const auto &x) { return !isfinite(x); })) {
        throw std::invalid_argument(
            "A non-finite value was detected in the initial state of an adaptive Taylor integrator");
    }

    // The state size must be an exact multiple of the batch size.
    if (m_state.size() % m_batch_size != 0u) {
        throw std::invalid_argument(
            "Invalid size detected in the initialization of an adaptive Taylor "
            "integrator: the state vector has a size of {}, which is not a multiple of the batch size ({})"_format(
                m_state.size(), m_batch_size));
    }

    // The number of state variables per batch element must match the
    // number of equations in the system.
    if (m_state.size() / m_batch_size != sys.size()) {
        throw std::invalid_argument(
            "Inconsistent sizes detected in the initialization of an adaptive Taylor "
            "integrator: the state vector has a dimension of {} and a batch size of {}, "
            "while the number of equations is {}"_format(m_state.size() / m_batch_size, m_batch_size, sys.size()));
    }

    // One time coordinate per batch element.
    if (m_time_hi.size() != m_batch_size) {
        throw std::invalid_argument(
            "Invalid size detected in the initialization of an adaptive Taylor "
            "integrator: the time vector has a size of {}, which is not equal to the batch size ({})"_format(
                m_time_hi.size(), m_batch_size));
    }
    // NOTE: no need to check m_time_lo for finiteness, as it
    // was inited to zero already.
    if (std::any_of(m_time_hi.begin(), m_time_hi.end(), [](const auto &x) { return !isfinite(x); })) {
        throw std::invalid_argument(
            "A non-finite initial time was detected in the initialisation of an adaptive Taylor integrator");
    }

    if (!isfinite(tol) || tol <= 0) {
        throw std::invalid_argument(
            "The tolerance in an adaptive Taylor integrator must be finite and positive, but it is {} instead"_format(
                tol));
    }

    // Store the tolerance.
    m_tol = tol;

    // Store the dimension of the system.
    m_dim = boost::numeric_cast<std::uint32_t>(sys.size());

    // Do we have events?
    const auto with_events = !tes.empty() || !ntes.empty();

    // Temporarily disable optimisations in s, so that
    // we don't optimise twice when adding the step
    // and then the d_out.
    std::optional<opt_disabler> od(m_llvm);

    // Add the stepper function.
    if (with_events) {
        // Collect the event equations from both the terminal
        // and non-terminal events.
        std::vector<expression> ee;
        // NOTE: no need for deep copies of the expressions: ee is never mutated
        // and we will be deep-copying it anyway when we do the decomposition.
        for (const auto &ev : tes) {
            ee.push_back(ev.get_expression());
        }
        for (const auto &ev : ntes) {
            ee.push_back(ev.get_expression());
        }

        std::tie(m_dc, m_order) = taylor_add_adaptive_step_with_events<T>(
            m_llvm, "step_e", sys, tol, batch_size, high_accuracy, compact_mode, ee, high_accuracy);
    } else {
        std::tie(m_dc, m_order)
            = taylor_add_adaptive_step<T>(m_llvm, "step", sys, tol, batch_size, high_accuracy, compact_mode);
    }

    // Fix m_pars' size, if necessary.
    const auto npars = n_pars_in_dc(m_dc);
    // LCOV_EXCL_START
    if (npars > std::numeric_limits<std::uint32_t>::max() / m_batch_size) {
        throw std::overflow_error("Overflow detected when computing the size of the parameter array in an adaptive "
                                  "Taylor integrator in batch mode");
    }
    // LCOV_EXCL_STOP
    if (m_pars.size() < npars * m_batch_size) {
        // Too few parameter values were provided: pad with zeroes.
        m_pars.resize(boost::numeric_cast<decltype(m_pars.size())>(npars * m_batch_size));
    } else if (m_pars.size() > npars * m_batch_size) {
        // Too many parameter values: this is an error.
        throw std::invalid_argument("Excessive number of parameter values passed to the constructor of an adaptive "
                                    "Taylor integrator in batch mode: {} parameter values were passed, but the ODE "
                                    "system contains only {} parameters "
                                    "(in batches of {})"_format(m_pars.size(), npars, m_batch_size));
    }

    // Log runtimes in trace mode.
    spdlog::stopwatch sw;

    // Add the function for the computation of
    // the dense output.
    taylor_add_d_out_function<T>(m_llvm, m_dim, m_order, m_batch_size, high_accuracy);

    get_logger()->trace("Taylor batch dense output runtime: {}", sw);
    sw.reset();

    // Restore the original optimisation level in s.
    od.reset();

    // Run the optimisation pass manually.
    m_llvm.optimise();

    get_logger()->trace("Taylor batch global opt pass runtime: {}", sw);
    sw.reset();

    // Run the jit.
    m_llvm.compile();

    get_logger()->trace("Taylor batch LLVM compilation runtime: {}", sw);

    // Fetch the stepper.
    if (with_events) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }

    // Fetch the function to compute the dense output.
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));

    // Setup the vector for the Taylor coefficients.
    // LCOV_EXCL_START
    if (m_order == std::numeric_limits<std::uint32_t>::max()
        || m_state.size() > std::numeric_limits<decltype(m_tc.size())>::max() / (m_order + 1u)) {
        throw std::overflow_error(
            "Overflow detected in the initialisation of an adaptive Taylor integrator in batch mode: the order "
            "or the state size is too large");
    }
    // LCOV_EXCL_STOP

    // NOTE: the size of m_state.size() already takes
    // into account the batch size.
    m_tc.resize(m_state.size() * (m_order + 1u));

    // Setup m_last_h.
    m_last_h.resize(boost::numeric_cast<decltype(m_last_h.size())>(batch_size));

    // Setup the vector for the dense output.
    // NOTE: the size of m_state.size() already takes
    // into account the batch size.
    m_d_out.resize(m_state.size());

    // Prepare the temp vectors.
    m_pinf.resize(m_batch_size, std::numeric_limits<T>::infinity());
    m_minf.resize(m_batch_size, -std::numeric_limits<T>::infinity());
    // LCOV_EXCL_START
    if (m_batch_size > std::numeric_limits<std::uint32_t>::max() / 2u) {
        throw std::overflow_error("Overflow detected in the initialisation of an adaptive Taylor integrator in batch "
                                  "mode: the batch size is too large");
    }
    // LCOV_EXCL_STOP
    // NOTE: make it twice as big because we need twice the storage in step_impl().
    m_delta_ts.resize(boost::numeric_cast<decltype(m_delta_ts.size())>(m_batch_size * 2u));

    // NOTE: init the outcome to success, the rest to zero.
    m_step_res.resize(boost::numeric_cast<decltype(m_step_res.size())>(m_batch_size),
                      std::tuple{taylor_outcome::success, T(0)});
    m_prop_res.resize(boost::numeric_cast<decltype(m_prop_res.size())>(m_batch_size),
                      std::tuple{taylor_outcome::success, T(0), T(0), std::size_t(0)});

    // Per-batch-element counters and step size bookkeeping used by
    // the propagate_*() functions.
    m_ts_count.resize(boost::numeric_cast<decltype(m_ts_count.size())>(m_batch_size));
    m_min_abs_h.resize(m_batch_size);
    m_max_abs_h.resize(m_batch_size);
    m_cur_max_delta_ts.resize(m_batch_size);
    m_pfor_ts.resize(boost::numeric_cast<decltype(m_pfor_ts.size())>(m_batch_size));
    m_t_dir.resize(boost::numeric_cast<decltype(m_t_dir.size())>(m_batch_size));
    m_rem_time.resize(m_batch_size);

    m_d_out_time.resize(m_batch_size);

    // Init the event data structure if needed.
    // NOTE: this can be done in parallel with the rest of the constructor,
    // once we have m_order/m_dim/m_batch_size and we are done using tes/ntes.
    if (with_events) {
        m_ed_data = std::make_unique<ed_data>(std::move(tes), std::move(ntes), m_order, m_dim, m_batch_size);
    }
}
3471
// Default constructor: delegates to the construction of a trivial
// one-dimensional ODE x' = 0 with initial state {0}, batch size 1
// and a tolerance of 1e-1.
template <typename T>
taylor_adaptive_batch_impl<T>::taylor_adaptive_batch_impl()
    : taylor_adaptive_batch_impl({prime("x"_var) = 0_dbl}, {T(0)}, 1u, kw::tol = T(1e-1))
{
}
3477
template <typename T>
taylor_adaptive_batch_impl<T>::taylor_adaptive_batch_impl(const taylor_adaptive_batch_impl &other)
    // NOTE: make a manual copy of all members, apart from the function pointers
    // (re-fetched from the copied llvm_state at the end of the body) and the
    // decomposition (deep-copied in the body).
    : m_batch_size(other.m_batch_size), m_state(other.m_state), m_time_hi(other.m_time_hi), m_time_lo(other.m_time_lo),
      m_llvm(other.m_llvm), m_dim(other.m_dim), m_order(other.m_order), m_tol(other.m_tol),
      m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode), m_pars(other.m_pars),
      m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_pinf(other.m_pinf), m_minf(other.m_minf),
      m_delta_ts(other.m_delta_ts), m_step_res(other.m_step_res), m_prop_res(other.m_prop_res),
      m_ts_count(other.m_ts_count), m_min_abs_h(other.m_min_abs_h), m_max_abs_h(other.m_max_abs_h),
      m_cur_max_delta_ts(other.m_cur_max_delta_ts), m_pfor_ts(other.m_pfor_ts), m_t_dir(other.m_t_dir),
      m_rem_time(other.m_rem_time), m_d_out_time(other.m_d_out_time),
      m_ed_data(other.m_ed_data ? std::make_unique<ed_data>(*other.m_ed_data) : nullptr)
{
    // NOTE: make explicit deep copy of the decomposition.
    m_dc.reserve(other.m_dc.size());
    for (const auto &[ex, deps] : other.m_dc) {
        m_dc.emplace_back(copy(ex), deps);
    }

    // Re-fetch the function pointers from the copied (and already compiled)
    // llvm_state: the pointers of the original cannot be reused, as they
    // refer to the original's JIT memory.
    if (m_ed_data) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
}
3504
// Defaulted move constructor: member-wise move of all data members.
template <typename T>
taylor_adaptive_batch_impl<T>::taylor_adaptive_batch_impl(taylor_adaptive_batch_impl &&) noexcept = default;
3507
3508 template <typename T>
operator =(const taylor_adaptive_batch_impl & other)3509 taylor_adaptive_batch_impl<T> &taylor_adaptive_batch_impl<T>::operator=(const taylor_adaptive_batch_impl &other)
3510 {
3511 if (this != &other) {
3512 *this = taylor_adaptive_batch_impl(other);
3513 }
3514
3515 return *this;
3516 }
3517
// Defaulted move assignment: member-wise move of all data members.
template <typename T>
taylor_adaptive_batch_impl<T> &
taylor_adaptive_batch_impl<T>::operator=(taylor_adaptive_batch_impl &&) noexcept = default;
3521
// Defaulted destructor.
template <typename T>
taylor_adaptive_batch_impl<T>::~taylor_adaptive_batch_impl() = default;
3524
// NOTE: the save/load patterns mimic the copy constructor logic.
// Serialise all data members (apart from the function pointers, which are
// re-fetched from the deserialised llvm_state in load_impl()).
// NOTE: the order of the << statements must match exactly the order
// of the >> statements in load_impl().
template <typename T>
template <typename Archive>
void taylor_adaptive_batch_impl<T>::save_impl(Archive &ar, unsigned) const
{
    // NOTE: save all members, apart from the function pointers.
    ar << m_batch_size;
    ar << m_state;
    ar << m_time_hi;
    ar << m_time_lo;
    ar << m_llvm;
    ar << m_dim;
    ar << m_dc;
    ar << m_order;
    ar << m_tol;
    ar << m_high_accuracy;
    ar << m_compact_mode;
    ar << m_pars;
    ar << m_tc;
    ar << m_last_h;
    ar << m_d_out;
    ar << m_pinf;
    ar << m_minf;
    ar << m_delta_ts;
    ar << m_step_res;
    ar << m_prop_res;
    ar << m_ts_count;
    ar << m_min_abs_h;
    ar << m_max_abs_h;
    ar << m_cur_max_delta_ts;
    ar << m_pfor_ts;
    ar << m_t_dir;
    ar << m_rem_time;
    ar << m_d_out_time;
    ar << m_ed_data;
}
3561
// Deserialise all data members in the same order they were saved by
// save_impl(), then recover the function pointers from the
// deserialised (and recompiled) llvm_state.
template <typename T>
template <typename Archive>
void taylor_adaptive_batch_impl<T>::load_impl(Archive &ar, unsigned version)
{
    // LCOV_EXCL_START
    // Reject archives produced by versions of the class older than the
    // current serialisation version.
    if (version < static_cast<unsigned>(boost::serialization::version<taylor_adaptive_batch_impl<T>>::type::value)) {
        throw std::invalid_argument("Unable to load a taylor_adaptive_batch integrator: "
                                    "the archive version ({}) is too old"_format(version));
    }
    // LCOV_EXCL_STOP

    ar >> m_batch_size;
    ar >> m_state;
    ar >> m_time_hi;
    ar >> m_time_lo;
    ar >> m_llvm;
    ar >> m_dim;
    ar >> m_dc;
    ar >> m_order;
    ar >> m_tol;
    ar >> m_high_accuracy;
    ar >> m_compact_mode;
    ar >> m_pars;
    ar >> m_tc;
    ar >> m_last_h;
    ar >> m_d_out;
    ar >> m_pinf;
    ar >> m_minf;
    ar >> m_delta_ts;
    ar >> m_step_res;
    ar >> m_prop_res;
    ar >> m_ts_count;
    ar >> m_min_abs_h;
    ar >> m_max_abs_h;
    ar >> m_cur_max_delta_ts;
    ar >> m_pfor_ts;
    ar >> m_t_dir;
    ar >> m_rem_time;
    ar >> m_d_out_time;
    ar >> m_ed_data;

    // Recover the function pointers.
    if (m_ed_data) {
        m_step_f = reinterpret_cast<step_f_e_t>(m_llvm.jit_lookup("step_e"));
    } else {
        m_step_f = reinterpret_cast<step_f_t>(m_llvm.jit_lookup("step"));
    }
    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
}
3611
// Boost.Serialization entry point: forward to save_impl().
template <typename T>
void taylor_adaptive_batch_impl<T>::save(boost::archive::binary_oarchive &ar, unsigned v) const
{
    save_impl(ar, v);
}
3617
// Boost.Serialization entry point: forward to load_impl().
template <typename T>
void taylor_adaptive_batch_impl<T>::load(boost::archive::binary_iarchive &ar, unsigned v)
{
    load_impl(ar, v);
}
3623
3624 template <typename T>
set_time(const std::vector<T> & new_time)3625 void taylor_adaptive_batch_impl<T>::set_time(const std::vector<T> &new_time)
3626 {
3627 // Check the dimensionality of new_time.
3628 if (new_time.size() != m_batch_size) {
3629 throw std::invalid_argument(
3630 "Invalid number of new times specified in a Taylor integrator in batch mode: the batch size is {}, "
3631 "but the number of specified times is {}"_format(m_batch_size, new_time.size()));
3632 }
3633
3634 // Copy over the new times.
3635 // NOTE: do not use std::copy(), as it gives UB if new_time
3636 // and m_time_hi are the same object.
3637 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3638 m_time_hi[i] = new_time[i];
3639 }
3640 // Reset the lo part.
3641 std::fill(m_time_lo.begin(), m_time_lo.end(), T(0));
3642 }
3643
3644 // Implementation detail to make a single integration timestep.
3645 // The magnitude of the timestep is automatically deduced for each
3646 // state vector, but it will always be not greater than
3647 // the absolute value of the corresponding element in max_delta_ts.
3648 // For each state vector, the propagation
3649 // is done forward in time if max_delta_t >= 0, backwards in
3650 // time otherwise.
3651 //
3652 // The function will write to res a pair for each state
3653 // vector, containing a flag describing the outcome of the integration
3654 // and the integration timestep that was used.
3655 template <typename T>
step_impl(const std::vector<T> & max_delta_ts,bool wtc)3656 void taylor_adaptive_batch_impl<T>::step_impl(const std::vector<T> &max_delta_ts, bool wtc)
3657 {
3658 using std::abs;
3659 using std::isfinite;
3660
3661 // LCOV_EXCL_START
3662 // Check preconditions.
3663 assert(max_delta_ts.size() == m_batch_size);
3664 assert(std::none_of(max_delta_ts.begin(), max_delta_ts.end(), [](const auto &x) {
3665 using std::isnan;
3666 return isnan(x);
3667 }));
3668
3669 // Sanity check.
3670 assert(m_step_res.size() == m_batch_size);
3671 // LCOV_EXCL_STOP
3672
3673 // Copy max_delta_ts to the tmp buffer, twice.
3674 std::copy(max_delta_ts.begin(), max_delta_ts.end(), m_delta_ts.data());
3675 std::copy(max_delta_ts.begin(), max_delta_ts.end(), m_delta_ts.data() + m_batch_size);
3676
3677 // Helper to check if the state vector of a batch element
3678 // contains a non-finite value.
3679 auto check_nf_batch = [this](std::uint32_t batch_idx) {
3680 for (std::uint32_t i = 0; i < m_dim; ++i) {
3681 if (!isfinite(m_state[i * m_batch_size + batch_idx])) {
3682 return true;
3683 }
3684 }
3685 return false;
3686 };
3687
3688 if (m_step_f.index() == 0u) {
3689 assert(!m_ed_data); // LCOV_EXCL_LINE
3690
3691 std::get<0>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(),
3692 wtc ? m_tc.data() : nullptr);
3693
3694 // Update the times and the last timesteps, and write out the result.
3695 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3696 // The timestep that was actually used for
3697 // this batch element.
3698 const auto h = m_delta_ts[i];
3699
3700 // Compute the new time in double-length arithmetic.
3701 const auto new_time = dfloat<T>(m_time_hi[i], m_time_lo[i]) + h;
3702 m_time_hi[i] = new_time.hi;
3703 m_time_lo[i] = new_time.lo;
3704
3705 // Update the size of the last timestep.
3706 m_last_h[i] = h;
3707
3708 if (!isfinite(new_time) || check_nf_batch(i)) {
3709 // Either the new time or state contain non-finite values,
3710 // return an error condition.
3711 m_step_res[i] = std::tuple{taylor_outcome::err_nf_state, h};
3712 } else {
3713 m_step_res[i] = std::tuple{
3714 // NOTE: use here the original value of
3715 // max_delta_ts, stored at the end of m_delta_ts,
3716 // in case max_delta_ts aliases integrator data
3717 // which was modified during the step.
3718 h == m_delta_ts[m_batch_size + i] ? taylor_outcome::time_limit : taylor_outcome::success, h};
3719 }
3720 }
3721 } else {
3722 assert(m_ed_data); // LCOV_EXCL_LINE
3723
3724 auto &ed_data = *m_ed_data;
3725
3726 // Invoke the stepper for event handling. We will record the norm infinity of the state vector +
3727 // event equations at the beginning of the timestep for later use.
3728 std::get<1>(m_step_f)(ed_data.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(),
3729 m_delta_ts.data(), ed_data.m_max_abs_state.data());
3730
3731 // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for
3732 // automatic cooldown deduction. If max_abs_state is not finite, set it to inf so that
3733 // in ed_data.detect_events() we skip event detection altogether.
3734 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3735 const auto max_abs_state = ed_data.m_max_abs_state[i];
3736
3737 if (isfinite(max_abs_state)) {
3738 // Are we in absolute or relative error control mode?
3739 const auto abs_or_rel = max_abs_state < 1;
3740
3741 // Estimate the size of the largest remainder in the Taylor
3742 // series of both the dynamical equations and the events.
3743 const auto max_r_size = abs_or_rel ? m_tol : (m_tol * max_abs_state);
3744
3745 // NOTE: depending on m_tol, max_r_size is arbitrarily small, but the real
3746 // integration error cannot be too small due to floating-point truncation.
3747 // This is the case for instance if we use sub-epsilon integration tolerances
3748 // to achieve Brouwer's law. In such a case, we cap the value of g_eps,
3749 // using eps * max_abs_state as an estimation of the smallest number
3750 // that can be resolved with the current floating-point type.
3751 // NOTE: the if condition in the next line is equivalent, in relative
3752 // error control mode, to:
3753 // if (m_tol < std::numeric_limits<T>::epsilon())
3754 if (max_r_size < std::numeric_limits<T>::epsilon() * max_abs_state) {
3755 ed_data.m_g_eps[i] = std::numeric_limits<T>::epsilon() * max_abs_state;
3756 } else {
3757 ed_data.m_g_eps[i] = max_r_size;
3758 }
3759 } else {
3760 ed_data.m_g_eps[i] = std::numeric_limits<T>::infinity();
3761 }
3762 }
3763
3764 // Write unconditionally the tcs.
3765 std::copy(ed_data.m_ev_jet.data(), ed_data.m_ev_jet.data() + m_dim * (m_order + 1u) * m_batch_size,
3766 m_tc.data());
3767
3768 // Do the event detection.
3769 ed_data.detect_events(m_delta_ts.data(), m_order, m_dim, m_batch_size);
3770
3771 // NOTE: before this point, we did not alter
3772 // any user-visible data in the integrator (just
3773 // temporary memory). From here until we start invoking
3774 // the callbacks, everything is noexcept, so we don't
3775 // risk leaving the integrator in a half-baked state.
3776
3777 // Sort the events by time.
3778 // NOTE: the time coordinates in m_d_(n)tes is relative
3779 // to the beginning of the timestep. It will be negative
3780 // for backward integration, thus we compare using
3781 // abs() so that the first events are those which
3782 // happen closer to the beginning of the timestep.
3783 // NOTE: the checks inside ed_data.detect_events() ensure
3784 // that we can safely sort the events' times.
3785 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3786 auto cmp = [](const auto &ev0, const auto &ev1) { return abs(std::get<1>(ev0)) < abs(std::get<1>(ev1)); };
3787 std::sort(ed_data.m_d_tes[i].begin(), ed_data.m_d_tes[i].end(), cmp);
3788 std::sort(ed_data.m_d_ntes[i].begin(), ed_data.m_d_ntes[i].end(), cmp);
3789
3790 // If we have terminal events we need
3791 // to update the value of h.
3792 if (!ed_data.m_d_tes[i].empty()) {
3793 m_delta_ts[i] = std::get<1>(ed_data.m_d_tes[i][0]);
3794 }
3795 }
3796
3797 // Update the state.
3798 m_d_out_f(m_state.data(), ed_data.m_ev_jet.data(), m_delta_ts.data());
3799
3800 // We will use this to capture the first exception thrown
3801 // by a callback, if any.
3802 std::exception_ptr eptr;
3803
3804 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3805 const auto h = m_delta_ts[i];
3806
3807 // Compute the new time in double-length arithmetic.
3808 const auto new_time = dfloat<T>(m_time_hi[i], m_time_lo[i]) + h;
3809 m_time_hi[i] = new_time.hi;
3810 m_time_lo[i] = new_time.lo;
3811
3812 // Store the last timestep.
3813 m_last_h[i] = h;
3814
3815 // Check if the time or the state vector are non-finite at the
3816 // end of the timestep.
3817 if (!isfinite(new_time) || check_nf_batch(i)) {
3818 // Let's also reset the cooldown values for this batch index,
3819 // as at this point they have become useless.
3820 reset_cooldowns(i);
3821
3822 m_step_res[i] = std::tuple{taylor_outcome::err_nf_state, h};
3823
3824 // Move to the next batch element.
3825 continue;
3826 }
3827
3828 // Update the cooldowns.
3829 for (auto &cd : ed_data.m_te_cooldowns[i]) {
3830 if (cd) {
3831 // Check if the timestep we just took
3832 // brought this event outside the cooldown.
3833 auto tmp = cd->first + h;
3834
3835 if (abs(tmp) >= cd->second) {
3836 // We are now outside the cooldown period
3837 // for this event, reset cd.
3838 cd.reset();
3839 } else {
3840 // Still in cooldown, update the
3841 // time spent in cooldown.
3842 cd->first = tmp;
3843 }
3844 }
3845 }
3846
3847 // If we don't have terminal events, we will invoke the callbacks
3848 // of *all* the non-terminal events. Otherwise, we need to figure
3849 // out which non-terminal events do not happen because their time
3850 // coordinate is past the the first terminal event.
3851 const auto ntes_end_it
3852 = ed_data.m_d_tes[i].empty()
3853 ? ed_data.m_d_ntes[i].end()
3854 : std::lower_bound(ed_data.m_d_ntes[i].begin(), ed_data.m_d_ntes[i].end(), h,
3855 [](const auto &ev, const auto &t) { return abs(std::get<1>(ev)) < abs(t); });
3856
3857 // Invoke the callbacks of the non-terminal events, which are guaranteed
3858 // to happen before the first terminal event.
3859 for (auto it = ed_data.m_d_ntes[i].begin(); it != ntes_end_it; ++it) {
3860 const auto &t = *it;
3861 const auto &cb = ed_data.m_ntes[std::get<0>(t)].get_callback();
3862 assert(cb); // LCOV_EXCL_LINE
3863 try {
3864 cb(*this, static_cast<T>(new_time - m_last_h[i] + std::get<1>(t)), std::get<2>(t), i);
3865 } catch (...) {
3866 if (!eptr) {
3867 eptr = std::current_exception();
3868 }
3869 }
3870 }
3871
3872 // The return value of the first
3873 // terminal event's callback. It will be
3874 // unused if there are no terminal events.
3875 bool te_cb_ret = false;
3876
3877 if (!ed_data.m_d_tes[i].empty()) {
3878 // Fetch the first terminal event.
3879 const auto te_idx = std::get<0>(ed_data.m_d_tes[i][0]);
3880 assert(te_idx < ed_data.m_tes.size()); // LCOV_EXCL_LINE
3881 const auto &te = ed_data.m_tes[te_idx];
3882
3883 // Set the corresponding cooldown.
3884 if (te.get_cooldown() >= 0) {
3885 // Cooldown explicitly provided by the user, use it.
3886 ed_data.m_te_cooldowns[i][te_idx].emplace(0, te.get_cooldown());
3887 } else {
3888 // Deduce the cooldown automatically.
3889 // NOTE: if m_g_eps[i] is not finite, we skipped event detection
3890 // altogether and thus we never end up here. If the derivative
3891 // of the event equation is not finite, the event is also skipped.
3892 ed_data.m_te_cooldowns[i][te_idx].emplace(
3893 0, taylor_deduce_cooldown(ed_data.m_g_eps[i], std::get<4>(ed_data.m_d_tes[i][0])));
3894 }
3895
3896 // Invoke the callback of the first terminal event, if it has one.
3897 if (te.get_callback()) {
3898 try {
3899 te_cb_ret = te.get_callback()(*this, std::get<2>(ed_data.m_d_tes[i][0]),
3900 std::get<3>(ed_data.m_d_tes[i][0]), i);
3901 } catch (...) {
3902 if (!eptr) {
3903 eptr = std::current_exception();
3904 }
3905 }
3906 }
3907 }
3908
3909 if (ed_data.m_d_tes[i].empty()) {
3910 // No terminal events detected, return success or time limit.
3911 m_step_res[i] = std::tuple{
3912 // NOTE: use here the original value of
3913 // max_delta_ts, stored at the end of m_delta_ts,
3914 // in case max_delta_ts aliases integrator data
3915 // which was modified during the step.
3916 h == m_delta_ts[m_batch_size + i] ? taylor_outcome::time_limit : taylor_outcome::success, h};
3917 } else {
3918 // Terminal event detected. Fetch its index.
3919 const auto ev_idx = static_cast<std::int64_t>(std::get<0>(ed_data.m_d_tes[i][0]));
3920
3921 // NOTE: if te_cb_ret is true, it means that the terminal event has
3922 // a callback and its invocation returned true (meaning that the
3923 // integration should continue). Otherwise, either the terminal event
3924 // has no callback or its callback returned false, meaning that the
3925 // integration must stop.
3926 m_step_res[i] = std::tuple{taylor_outcome{te_cb_ret ? ev_idx : (-ev_idx - 1)}, h};
3927 }
3928 }
3929
3930 // Check if any callback threw an exception, and re-throw
3931 // it in case.
3932 if (eptr) {
3933 std::rethrow_exception(eptr);
3934 }
3935 }
3936 }
3937
3938 template <typename T>
step(bool wtc)3939 void taylor_adaptive_batch_impl<T>::step(bool wtc)
3940 {
3941 step_impl(m_pinf, wtc);
3942 }
3943
3944 template <typename T>
step_backward(bool wtc)3945 void taylor_adaptive_batch_impl<T>::step_backward(bool wtc)
3946 {
3947 step_impl(m_minf, wtc);
3948 }
3949
3950 template <typename T>
step(const std::vector<T> & max_delta_ts,bool wtc)3951 void taylor_adaptive_batch_impl<T>::step(const std::vector<T> &max_delta_ts, bool wtc)
3952 {
3953 // Check the dimensionality of max_delta_ts.
3954 if (max_delta_ts.size() != m_batch_size) {
3955 throw std::invalid_argument(
3956 "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is {}, "
3957 "but the number of specified timesteps is {}"_format(m_batch_size, max_delta_ts.size()));
3958 }
3959
3960 // Make sure no values in max_delta_ts are nan.
3961 if (std::any_of(max_delta_ts.begin(), max_delta_ts.end(), [](const auto &x) {
3962 using std::isnan;
3963 return isnan(x);
3964 })) {
3965 throw std::invalid_argument(
3966 "Cannot invoke the step() function of an adaptive Taylor integrator in batch mode if "
3967 "one of the max timesteps is nan");
3968 }
3969
3970 step_impl(max_delta_ts, wtc);
3971 }
3972
3973 template <typename T>
propagate_for_impl(const std::vector<T> & delta_ts,std::size_t max_steps,const std::vector<T> & max_delta_ts,std::function<bool (taylor_adaptive_batch_impl &)> cb,bool wtc,bool with_c_out)3974 std::optional<continuous_output_batch<T>> taylor_adaptive_batch_impl<T>::propagate_for_impl(
3975 const std::vector<T> &delta_ts, std::size_t max_steps, const std::vector<T> &max_delta_ts,
3976 std::function<bool(taylor_adaptive_batch_impl &)> cb, bool wtc, bool with_c_out)
3977 {
3978 // Check the dimensionality of delta_ts.
3979 if (delta_ts.size() != m_batch_size) {
3980 throw std::invalid_argument("Invalid number of time intervals specified in a Taylor integrator in batch mode: "
3981 "the batch size is {}, but the number of specified time intervals is {}"_format(
3982 m_batch_size, delta_ts.size()));
3983 }
3984
3985 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
3986 m_pfor_ts[i] = dfloat<T>(m_time_hi[i], m_time_lo[i]) + delta_ts[i];
3987 }
3988
3989 // NOTE: max_delta_ts is checked in propagate_until_impl().
3990 return propagate_until_impl(m_pfor_ts, max_steps, max_delta_ts, std::move(cb), wtc, with_c_out);
3991 }
3992
// Propagate the state of the system up to the final times in ts (one
// double-length time per batch element), subject to the per-element
// max timestep limits in max_delta_ts, an optional step limit
// (max_steps == 0 means no limit) and an optional per-step callback cb
// (returning false from the callback stops the integration). If
// with_c_out is true, Taylor coefficients and step times are recorded
// and a continuous_output_batch object is returned (empty optional if
// no successful step was taken or with_c_out is false). The outcome,
// min/max |h| and step count for each batch element are stored in
// m_prop_res.
template <typename T>
std::optional<continuous_output_batch<T>> taylor_adaptive_batch_impl<T>::propagate_until_impl(
    const std::vector<dfloat<T>> &ts, std::size_t max_steps, const std::vector<T> &max_delta_ts,
    std::function<bool(taylor_adaptive_batch_impl &)> cb, bool wtc, bool with_c_out)
{
    using std::abs;
    using std::isfinite;
    using std::isnan;

    // NOTE: this function is called from either the other propagate_until() overload,
    // or propagate_for(). In both cases, we have already set up correctly the dimension of ts.
    assert(ts.size() == m_batch_size); // LCOV_EXCL_LINE

    // Check the current times.
    if (std::any_of(m_time_hi.begin(), m_time_hi.end(), [](const auto &t) { return !isfinite(t); })
        || std::any_of(m_time_lo.begin(), m_time_lo.end(), [](const auto &t) { return !isfinite(t); })) {
        throw std::invalid_argument(
            "Cannot invoke the propagate_until() function of an adaptive Taylor integrator in batch mode if "
            "one of the current times is not finite");
    }

    // Check the final times.
    if (std::any_of(ts.begin(), ts.end(), [](const auto &t) { return !isfinite(t); })) {
        throw std::invalid_argument("A non-finite time was passed to the propagate_until() function of an adaptive "
                                    "Taylor integrator in batch mode");
    }

    // Check max_delta_ts.
    if (max_delta_ts.size() != m_batch_size) {
        throw std::invalid_argument(
            "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is {}, "
            "but the number of specified timesteps is {}"_format(m_batch_size, max_delta_ts.size()));
    }
    for (const auto &dt : max_delta_ts) {
        if (isnan(dt)) {
            throw std::invalid_argument("A nan max_delta_t was passed to the propagate_until() function of an adaptive "
                                        "Taylor integrator in batch mode");
        }
        if (dt <= 0) {
            throw std::invalid_argument("A non-positive max_delta_t was passed to the propagate_until() function of an "
                                        "adaptive Taylor integrator in batch mode");
        }
    }

    // If with_c_out is true, we always need to write the Taylor coefficients.
    wtc = wtc || with_c_out;

    // These vectors are used in the construction of the continuous output.
    // If continuous output is not requested, they will remain empty.
    std::vector<T> c_out_tcs, c_out_times_hi, c_out_times_lo;
    if (with_c_out) {
        // Push in the starting time.
        c_out_times_hi.insert(c_out_times_hi.end(), m_time_hi.begin(), m_time_hi.end());
        c_out_times_lo.insert(c_out_times_lo.end(), m_time_lo.begin(), m_time_lo.end());
    }

    // Reset the counters and the min/max abs(h) vectors.
    std::size_t iter_counter = 0;
    for (std::uint32_t i = 0; i < m_batch_size; ++i) {
        m_ts_count[i] = 0;
        m_min_abs_h[i] = std::numeric_limits<T>::infinity();
        m_max_abs_h[i] = 0;
    }

    // Compute the integration directions and init
    // the remaining times.
    for (std::uint32_t i = 0; i < m_batch_size; ++i) {
        m_rem_time[i] = ts[i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
        if (!isfinite(m_rem_time[i])) {
            throw std::invalid_argument("The final time passed to the propagate_until() function of an adaptive Taylor "
                                        "integrator in batch mode results in an overflow condition");
        }

        // NOTE: m_t_dir[i] is true for forward (or zero-length) integration,
        // false for backward integration.
        m_t_dir[i] = (m_rem_time[i] >= T(0));
    }

    // Helper to create the continuous output object.
    auto make_c_out = [&]() -> std::optional<continuous_output_batch<T>> {
        if (with_c_out) {
            if (c_out_times_hi.size() / m_batch_size < 2u) {
                // NOTE: this means that no successful steps
                // were taken.
                return {};
            }

            // Construct the return value.
            continuous_output_batch<T> ret(m_llvm.make_similar());

            // Fill in the data.
            ret.m_batch_size = m_batch_size;
            ret.m_tcs = std::move(c_out_tcs);
            ret.m_times_hi = std::move(c_out_times_hi);
            ret.m_times_lo = std::move(c_out_times_lo);

            // Add padding to the times vectors to make the
            // vectorised upper_bound implementation well defined.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                ret.m_times_hi.push_back(m_t_dir[i] ? std::numeric_limits<T>::infinity()
                                                    : -std::numeric_limits<T>::infinity());
                ret.m_times_lo.push_back(T(0));
            }

            // Prepare the output vector.
            ret.m_output.resize(boost::numeric_cast<decltype(ret.m_output.size())>(m_dim * m_batch_size));

            // Prepare the temp time vector.
            ret.m_tmp_tm.resize(boost::numeric_cast<decltype(ret.m_tmp_tm.size())>(m_batch_size));

            // Add the continuous output function.
            ret.add_c_out_function(m_order, m_dim, m_high_accuracy);

            return std::optional{std::move(ret)};
        } else {
            return {};
        }
    };

    // Helper to update the continuous output data after a timestep.
    auto update_c_out = [&]() {
        if (with_c_out) {
            // Check if any batch element produced an error during the
            // last timestep. The only error that can arise is a non-finite
            // time/state.
            if (std::any_of(m_step_res.begin(), m_step_res.end(),
                            [](const auto &tup) { return std::get<0>(tup) == taylor_outcome::err_nf_state; })) {
                // Don't update the continuous output.
                return;
            }

#if !defined(NDEBUG)
            // In debug mode, snapshot the last recorded times so that we
            // can verify below that time moves monotonically in the
            // direction of integration.
            std::vector<dfloat<T>> prev_times;
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                prev_times.emplace_back(c_out_times_hi[c_out_times_hi.size() - m_batch_size + i],
                                        c_out_times_lo[c_out_times_lo.size() - m_batch_size + i]);
            }
#endif

            c_out_times_hi.insert(c_out_times_hi.end(), m_time_hi.begin(), m_time_hi.end());
            c_out_times_lo.insert(c_out_times_lo.end(), m_time_lo.begin(), m_time_lo.end());

#if !defined(NDEBUG)
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                const dfloat<T> new_time(c_out_times_hi[c_out_times_hi.size() - m_batch_size + i],
                                         c_out_times_lo[c_out_times_lo.size() - m_batch_size + i]);
                assert(isfinite(new_time));
                if (m_t_dir[i]) {
                    assert(!(new_time < prev_times[i]));
                } else {
                    assert(!(new_time > prev_times[i]));
                }
            }
#endif

            c_out_tcs.insert(c_out_tcs.end(), m_tc.begin(), m_tc.end());
        }
    };

    while (true) {
        // Compute the max integration times for this timestep.
        // NOTE: m_rem_time[i] is guaranteed to be finite: we check it explicitly above
        // and we keep on decreasing its magnitude at the following iterations.
        // If some non-finite state/time is generated in
        // the step function, the integration will be stopped.
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            assert((m_rem_time[i] >= T(0)) == m_t_dir[i] || m_rem_time[i] == T(0)); // LCOV_EXCL_LINE

            // Compute the time limit.
            const auto dt_limit = m_t_dir[i] ? std::min(dfloat<T>(max_delta_ts[i]), m_rem_time[i])
                                             : std::max(dfloat<T>(-max_delta_ts[i]), m_rem_time[i]);

            // Store it.
            m_cur_max_delta_ts[i] = static_cast<T>(dt_limit);
        }

        // Run the integration timestep.
        // NOTE: if dt_limit is zero, step_impl() will always return time_limit.
        step_impl(m_cur_max_delta_ts, wtc);

        // Update the continuous output.
        // NOTE: this function will avoid updating the c_out if any non-finite
        // state is detected in the step that was just taken.
        update_c_out();

        // Check if the integration timestep produced an error condition or we reached
        // a stopping terminal event.
        if (std::any_of(m_step_res.begin(), m_step_res.end(), [](const auto &tup) {
                const auto oc = std::get<0>(tup);
                return oc != taylor_outcome::success && oc != taylor_outcome::time_limit && oc < taylor_outcome{0};
            })) {
            // Setup m_prop_res before exiting.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                const auto [oc, h] = m_step_res[i];
                if (oc > taylor_outcome::success) {
                    // In case of a stopping terminal event, we still want to update
                    // the step counter.
                    m_ts_count[i] += static_cast<std::size_t>(h != 0);
                }

                m_prop_res[i] = std::tuple{oc, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Update the iteration counter.
        ++iter_counter;

        // Update the local step counters and min_h/max_h.
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            const auto [res, h] = m_step_res[i];

            // NOTE: the local step counters increase only if we integrated
            // for a nonzero time.
            m_ts_count[i] += static_cast<std::size_t>(h != 0);

            // Update min_h/max_h only if the outcome is success (otherwise
            // the step was artificially clamped either by a time limit or
            // by a continuing terminal event).
            if (res == taylor_outcome::success) {
                const auto abs_h = abs(h);
                m_min_abs_h[i] = std::min(m_min_abs_h[i], abs_h);
                m_max_abs_h[i] = std::max(m_max_abs_h[i], abs_h);
            }
        }

        // The step was successful, execute the callback.
        if (cb && !cb(*this)) {
            // Setup m_prop_res before exiting. We set all outcomes
            // to cb_stop regardless of the timestep outcome.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                m_prop_res[i] = std::tuple{taylor_outcome::cb_stop, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Break out if we have reached the final time
        // for all batch elements.
        bool all_done = true;
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            // NOTE: here we check h == rem_time, instead of just
            // res == time_limit, because clamping via max_delta_t
            // could also result in time_limit.
            if (std::get<1>(m_step_res[i]) != static_cast<T>(m_rem_time[i])) {
                all_done = false;
                break;
            }
        }
        if (all_done) {
            // Setup m_prop_res before exiting. The outcomes will all be time_limit.
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                m_prop_res[i] = std::tuple{taylor_outcome::time_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Check the iteration limit.
        // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
        // then this condition will never trigger as by this point we are
        // sure iter_counter is at least 1.
        if (iter_counter == max_steps) {
            // We reached the max_steps limit: set the outcome for all batch elements
            // to step_limit.
            // NOTE: this is the same logic adopted when the integration is stopped
            // by the callback (see above).
            for (std::uint32_t i = 0; i < m_batch_size; ++i) {
                m_prop_res[i] = std::tuple{taylor_outcome::step_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
            }

            return make_c_out();
        }

        // Update the remaining times.
        for (std::uint32_t i = 0; i < m_batch_size; ++i) {
            const auto [res, h] = m_step_res[i];

            // NOTE: if static_cast<T>(m_rem_time[i]) was used as a timestep,
            // it means that we hit the time limit. Force rem_time to zero
            // to signal this, so that zero-length steps will be taken
            // for all remaining iterations
            // NOTE: if m_rem_time[i] was previously set to zero, it
            // will end up being repeatedly set to zero here. This
            // should be harmless.
            if (h == static_cast<T>(m_rem_time[i])) {
                assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
                m_rem_time[i] = dfloat<T>(T(0));
            } else {
                m_rem_time[i] = ts[i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
            }
        }
    }

    // LCOV_EXCL_START
    // NOTE: unreachable - the loop above always returns via one of the
    // exit conditions.
    assert(false);

    return {};
    // LCOV_EXCL_STOP
}
4292
4293 template <typename T>
propagate_until_impl(const std::vector<T> & ts,std::size_t max_steps,const std::vector<T> & max_delta_ts,std::function<bool (taylor_adaptive_batch_impl &)> cb,bool wtc,bool with_c_out)4294 std::optional<continuous_output_batch<T>> taylor_adaptive_batch_impl<T>::propagate_until_impl(
4295 const std::vector<T> &ts, std::size_t max_steps, const std::vector<T> &max_delta_ts,
4296 std::function<bool(taylor_adaptive_batch_impl &)> cb, bool wtc, bool with_c_out)
4297 {
4298 // Check the dimensionality of ts.
4299 if (ts.size() != m_batch_size) {
4300 throw std::invalid_argument(
4301 "Invalid number of time limits specified in a Taylor integrator in batch mode: the "
4302 "batch size is {}, but the number of specified time limits is {}"_format(m_batch_size, ts.size()));
4303 }
4304
4305 // NOTE: re-use m_pfor_ts as tmp storage.
4306 assert(m_pfor_ts.size() == m_batch_size); // LCOV_EXCL_LINE
4307 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4308 m_pfor_ts[i] = dfloat<T>(ts[i]);
4309 }
4310
4311 // NOTE: max_delta_ts is checked in the other propagate_until_impl() overload.
4312 return propagate_until_impl(m_pfor_ts, max_steps, max_delta_ts, std::move(cb), wtc, with_c_out);
4313 }
4314
4315 template <typename T>
propagate_grid_impl(const std::vector<T> & grid,std::size_t max_steps,const std::vector<T> & max_delta_ts,std::function<bool (taylor_adaptive_batch_impl &)> cb)4316 std::vector<T> taylor_adaptive_batch_impl<T>::propagate_grid_impl(const std::vector<T> &grid, std::size_t max_steps,
4317 const std::vector<T> &max_delta_ts,
4318 std::function<bool(taylor_adaptive_batch_impl &)> cb)
4319 {
4320 using std::abs;
4321 using std::isnan;
4322
4323 // Helper to detect if an input value is nonfinite.
4324 auto is_nf = [](const T &t) {
4325 using std::isfinite;
4326 return !isfinite(t);
4327 };
4328
4329 if (grid.empty()) {
4330 throw std::invalid_argument(
4331 "Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if the time grid is empty");
4332 }
4333
4334 // Check that the grid size is a multiple of m_batch_size.
4335 if (grid.size() % m_batch_size != 0u) {
4336 throw std::invalid_argument(
4337 "Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
4338 "the grid has a size of {}, which is not a multiple of the batch size ({})"_format(grid.size(),
4339 m_batch_size));
4340 }
4341
4342 // Check the current time coordinates.
4343 if (std::any_of(m_time_hi.begin(), m_time_hi.end(), is_nf)
4344 || std::any_of(m_time_lo.begin(), m_time_lo.end(), is_nf)) {
4345 throw std::invalid_argument("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if "
4346 "the current time is not finite");
4347 }
4348
4349 // Check max_delta_ts.
4350 if (max_delta_ts.size() != m_batch_size) {
4351 throw std::invalid_argument(
4352 "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is {}, "
4353 "but the number of specified timesteps is {}"_format(m_batch_size, max_delta_ts.size()));
4354 }
4355 for (const auto &dt : max_delta_ts) {
4356 if (isnan(dt)) {
4357 throw std::invalid_argument("A nan max_delta_t was passed to the propagate_grid() function of an adaptive "
4358 "Taylor integrator in batch mode");
4359 }
4360 if (dt <= 0) {
4361 throw std::invalid_argument("A non-positive max_delta_t was passed to the propagate_grid() function of an "
4362 "adaptive Taylor integrator in batch mode");
4363 }
4364 }
4365
4366 // The number of grid points.
4367 const auto n_grid_points = grid.size() / m_batch_size;
4368
4369 // Pointer to the grid data.
4370 const auto *const grid_ptr = grid.data();
4371
4372 // Check the input grid points.
4373 constexpr auto nf_err_msg
4374 = "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode";
4375 constexpr auto ig_err_msg = "A non-monotonic time grid was passed to propagate_grid() in an adaptive "
4376 "Taylor integrator in batch mode";
4377
4378 // Check the first point.
4379 if (std::any_of(grid_ptr, grid_ptr + m_batch_size, is_nf)) {
4380 throw std::invalid_argument(nf_err_msg);
4381 }
4382 if (n_grid_points > 1u) {
4383 // Establish the direction of the grid from
4384 // the first two batches of points.
4385 if (std::any_of(grid_ptr + m_batch_size, grid_ptr + m_batch_size + m_batch_size, is_nf)) {
4386 throw std::invalid_argument(nf_err_msg);
4387 }
4388 if (grid_ptr[m_batch_size] == grid_ptr[0]) {
4389 throw std::invalid_argument(ig_err_msg);
4390 }
4391
4392 const auto grid_direction = grid_ptr[m_batch_size] > grid_ptr[0];
4393 for (std::uint32_t i = 1; i < m_batch_size; ++i) {
4394 if ((grid_ptr[m_batch_size + i] > grid_ptr[i]) != grid_direction) {
4395 throw std::invalid_argument(ig_err_msg);
4396 }
4397 }
4398
4399 // Check that the remaining points are finite and that
4400 // they are ordered monotonically.
4401 for (decltype(grid.size()) i = 2; i < n_grid_points; ++i) {
4402 if (std::any_of(grid_ptr + i * m_batch_size, grid_ptr + (i + 1u) * m_batch_size, is_nf)) {
4403 throw std::invalid_argument(nf_err_msg);
4404 }
4405
4406 if (std::any_of(
4407 grid_ptr + i * m_batch_size, grid_ptr + (i + 1u) * m_batch_size,
4408 [this, grid_direction](const T &t) { return (t > *(&t - m_batch_size)) != grid_direction; })) {
4409 throw std::invalid_argument(ig_err_msg);
4410 }
4411 }
4412 }
4413
4414 // Pre-allocate the return value.
4415 std::vector<T> retval;
4416 // LCOV_EXCL_START
4417 if (get_dim() > std::numeric_limits<decltype(retval.size())>::max() / grid.size()) {
4418 throw std::overflow_error("Overflow detected in the creation of the return value of propagate_grid() in an "
4419 "adaptive Taylor integrator in batch mode");
4420 }
4421 // LCOV_EXCL_STOP
4422 // NOTE: fill with NaNs, so that the missing entries
4423 // are signalled with NaN if we exit early.
4424 retval.resize(grid.size() * get_dim(), std::numeric_limits<T>::quiet_NaN());
4425
4426 // Propagate the system up to the first batch of grid points.
4427 // NOTE: this will *not* write the TCs, but because we know that
4428 // the grid is strictly monotonic, we are sure we will take at least
4429 // one TC-writing timestep below before trying to use the dense output.
4430 std::vector<T> pgrid_tmp;
4431 pgrid_tmp.resize(boost::numeric_cast<decltype(pgrid_tmp.size())>(m_batch_size));
4432 std::copy(grid_ptr, grid_ptr + m_batch_size, pgrid_tmp.begin());
4433 // NOTE: use the same max_steps for the initial propagation,
4434 // and don't pass the callback.
4435 propagate_until(pgrid_tmp, kw::max_delta_t = max_delta_ts, kw::max_steps = max_steps);
4436
4437 // Check the result of the integration.
4438 if (std::any_of(m_prop_res.begin(), m_prop_res.end(), [](const auto &t) {
4439 // Check if the outcome is not time_limit and it is not a continuing
4440 // terminal event. This means that a non-finite state was
4441 // encountered, or a stopping terminal event triggered, or the step
4442 // limit was hit.
4443 const auto oc = std::get<0>(t);
4444 return oc != taylor_outcome::time_limit && oc < taylor_outcome{0};
4445 })) {
4446 // NOTE: for consistency with the scalar implementation,
4447 // keep the outcomes from propagate_until() but we reset
4448 // min/max h and the step counter.
4449 for (auto &[_, min_h, max_h, ts_count] : m_prop_res) {
4450 min_h = std::numeric_limits<T>::infinity();
4451 max_h = 0;
4452 ts_count = 0;
4453 }
4454
4455 return retval;
4456 }
4457
4458 // Add the first result to retval.
4459 std::copy(m_state.begin(), m_state.end(), retval.begin());
4460
4461 // Init the remaining times and directions.
4462 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4463 m_rem_time[i] = grid_ptr[(n_grid_points - 1u) * m_batch_size + i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
4464
4465 // Check it.
4466 if (!isfinite(m_rem_time[i])) {
4467 throw std::invalid_argument("The final time passed to the propagate_grid() function of an adaptive Taylor "
4468 "integrator in batch mode results in an overflow condition");
4469 }
4470
4471 m_t_dir[i] = (m_rem_time[i] >= T(0));
4472 }
4473
4474 // Reset the counters and the min/max abs(h) vectors.
4475 std::size_t iter_counter = 0;
4476 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4477 m_ts_count[i] = 0;
4478 m_min_abs_h[i] = std::numeric_limits<T>::infinity();
4479 m_max_abs_h[i] = 0;
4480 }
4481
4482 // NOTE: in general, an integration timestep will cover a different number
4483 // of grid points for each batch element. We thus need to track the grid
4484 // index separately for each batch element. We will start with index
4485 // 1 for all batch elements, since all batch elements have been propagated to
4486 // index 0 already.
4487 std::vector<decltype(grid.size())> cur_grid_idx(
4488 boost::numeric_cast<typename std::vector<decltype(grid.size())>::size_type>(m_batch_size), 1);
4489
4490 // Vectors to keep track of the time range of the last taken timestep.
4491 std::vector<dfloat<T>> t0(boost::numeric_cast<typename std::vector<dfloat<T>>::size_type>(m_batch_size)), t1(t0);
4492
4493 // Vector of flags to keep track of the batch elements
4494 // we can compute dense output for.
4495 std::vector<unsigned> dflags(boost::numeric_cast<std::vector<unsigned>::size_type>(m_batch_size));
4496
4497 // NOTE: loop until we have processed all grid points
4498 // for all batch elements.
4499 auto cont_cond = [n_grid_points, &cur_grid_idx]() {
4500 return std::any_of(cur_grid_idx.begin(), cur_grid_idx.end(),
4501 [n_grid_points](auto idx) { return idx < n_grid_points; });
4502 };
4503
4504 while (cont_cond()) {
4505 // Establish the time ranges of the last
4506 // taken timestep.
4507 // NOTE: t0 < t1.
4508 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4509 const dfloat<T> cur_time(m_time_hi[i], m_time_lo[i]), cmp = cur_time - m_last_h[i];
4510
4511 t0[i] = std::min(cur_time, cmp);
4512 t1[i] = std::max(cur_time, cmp);
4513 }
4514
4515 // Reset dflags.
4516 std::fill(dflags.begin(), dflags.end(), 1u);
4517
4518 // Compute the state of the system via dense output for as many grid
4519 // points as possible, i.e., as long as the grid times
4520 // fall within the validity range for the dense output of at least
4521 // one batch element.
4522 while (true) {
4523 // Establish and count for which batch elements we
4524 // can still compute dense output.
4525 std::uint32_t counter = 0;
4526 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4527 // Fetch the grid index for the current batch element.
4528 const auto gidx = cur_grid_idx[i];
4529
4530 if (dflags[i] && gidx < n_grid_points) {
4531 // The current batch element has not been eliminated
4532 // yet from the candidate list and it still has grid
4533 // points available. Determine if the current grid point
4534 // falls within the validity domain for the dense output.
4535 // NOTE: if we are at the last timestep for this batch
4536 // element, force processing of all remaining grid points.
4537 // We do this to avoid numerical issues when deciding if
4538 // he last grid point falls within the range of validity
4539 // of the dense output.
4540 const auto idx = gidx * m_batch_size + i;
4541 const auto d_avail
4542 = (grid_ptr[idx] >= t0[i] && grid_ptr[idx] <= t1[i]) || (m_rem_time[i] == dfloat<T>(T(0)));
4543 dflags[i] = d_avail;
4544 counter += d_avail;
4545
4546 // Copy over the grid point to pgrid_tmp regardless
4547 // of whether d_avail is true or false.
4548 pgrid_tmp[i] = grid_ptr[idx];
4549 } else {
4550 // Either the batch element had already been eliminated
4551 // previously, or there are no more grid points available.
4552 // Make sure the batch element is marked as eliminated.
4553 dflags[i] = 0;
4554 }
4555 }
4556
4557 if (counter == 0u) {
4558 // Cannot use dense output on any of the batch elements,
4559 // need to take another step.
4560 break;
4561 }
4562
4563 // Compute the dense output.
4564 update_d_output(pgrid_tmp);
4565
4566 // Add the results to retval and bump up the values in cur_grid_idx.
4567 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4568 if (dflags[i] != 0u) {
4569 const auto gidx = cur_grid_idx[i];
4570
4571 for (std::uint32_t j = 0; j < m_dim; ++j) {
4572 retval[gidx * m_batch_size * m_dim + j * m_batch_size + i] = m_d_out[j * m_batch_size + i];
4573 }
4574
4575 assert(cur_grid_idx[i] < n_grid_points); // LCOV_EXCL_LINE
4576 ++cur_grid_idx[i];
4577 }
4578 }
4579
4580 // Check if we exhausted all grid points for all batch elements.
4581 if (!cont_cond()) {
4582 break;
4583 }
4584 }
4585
4586 // Check if we exhausted all grid points for all batch elements.
4587 if (!cont_cond()) {
4588 break;
4589 }
4590
4591 // Take the next step, making sure to write the Taylor coefficients
4592 // and to cap the timestep size so that we don't go past the
4593 // last grid point and we don't use a timestep exceeding max_delta_t.
4594 // NOTE: m_rem_time is guaranteed to be finite: we check it explicitly above
4595 // and we keep on decreasing its magnitude at the following iterations.
4596 // If some non-finite state/time is generated in
4597 // the step function, the integration will be stopped.
4598 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4599 // Max delta_t for the current batch element.
4600 const auto max_delta_t = max_delta_ts[i];
4601
4602 // Compute the step limit for the current batch element.
4603 assert((m_rem_time[i] >= T(0)) == m_t_dir[i] || m_rem_time[i] == T(0)); // LCOV_EXCL_LINE
4604 const auto dt_limit = m_t_dir[i] ? std::min(dfloat<T>(max_delta_t), m_rem_time[i])
4605 : std::max(dfloat<T>(-max_delta_t), m_rem_time[i]);
4606
4607 pgrid_tmp[i] = static_cast<T>(dt_limit);
4608 }
4609 step_impl(pgrid_tmp, true);
4610
4611 // Check the result of the integration.
4612 if (std::any_of(m_step_res.begin(), m_step_res.end(), [](const auto &t) {
4613 // Something went wrong in the propagation of the timestep, or we reached
4614 // a stopping terminal event.
4615 const auto oc = std::get<0>(t);
4616 return oc != taylor_outcome::success && oc != taylor_outcome::time_limit && oc < taylor_outcome{0};
4617 })) {
4618 // Setup m_prop_res before exiting.
4619 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4620 const auto [oc, h] = m_step_res[i];
4621 if (oc > taylor_outcome::success) {
4622 // In case of a stopping terminal event, we still want to update
4623 // the step counter.
4624 m_ts_count[i] += static_cast<std::size_t>(h != 0);
4625 }
4626
4627 m_prop_res[i] = std::tuple{oc, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4628 }
4629
4630 return retval;
4631 }
4632
4633 // Update the number of iterations.
4634 ++iter_counter;
4635
4636 // Update m_rem_time, the local step counters, min_h/max_h.
4637 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4638 const auto [res, h] = m_step_res[i];
4639
4640 // NOTE: the local step counters increase only if we integrated
4641 // for a nonzero time.
4642 m_ts_count[i] += static_cast<std::size_t>(h != 0);
4643
4644 // Update the remaining time.
4645 // NOTE: if static_cast<T>(m_rem_time[i]) was used as a timestep,
4646 // it means that we hit the time limit. Force rem_time to zero
4647 // to signal this, so that zero-length steps will be taken
4648 // for all remaining iterations, thus always triggering the
4649 // time_limit outcome. A zero m_rem_time[i]
4650 // will also force the processing of all remaining grid points.
4651 // NOTE: if m_rem_time[i] was previously set to zero, it
4652 // will end up being repeatedly set to zero here. This
4653 // should be harmless.
4654 if (h == static_cast<T>(m_rem_time[i])) {
4655 assert(res == taylor_outcome::time_limit); // LCOV_EXCL_LINE
4656 m_rem_time[i] = dfloat<T>(T(0));
4657 } else {
4658 m_rem_time[i]
4659 = grid_ptr[(n_grid_points - 1u) * m_batch_size + i] - dfloat<T>(m_time_hi[i], m_time_lo[i]);
4660 }
4661
4662 // Update min_h/max_h, but only if the outcome is success (otherwise
4663 // the step was artificially clamped either by a time limit or
4664 // by a continuing terminal event).
4665 if (res == taylor_outcome::success) {
4666 const auto abs_h = abs(h);
4667 m_min_abs_h[i] = std::min(m_min_abs_h[i], abs_h);
4668 m_max_abs_h[i] = std::max(m_max_abs_h[i], abs_h);
4669 }
4670 }
4671
4672 // Step successful: invoke the callback, if needed.
4673 if (cb && !cb(*this)) {
4674 // Setup m_prop_res before exiting. We set all outcomes
4675 // to cb_stop regardless of the timestep outcome.
4676 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4677 m_prop_res[i] = std::tuple{taylor_outcome::cb_stop, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4678 }
4679
4680 return retval;
4681 }
4682
4683 // Check the iteration limit.
4684 // NOTE: if max_steps is 0 (i.e., no limit on the number of steps),
4685 // then this condition will never trigger as by this point we are
4686 // sure iter_counter is at least 1.
4687 if (iter_counter == max_steps) {
4688 // We reached the max_steps limit: set the outcome for all batch elements
4689 // to step_limit.
4690 // NOTE: this is the same logic adopted when the integration is stopped
4691 // by the callback (see above).
4692 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4693 m_prop_res[i] = std::tuple{taylor_outcome::step_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4694 }
4695
4696 return retval;
4697 }
4698 }
4699
4700 // Everything went fine, set all outcomes to time_limit.
4701 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4702 m_prop_res[i] = std::tuple{taylor_outcome::time_limit, m_min_abs_h[i], m_max_abs_h[i], m_ts_count[i]};
4703 }
4704
4705 return retval;
4706 }
4707
template <typename T>
// Fetch a const reference to the llvm_state containing the JIT-compiled
// machinery used by this integrator.
const llvm_state &taylor_adaptive_batch_impl<T>::get_llvm_state() const
{
    return m_llvm;
}
4713
template <typename T>
// Fetch a const reference to the Taylor decomposition of the ODE system.
const taylor_dc_t &taylor_adaptive_batch_impl<T>::get_decomposition() const
{
    return m_dc;
}
4719
template <typename T>
// Fetch the Taylor order of the integrator.
std::uint32_t taylor_adaptive_batch_impl<T>::get_order() const
{
    return m_order;
}
4725
template <typename T>
// Fetch the tolerance of the integrator.
T taylor_adaptive_batch_impl<T>::get_tol() const
{
    return m_tol;
}
4731
template <typename T>
// Fetch the high-accuracy flag set at construction time.
bool taylor_adaptive_batch_impl<T>::get_high_accuracy() const
{
    return m_high_accuracy;
}
4737
template <typename T>
// Fetch the compact-mode flag set at construction time.
bool taylor_adaptive_batch_impl<T>::get_compact_mode() const
{
    return m_compact_mode;
}
4743
template <typename T>
// Fetch the batch size of the integrator.
std::uint32_t taylor_adaptive_batch_impl<T>::get_batch_size() const
{
    return m_batch_size;
}
4749
template <typename T>
// Fetch the dimension (number of state variables) of the ODE system.
std::uint32_t taylor_adaptive_batch_impl<T>::get_dim() const
{
    return m_dim;
}
4755
4756 template <typename T>
update_d_output(const std::vector<T> & time,bool rel_time)4757 const std::vector<T> &taylor_adaptive_batch_impl<T>::update_d_output(const std::vector<T> &time, bool rel_time)
4758 {
4759 // Check the dimensionality of time.
4760 if (time.size() != m_batch_size) {
4761 throw std::invalid_argument(
4762 "Invalid number of time coordinates specified for the dense output in a Taylor integrator in batch "
4763 "mode: the batch size is {}, but the number of time coordinates is {}"_format(m_batch_size, time.size()));
4764 }
4765
4766 // NOTE: "time" needs to be translated
4767 // because m_d_out_f expects a time coordinate
4768 // with respect to the starting time t0 of
4769 // the *previous* timestep.
4770 if (rel_time) {
4771 // Time coordinate relative to the current time.
4772 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4773 m_d_out_time[i] = m_last_h[i] + time[i];
4774 }
4775 } else {
4776 // Absolute time coordinate.
4777 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4778 m_d_out_time[i] = static_cast<T>(time[i] - (dfloat<T>(m_time_hi[i], m_time_lo[i]) - m_last_h[i]));
4779 }
4780 }
4781
4782 m_d_out_f(m_d_out.data(), m_tc.data(), m_d_out_time.data());
4783
4784 return m_d_out;
4785 }
4786
4787 template <typename T>
reset_cooldowns()4788 void taylor_adaptive_batch_impl<T>::reset_cooldowns()
4789 {
4790 for (std::uint32_t i = 0; i < m_batch_size; ++i) {
4791 reset_cooldowns(i);
4792 }
4793 }
4794
4795 template <typename T>
reset_cooldowns(std::uint32_t i)4796 void taylor_adaptive_batch_impl<T>::reset_cooldowns(std::uint32_t i)
4797 {
4798 if (!m_ed_data) {
4799 throw std::invalid_argument("No events were defined for this integrator");
4800 }
4801
4802 if (i >= m_batch_size) {
4803 throw std::invalid_argument(
4804 "Cannot reset the cooldowns at batch index {}: the batch size for this integrator is only {}"_format(
4805 i, m_batch_size));
4806 }
4807
4808 for (auto &cd : m_ed_data->m_te_cooldowns[i]) {
4809 cd.reset();
4810 }
4811 }
4812
// Explicit instantiation of the batch implementation classes.
template class taylor_adaptive_batch_impl<double>;

// Instantiate the ctor helper for double precision, for the
// sys-as-vector-of-expressions overload.
template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<double>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<double>, std::uint32_t, std::vector<double>, double, bool, bool,
    std::vector<double>, std::vector<t_event_t>, std::vector<nt_event_t>);

// Instantiate the ctor helper for double precision, for the
// sys-as-vector-of-(lhs, rhs)-pairs overload.
template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<double>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<double>, std::uint32_t, std::vector<double>,
    double, bool, bool, std::vector<double>, std::vector<t_event_t>, std::vector<nt_event_t>);

template class taylor_adaptive_batch_impl<long double>;

// Same two ctor helper instantiations, for extended precision.
template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<long double>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<long double>, std::uint32_t, std::vector<long double>, long double,
    bool, bool, std::vector<long double>, std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<long double>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<long double>, std::uint32_t,
    std::vector<long double>, long double, bool, bool, std::vector<long double>, std::vector<t_event_t>,
    std::vector<nt_event_t>);

#if defined(HEYOKA_HAVE_REAL128)

template class taylor_adaptive_batch_impl<mppp::real128>;

// Same two ctor helper instantiations, for quadruple precision
// (only when mp++'s real128 is available).
template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<expression> &, std::vector<mppp::real128>, std::uint32_t, std::vector<mppp::real128>,
    mppp::real128, bool, bool, std::vector<mppp::real128>, std::vector<t_event_t>, std::vector<nt_event_t>);

template HEYOKA_DLL_PUBLIC void taylor_adaptive_batch_impl<mppp::real128>::finalise_ctor_impl(
    const std::vector<std::pair<expression, expression>> &, std::vector<mppp::real128>, std::uint32_t,
    std::vector<mppp::real128>, mppp::real128, bool, bool, std::vector<mppp::real128>, std::vector<t_event_t>,
    std::vector<nt_event_t>);

#endif
4849
4850 } // namespace detail
4851
4852 namespace detail
4853 {
4854
4855 namespace
4856 {
4857
4858 // NOTE: in compact mode, care must be taken when adding multiple jet functions to the same llvm state
4859 // with the same floating-point type, batch size and number of u variables. The potential issue there
4860 // is that when the first jet is added, the compact mode AD functions are created and then optimised.
4861 // The optimisation pass might alter the functions in a way that makes them incompatible with subsequent
4862 // uses in the second jet (e.g., an argument might be removed from the signature because it is a
4863 // compile-time constant). A workaround to avoid issues is to set the optimisation level to zero
4864 // in the state, add the 2 jets and then run a single optimisation pass.
4865 // NOTE: document this eventually.
// Add to the llvm_state s a function called "name" for the computation of the
// jet of Taylor derivatives, up to "order", of the ODE system "sys",
// optionally augmented with extra functions of the state variables (sv_funcs).
// T is the scalar floating-point type, U the representation of the ODE system
// (it is passed to taylor_decompose(), which accepts either a vector of
// expressions or a vector of (lhs, rhs) expression pairs).
// Returns the Taylor decomposition of the system.
// The generated function takes three pointer arguments (in/out array, pars,
// time) and writes the derivatives back into the in/out array.
template <typename T, typename U>
auto taylor_add_jet_impl(llvm_state &s, const std::string &name, const U &sys, std::uint32_t order,
                         std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
                         const std::vector<expression> &sv_funcs)
{
    if (s.is_compiled()) {
        throw std::invalid_argument("A function for the computation of the jet of Taylor derivatives cannot be added "
                                    "to an llvm_state after compilation");
    }

    if (order == 0u) {
        throw std::invalid_argument("The order of a Taylor jet cannot be zero");
    }

    if (batch_size == 0u) {
        throw std::invalid_argument("The batch size of a Taylor jet cannot be zero");
    }

#if defined(HEYOKA_ARCH_PPC)
    if constexpr (std::is_same_v<T, long double>) {
        throw not_implemented_error("'long double' computations are not supported on PowerPC");
    }
#endif

    auto &builder = s.builder();

    // Record the number of equations/variables.
    const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());

    // Record the number of sv_funcs before consuming it.
    const auto n_sv_funcs = boost::numeric_cast<std::uint32_t>(sv_funcs.size());

    // Decompose the system of equations.
    // NOTE: don't use structured bindings due to the
    // usual issues with lambdas.
    const auto td_res = taylor_decompose(sys, sv_funcs);
    const auto &dc = td_res.first;
    const auto &sv_funcs_dc = td_res.second;

    assert(sv_funcs_dc.size() == n_sv_funcs); // LCOV_EXCL_LINE

    // Compute the number of u variables.
    assert(dc.size() > n_eq); // LCOV_EXCL_LINE
    const auto n_uvars = boost::numeric_cast<std::uint32_t>(dc.size() - n_eq);

    // Prepare the function prototype. The first argument is a float pointer to the in/out array,
    // the second argument a const float pointer to the pars, the third argument
    // a float pointer to the time. These arrays cannot overlap.
    auto *fp_t = to_llvm_type<T>(s.context());
    std::vector<llvm::Type *> fargs(3, llvm::PointerType::getUnqual(fp_t));
    // The function does not return anything.
    auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
    assert(ft != nullptr); // LCOV_EXCL_LINE
    // Now create the function.
    auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
    if (f == nullptr) {
        throw std::invalid_argument(
            "Unable to create a function for the computation of the jet of Taylor derivatives with name '{}'"_format(
                name));
    }

    // Set the names/attributes of the function arguments.
    // NOTE: NoCapture/NoAlias/ReadOnly encode the no-overlap and
    // access-mode guarantees stated in the prototype comment above.
    auto *in_out = f->args().begin();
    in_out->setName("in_out");
    in_out->addAttr(llvm::Attribute::NoCapture);
    in_out->addAttr(llvm::Attribute::NoAlias);

    auto *par_ptr = in_out + 1;
    par_ptr->setName("par_ptr");
    par_ptr->addAttr(llvm::Attribute::NoCapture);
    par_ptr->addAttr(llvm::Attribute::NoAlias);
    par_ptr->addAttr(llvm::Attribute::ReadOnly);

    auto *time_ptr = par_ptr + 1;
    time_ptr->setName("time_ptr");
    time_ptr->addAttr(llvm::Attribute::NoCapture);
    time_ptr->addAttr(llvm::Attribute::NoAlias);
    time_ptr->addAttr(llvm::Attribute::ReadOnly);

    // Create a new basic block to start insertion into.
    auto *bb = llvm::BasicBlock::Create(s.context(), "entry", f);
    assert(bb != nullptr); // LCOV_EXCL_LINE
    builder.SetInsertPoint(bb);

    // Compute the jet of derivatives.
    // NOTE: in compact mode the result is a single LLVM array value,
    // otherwise a vector of scalar LLVM values (see the two branches below).
    auto diff_variant = taylor_compute_jet<T>(s, in_out, par_ptr, time_ptr, dc, sv_funcs_dc, n_eq, n_uvars, order,
                                              batch_size, compact_mode, high_accuracy);

    // Write the derivatives to in_out.
    // NOTE: overflow checking. We need to be able to index into the jet array
    // (size (n_eq + n_sv_funcs) * (order + 1) * batch_size)
    // using uint32_t.
    // LCOV_EXCL_START
    if (order == std::numeric_limits<std::uint32_t>::max()
        || (order + 1u) > std::numeric_limits<std::uint32_t>::max() / batch_size
        || n_eq > std::numeric_limits<std::uint32_t>::max() - n_sv_funcs
        || n_eq + n_sv_funcs > std::numeric_limits<std::uint32_t>::max() / ((order + 1u) * batch_size)) {
        throw std::overflow_error("An overflow condition was detected while adding a Taylor jet");
    }
    // LCOV_EXCL_STOP

    if (compact_mode) {
        auto diff_arr = std::get<llvm::Value *>(diff_variant);

        // Create a global read-only array containing the values in sv_funcs_dc, if any
        // (otherwise, svf_ptr will be null).
        auto svf_ptr = taylor_c_make_sv_funcs_arr(s, sv_funcs_dc);

        // Write the order 0 of the sv_funcs, if needed.
        // NOTE: the order-0 derivatives of the state variables are already
        // in in_out (it is an in/out argument), hence only the sv_funcs
        // need to be written out at order 0.
        if (svf_ptr != nullptr) {
            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_sv_funcs), [&](llvm::Value *arr_idx) {
                // Fetch the u var index from svf_ptr.
                assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
                auto cur_idx = builder.CreateLoad(builder.getInt32Ty(),
                                                  builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));

                // Load the derivative value from diff_arr.
                auto diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), cur_idx);

                // Compute the index in the output pointer.
                auto out_idx = builder.CreateMul(builder.CreateAdd(builder.getInt32(n_eq), arr_idx),
                                                 builder.getInt32(batch_size));

                // Store into in_out.
                assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                store_vector_to_memory(builder, builder.CreateInBoundsGEP(fp_t, in_out, out_idx), diff_val);
            });
        }

        // Write the other orders.
        llvm_loop_u32(
            s, builder.getInt32(1), builder.CreateAdd(builder.getInt32(order), builder.getInt32(1)),
            [&](llvm::Value *cur_order) {
                llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_idx) {
                    // Load the derivative value from diff_arr.
                    auto diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_idx);

                    // Compute the index in the output pointer.
                    auto out_idx = builder.CreateAdd(
                        builder.CreateMul(builder.getInt32((n_eq + n_sv_funcs) * batch_size), cur_order),
                        builder.CreateMul(cur_idx, builder.getInt32(batch_size)));

                    // Store into in_out.
                    assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                    store_vector_to_memory(builder, builder.CreateInBoundsGEP(fp_t, in_out, out_idx), diff_val);
                });

                if (svf_ptr != nullptr) {
                    llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_sv_funcs), [&](llvm::Value *arr_idx) {
                        // Fetch the u var index from svf_ptr.
                        assert(llvm_depr_GEP_type_check(svf_ptr, builder.getInt32Ty())); // LCOV_EXCL_LINE
                        auto cur_idx = builder.CreateLoad(
                            builder.getInt32Ty(), builder.CreateInBoundsGEP(builder.getInt32Ty(), svf_ptr, arr_idx));

                        // Load the derivative value from diff_arr.
                        auto diff_val = taylor_c_load_diff(s, diff_arr, n_uvars, cur_order, cur_idx);

                        // Compute the index in the output pointer.
                        auto out_idx = builder.CreateAdd(
                            builder.CreateMul(builder.getInt32((n_eq + n_sv_funcs) * batch_size), cur_order),
                            builder.CreateMul(builder.CreateAdd(builder.getInt32(n_eq), arr_idx),
                                              builder.getInt32(batch_size)));

                        // Store into in_out.
                        assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                        store_vector_to_memory(builder, builder.CreateInBoundsGEP(fp_t, in_out, out_idx), diff_val);
                    });
                }
            });
    } else {
        const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);

        // Write the order 0 of the sv_funcs.
        for (std::uint32_t j = 0; j < n_sv_funcs; ++j) {
            // Index in the jet of derivatives.
            // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
            // state variables and sv_funcs (not all u vars), hence the indexing is
            // n_eq + j.
            const auto arr_idx = n_eq + j;
            assert(arr_idx < diff_arr.size()); // LCOV_EXCL_LINE
            const auto val = diff_arr[arr_idx];

            // Index in the output array.
            const auto out_idx = (n_eq + j) * batch_size;

            assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
            auto *out_ptr
                = builder.CreateInBoundsGEP(fp_t, in_out, builder.getInt32(static_cast<std::uint32_t>(out_idx)));
            store_vector_to_memory(builder, out_ptr, val);
        }

        for (decltype(diff_arr.size()) cur_order = 1; cur_order <= order; ++cur_order) {
            for (std::uint32_t j = 0; j < n_eq; ++j) {
                // Index in the jet of derivatives.
                // NOTE: in non-compact mode, diff_arr contains the derivatives only of the
                // state variables and sv_funcs (not all u vars), hence the indexing is
                // cur_order * (n_eq + n_sv_funcs) + j.
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + j;
                assert(arr_idx < diff_arr.size()); // LCOV_EXCL_LINE
                const auto val = diff_arr[arr_idx];

                // Index in the output array.
                const auto out_idx = (n_eq + n_sv_funcs) * batch_size * cur_order + j * batch_size;

                assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                auto *out_ptr
                    = builder.CreateInBoundsGEP(fp_t, in_out, builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }

            // Write the current order of the sv_funcs, stored right after
            // the state variables in both diff_arr and the output array.
            for (std::uint32_t j = 0; j < n_sv_funcs; ++j) {
                const auto arr_idx = cur_order * (n_eq + n_sv_funcs) + n_eq + j;
                assert(arr_idx < diff_arr.size()); // LCOV_EXCL_LINE
                const auto val = diff_arr[arr_idx];

                const auto out_idx = (n_eq + n_sv_funcs) * batch_size * cur_order + (n_eq + j) * batch_size;

                assert(llvm_depr_GEP_type_check(in_out, fp_t)); // LCOV_EXCL_LINE
                auto *out_ptr
                    = builder.CreateInBoundsGEP(fp_t, in_out, builder.getInt32(static_cast<std::uint32_t>(out_idx)));
                store_vector_to_memory(builder, out_ptr, val);
            }
        }
    }

    // Finish off the function.
    builder.CreateRetVoid();

    // Verify it.
    s.verify_function(f);

    // Run the optimisation pass.
    s.optimise();

    return dc;
}
5102
5103 } // namespace
5104
5105 } // namespace detail
5106
taylor_add_jet_dbl(llvm_state & s,const std::string & name,const std::vector<expression> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5107 taylor_dc_t taylor_add_jet_dbl(llvm_state &s, const std::string &name, const std::vector<expression> &sys,
5108 std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5109 const std::vector<expression> &sv_funcs)
5110 {
5111 return detail::taylor_add_jet_impl<double>(s, name, sys, order, batch_size, high_accuracy, compact_mode, sv_funcs);
5112 }
5113
taylor_add_jet_ldbl(llvm_state & s,const std::string & name,const std::vector<expression> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5114 taylor_dc_t taylor_add_jet_ldbl(llvm_state &s, const std::string &name, const std::vector<expression> &sys,
5115 std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5116 const std::vector<expression> &sv_funcs)
5117 {
5118 return detail::taylor_add_jet_impl<long double>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5119 sv_funcs);
5120 }
5121
5122 #if defined(HEYOKA_HAVE_REAL128)
5123
taylor_add_jet_f128(llvm_state & s,const std::string & name,const std::vector<expression> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5124 taylor_dc_t taylor_add_jet_f128(llvm_state &s, const std::string &name, const std::vector<expression> &sys,
5125 std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5126 const std::vector<expression> &sv_funcs)
5127 {
5128 return detail::taylor_add_jet_impl<mppp::real128>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5129 sv_funcs);
5130 }
5131
5132 #endif
5133
taylor_add_jet_dbl(llvm_state & s,const std::string & name,const std::vector<std::pair<expression,expression>> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5134 taylor_dc_t taylor_add_jet_dbl(llvm_state &s, const std::string &name,
5135 const std::vector<std::pair<expression, expression>> &sys, std::uint32_t order,
5136 std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5137 const std::vector<expression> &sv_funcs)
5138 {
5139 return detail::taylor_add_jet_impl<double>(s, name, sys, order, batch_size, high_accuracy, compact_mode, sv_funcs);
5140 }
5141
taylor_add_jet_ldbl(llvm_state & s,const std::string & name,const std::vector<std::pair<expression,expression>> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5142 taylor_dc_t taylor_add_jet_ldbl(llvm_state &s, const std::string &name,
5143 const std::vector<std::pair<expression, expression>> &sys, std::uint32_t order,
5144 std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5145 const std::vector<expression> &sv_funcs)
5146 {
5147 return detail::taylor_add_jet_impl<long double>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5148 sv_funcs);
5149 }
5150
5151 #if defined(HEYOKA_HAVE_REAL128)
5152
taylor_add_jet_f128(llvm_state & s,const std::string & name,const std::vector<std::pair<expression,expression>> & sys,std::uint32_t order,std::uint32_t batch_size,bool high_accuracy,bool compact_mode,const std::vector<expression> & sv_funcs)5153 taylor_dc_t taylor_add_jet_f128(llvm_state &s, const std::string &name,
5154 const std::vector<std::pair<expression, expression>> &sys, std::uint32_t order,
5155 std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
5156 const std::vector<expression> &sv_funcs)
5157 {
5158 return detail::taylor_add_jet_impl<mppp::real128>(s, name, sys, order, batch_size, high_accuracy, compact_mode,
5159 sv_funcs);
5160 }
5161
5162 #endif
5163
5164 } // namespace heyoka
5165