1 // Copyright 2020, 2021 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
2 //
3 // This file is part of the heyoka library.
4 //
5 // This Source Code Form is subject to the terms of the Mozilla
6 // Public License v. 2.0. If a copy of the MPL was not distributed
7 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 
9 #include <heyoka/config.hpp>
10 
11 #include <algorithm>
12 #include <cassert>
13 #include <cmath>
14 #include <cstddef>
15 #include <cstdint>
16 #include <functional>
17 #include <initializer_list>
18 #include <limits>
19 #include <stdexcept>
20 #include <string>
21 #include <typeindex>
22 #include <typeinfo>
23 #include <unordered_map>
24 #include <utility>
25 #include <vector>
26 
27 #include <boost/math/constants/constants.hpp>
28 #include <boost/numeric/conversion/cast.hpp>
29 
30 #include <fmt/format.h>
31 
32 #include <llvm/Config/llvm-config.h>
33 #include <llvm/IR/Attributes.h>
34 #include <llvm/IR/BasicBlock.h>
35 #include <llvm/IR/Constant.h>
36 #include <llvm/IR/Constants.h>
37 #include <llvm/IR/DataLayout.h>
38 #include <llvm/IR/DerivedTypes.h>
39 #include <llvm/IR/Function.h>
40 #include <llvm/IR/GlobalVariable.h>
41 #include <llvm/IR/IRBuilder.h>
42 #include <llvm/IR/InstrTypes.h>
43 #include <llvm/IR/Intrinsics.h>
44 #include <llvm/IR/LLVMContext.h>
45 #include <llvm/IR/Module.h>
46 #include <llvm/IR/Operator.h>
47 #include <llvm/IR/Type.h>
48 #include <llvm/IR/Value.h>
49 #include <llvm/Support/Alignment.h>
50 #include <llvm/Support/Casting.h>
51 #include <llvm/Support/raw_ostream.h>
52 
53 #if defined(HEYOKA_HAVE_REAL128)
54 
55 #include <mp++/real128.hpp>
56 
57 #endif
58 
59 #include <heyoka/detail/binomial.hpp>
60 #include <heyoka/detail/llvm_fwd.hpp>
61 #include <heyoka/detail/llvm_helpers.hpp>
62 #include <heyoka/detail/llvm_vector_type.hpp>
63 #include <heyoka/detail/logging_impl.hpp>
64 #include <heyoka/detail/sleef.hpp>
65 #include <heyoka/detail/visibility.hpp>
66 #include <heyoka/llvm_state.hpp>
67 #include <heyoka/number.hpp>
68 
69 #if defined(_MSC_VER) && !defined(__clang__)
70 
71 // NOTE: MSVC has issues with the other "using"
72 // statement form.
73 using namespace fmt::literals;
74 
75 #else
76 
77 using fmt::literals::operator""_format;
78 
79 #endif
80 
81 namespace heyoka::detail
82 {
83 
84 namespace
85 {
86 
87 // The global type map to associate a C++ type to an LLVM type.
__anonb5603a6a0202() 88 const auto type_map = []() {
89     std::unordered_map<std::type_index, llvm::Type *(*)(llvm::LLVMContext &)> retval;
90 
91     // Try to associate C++ double to LLVM double.
92     if (std::numeric_limits<double>::is_iec559 && std::numeric_limits<double>::digits == 53) {
93         retval[typeid(double)] = [](llvm::LLVMContext &c) {
94             auto ret = llvm::Type::getDoubleTy(c);
95             assert(ret != nullptr);
96             return ret;
97         };
98     }
99 
100     // Try to associate C++ long double to an LLVM fp type.
101     if (std::numeric_limits<long double>::is_iec559) {
102         if (std::numeric_limits<long double>::digits == 53) {
103             retval[typeid(long double)] = [](llvm::LLVMContext &c) {
104                 // IEEE double-precision format (this is the case on MSVC for instance).
105                 auto ret = llvm::Type::getDoubleTy(c);
106                 assert(ret != nullptr);
107                 return ret;
108             };
109 #if defined(HEYOKA_ARCH_X86)
110         } else if (std::numeric_limits<long double>::digits == 64) {
111             retval[typeid(long double)] = [](llvm::LLVMContext &c) {
112                 // x86 extended precision format.
113                 auto ret = llvm::Type::getX86_FP80Ty(c);
114                 assert(ret != nullptr);
115                 return ret;
116             };
117 #endif
118         } else if (std::numeric_limits<long double>::digits == 113) {
119             retval[typeid(long double)] = [](llvm::LLVMContext &c) {
120                 // IEEE quadruple-precision format (e.g., ARM 64).
121                 auto ret = llvm::Type::getFP128Ty(c);
122                 assert(ret != nullptr);
123                 return ret;
124             };
125         }
126     }
127 
128 #if defined(HEYOKA_HAVE_REAL128)
129 
130     // Associate mppp::real128 to fp128.
131     retval[typeid(mppp::real128)] = [](llvm::LLVMContext &c) {
132         auto ret = llvm::Type::getFP128Ty(c);
133         assert(ret != nullptr);
134         return ret;
135     };
136 
137 #endif
138 
139     return retval;
140 }();
141 
142 } // namespace
143 
144 // Implementation of the function to associate a C++ type to
145 // an LLVM type.
to_llvm_type_impl(llvm::LLVMContext & c,const std::type_info & tp)146 llvm::Type *to_llvm_type_impl(llvm::LLVMContext &c, const std::type_info &tp)
147 {
148     const auto it = type_map.find(tp);
149 
150     if (it == type_map.end()) {
151         throw std::invalid_argument("Unable to associate the C++ type '{}' to an LLVM type"_format(tp.name()));
152     } else {
153         return it->second(c);
154     }
155 }
156 
157 // Helper to produce a unique string for the type t.
llvm_mangle_type(llvm::Type * t)158 std::string llvm_mangle_type(llvm::Type *t)
159 {
160     assert(t != nullptr);
161 
162     if (auto *v_t = llvm::dyn_cast<llvm_vector_type>(t)) {
163         // If the type is a vector, get the name of the element type
164         // and append the vector size.
165         return "{}_{}"_format(llvm_type_name(v_t->getElementType()), v_t->getNumElements());
166     } else {
167         // Otherwise just return the type name.
168         return llvm_type_name(t);
169     }
170 }
171 
172 // Helper to load into a vector of size vector_size the sequential scalar data starting at ptr.
173 // If vector_size is 1, a scalar is loaded instead.
load_vector_from_memory(ir_builder & builder,llvm::Value * ptr,std::uint32_t vector_size)174 llvm::Value *load_vector_from_memory(ir_builder &builder, llvm::Value *ptr, std::uint32_t vector_size)
175 {
176     assert(vector_size > 0u);
177 
178     if (vector_size == 1u) {
179         // Scalar case.
180         return builder.CreateLoad(ptr);
181     }
182 
183     // Fetch the pointer type (this will result in an assertion
184     // failure if ptr is not a pointer).
185     auto ptr_t = llvm::cast<llvm::PointerType>(ptr->getType());
186 
187     // Create the vector type.
188     auto vector_t = make_vector_type(ptr_t->getElementType(), vector_size);
189     assert(vector_t != nullptr);
190 
191     // Create the output vector.
192     auto ret = static_cast<llvm::Value *>(llvm::UndefValue::get(vector_t));
193 
194     // Fill it.
195     for (std::uint32_t i = 0; i < vector_size; ++i) {
196         ret = builder.CreateInsertElement(ret,
197                                           builder.CreateLoad(builder.CreateInBoundsGEP(ptr, {builder.getInt32(i)})), i);
198     }
199 
200     return ret;
201 }
202 
203 // Helper to store the content of vector vec to the pointer ptr. If vec is not a vector,
204 // a plain store will be performed.
store_vector_to_memory(ir_builder & builder,llvm::Value * ptr,llvm::Value * vec)205 void store_vector_to_memory(ir_builder &builder, llvm::Value *ptr, llvm::Value *vec)
206 {
207     if (auto v_ptr_t = llvm::dyn_cast<llvm_vector_type>(vec->getType())) {
208         // Determine the vector size.
209         const auto vector_size = boost::numeric_cast<std::uint32_t>(v_ptr_t->getNumElements());
210 
211         for (std::uint32_t i = 0; i < vector_size; ++i) {
212             builder.CreateStore(builder.CreateExtractElement(vec, i),
213                                 builder.CreateInBoundsGEP(ptr, {builder.getInt32(i)}));
214         }
215     } else {
216         // Not a vector, store vec directly.
217         builder.CreateStore(vec, ptr);
218     }
219 }
220 
221 // Gather a vector of type vec_tp from the vector of pointers ptrs. align is the alignment of the
222 // scalar values stored in ptrs.
gather_vector_from_memory(ir_builder & builder,llvm::Type * vec_tp,llvm::Value * ptrs,std::size_t align)223 llvm::Value *gather_vector_from_memory(ir_builder &builder, llvm::Type *vec_tp, llvm::Value *ptrs, std::size_t align)
224 {
225     if (llvm::isa<llvm_vector_type>(vec_tp)) {
226         assert(llvm::isa<llvm_vector_type>(ptrs->getType()));
227 
228         return builder.CreateMaskedGather(
229 #if LLVM_VERSION_MAJOR >= 13
230             // NOTE: new initial argument required since LLVM 13
231             // (the vector type to gather).
232             vec_tp,
233 #endif
234             ptrs,
235 #if LLVM_VERSION_MAJOR == 10
236             align
237 #else
238             llvm::Align(align)
239 #endif
240         );
241     } else {
242         assert(!llvm::isa<llvm_vector_type>(ptrs->getType()));
243 
244         return builder.CreateLoad(ptrs);
245     }
246 }
247 
248 // Create a SIMD vector of size vector_size filled with the value c. If vector_size is 1,
249 // c will be returned.
vector_splat(ir_builder & builder,llvm::Value * c,std::uint32_t vector_size)250 llvm::Value *vector_splat(ir_builder &builder, llvm::Value *c, std::uint32_t vector_size)
251 {
252     assert(vector_size > 0u);
253 
254     if (vector_size == 1u) {
255         return c;
256     }
257 
258     return builder.CreateVectorSplat(boost::numeric_cast<unsigned>(vector_size), c);
259 }
260 
make_vector_type(llvm::Type * t,std::uint32_t vector_size)261 llvm::Type *make_vector_type(llvm::Type *t, std::uint32_t vector_size)
262 {
263     assert(t != nullptr);
264     assert(vector_size > 0u);
265 
266     if (vector_size == 1u) {
267         return t;
268     } else {
269         auto retval = llvm_vector_type::get(t, boost::numeric_cast<unsigned>(vector_size));
270 
271         assert(retval != nullptr);
272 
273         return retval;
274     }
275 }
276 
277 // Convert the input LLVM vector to a std::vector of values. If vec is not a vector,
278 // return {vec}.
vector_to_scalars(ir_builder & builder,llvm::Value * vec)279 std::vector<llvm::Value *> vector_to_scalars(ir_builder &builder, llvm::Value *vec)
280 {
281     if (auto vec_t = llvm::dyn_cast<llvm_vector_type>(vec->getType())) {
282         // Fetch the vector width.
283         auto vector_size = vec_t->getNumElements();
284 
285         // Extract the vector elements one by one.
286         std::vector<llvm::Value *> ret;
287         for (decltype(vector_size) i = 0; i < vector_size; ++i) {
288             ret.push_back(builder.CreateExtractElement(vec, boost::numeric_cast<std::uint64_t>(i)));
289             assert(ret.back() != nullptr);
290         }
291 
292         return ret;
293     } else {
294         return {vec};
295     }
296 }
297 
298 // Convert a std::vector of values into an LLVM vector of the corresponding size.
299 // If scalars contains only 1 value, return that value.
scalars_to_vector(ir_builder & builder,const std::vector<llvm::Value * > & scalars)300 llvm::Value *scalars_to_vector(ir_builder &builder, const std::vector<llvm::Value *> &scalars)
301 {
302     assert(!scalars.empty());
303 
304     // Fetch the vector size.
305     const auto vector_size = scalars.size();
306 
307     if (vector_size == 1u) {
308         return scalars[0];
309     }
310 
311     // Fetch the scalar type.
312     auto scalar_t = scalars[0]->getType();
313 
314     // Create the corresponding vector type.
315     auto vector_t = make_vector_type(scalar_t, boost::numeric_cast<std::uint32_t>(vector_size));
316     assert(vector_t != nullptr);
317 
318     // Create an empty vector.
319     llvm::Value *vec = llvm::UndefValue::get(vector_t);
320     assert(vec != nullptr);
321 
322     // Fill it up.
323     for (auto i = 0u; i < vector_size; ++i) {
324         assert(scalars[i]->getType() == scalar_t);
325 
326         vec = builder.CreateInsertElement(vec, scalars[i], i);
327     }
328 
329     return vec;
330 }
331 
332 // Pairwise reduction of a vector of LLVM values.
pairwise_reduce(std::vector<llvm::Value * > & vals,const std::function<llvm::Value * (llvm::Value *,llvm::Value *)> & f)333 llvm::Value *pairwise_reduce(std::vector<llvm::Value *> &vals,
334                              const std::function<llvm::Value *(llvm::Value *, llvm::Value *)> &f)
335 {
336     assert(!vals.empty());
337     assert(f);
338 
339     // LCOV_EXCL_START
340     if (vals.size() == std::numeric_limits<decltype(vals.size())>::max()) {
341         throw std::overflow_error("Overflow detected in pairwise_reduce()");
342     }
343     // LCOV_EXCL_STOP
344 
345     while (vals.size() != 1u) {
346         std::vector<llvm::Value *> new_vals;
347 
348         for (decltype(vals.size()) i = 0; i < vals.size(); i += 2u) {
349             if (i + 1u == vals.size()) {
350                 // We are at the last element of the vector
351                 // and the size of the vector is odd. Just append
352                 // the existing value.
353                 new_vals.push_back(vals[i]);
354             } else {
355                 new_vals.push_back(f(vals[i], vals[i + 1u]));
356             }
357         }
358 
359         new_vals.swap(vals);
360     }
361 
362     return vals[0];
363 }
364 
365 // Pairwise summation of a vector of LLVM values.
366 // https://en.wikipedia.org/wiki/Pairwise_summation
pairwise_sum(ir_builder & builder,std::vector<llvm::Value * > & sum)367 llvm::Value *pairwise_sum(ir_builder &builder, std::vector<llvm::Value *> &sum)
368 {
369     return pairwise_reduce(
370         sum, [&builder](llvm::Value *a, llvm::Value *b) -> llvm::Value * { return builder.CreateFAdd(a, b); });
371 }
372 
373 // Helper to invoke an intrinsic function with arguments 'args'. 'types' are the argument type(s) for
374 // overloaded intrinsics.
llvm_invoke_intrinsic(llvm_state & s,const std::string & name,const std::vector<llvm::Type * > & types,const std::vector<llvm::Value * > & args)375 llvm::Value *llvm_invoke_intrinsic(llvm_state &s, const std::string &name, const std::vector<llvm::Type *> &types,
376                                    const std::vector<llvm::Value *> &args)
377 {
378     // Fetch the intrinsic ID from the name.
379     const auto intrinsic_ID = llvm::Function::lookupIntrinsicID(name);
380     if (intrinsic_ID == 0) {
381         throw std::invalid_argument("Cannot fetch the ID of the intrinsic '{}'"_format(name));
382     }
383 
384     // Fetch the declaration.
385     // NOTE: for generic intrinsics to work, we need to specify
386     // the desired argument type(s). See:
387     // https://stackoverflow.com/questions/11985247/llvm-insert-intrinsic-function-cos
388     // And the docs of the getDeclaration() function.
389     auto callee_f = llvm::Intrinsic::getDeclaration(&s.module(), intrinsic_ID, types);
390     if (callee_f == nullptr) {
391         throw std::invalid_argument("Error getting the declaration of the intrinsic '{}'"_format(name));
392     }
393     if (!callee_f->isDeclaration()) {
394         // It does not make sense to have a definition of a builtin.
395         throw std::invalid_argument("The intrinsic '{}' must be only declared, not defined"_format(name));
396     }
397 
398     // Check the number of arguments.
399     if (callee_f->arg_size() != args.size()) {
400         throw std::invalid_argument(
401             "Incorrect # of arguments passed while calling the intrinsic '{}': {} are "
402             "expected, but {} were provided instead"_format(name, callee_f->arg_size(), args.size()));
403     }
404 
405     // Create the function call.
406     auto r = s.builder().CreateCall(callee_f, args);
407     assert(r != nullptr);
408 
409     return r;
410 }
411 
412 // Helper to invoke an external function called 'name' with arguments args and return type ret_type.
llvm_invoke_external(llvm_state & s,const std::string & name,llvm::Type * ret_type,const std::vector<llvm::Value * > & args,const std::vector<int> & attrs)413 llvm::Value *llvm_invoke_external(llvm_state &s, const std::string &name, llvm::Type *ret_type,
414                                   const std::vector<llvm::Value *> &args, const std::vector<int> &attrs)
415 {
416     // Look up the name in the global module table.
417     auto callee_f = s.module().getFunction(name);
418 
419     if (callee_f == nullptr) {
420         // The function does not exist yet, make the prototype.
421         std::vector<llvm::Type *> arg_types;
422         for (auto a : args) {
423             arg_types.push_back(a->getType());
424         }
425         auto *ft = llvm::FunctionType::get(ret_type, arg_types, false);
426         callee_f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
427         if (callee_f == nullptr) {
428             throw std::invalid_argument("Unable to create the prototype for the external function '{}'"_format(name));
429         }
430 
431         // Add the function attributes.
432         for (const auto &att : attrs) {
433             // NOTE: convert back to the LLVM attribute enum.
434             callee_f->addFnAttr(boost::numeric_cast<llvm::Attribute::AttrKind>(att));
435         }
436     } else {
437         // The function declaration exists already. Check that it is only a
438         // declaration and not a definition.
439         if (!callee_f->isDeclaration()) {
440             throw std::invalid_argument("Cannot call the function '{}' as an external function, because "
441                                         "it is defined as an internal module function"_format(name));
442         }
443         // Check the number of arguments.
444         if (callee_f->arg_size() != args.size()) {
445             throw std::invalid_argument(
446                 "Incorrect # of arguments passed while calling the external function '{}': {} "
447                 "are expected, but {} were provided instead"_format(name, callee_f->arg_size(), args.size()));
448         }
449         // NOTE: perhaps in the future we should consider adding more checks here
450         // (e.g., argument types, return type).
451     }
452 
453     // Create the function call.
454     auto r = s.builder().CreateCall(callee_f, args);
455     assert(r != nullptr);
456     // NOTE: we used to have r->setTailCall(true) here, but:
457     // - when optimising, the tail call attribute is automatically
458     //   added,
459     // - it is not 100% clear to me whether it is always safe to enable it:
460     // https://llvm.org/docs/CodeGenerator.html#tail-calls
461 
462     return r;
463 }
464 
465 // Helper to invoke an internal module function called 'name' with arguments 'args'.
llvm_invoke_internal(llvm_state & s,const std::string & name,const std::vector<llvm::Value * > & args)466 llvm::Value *llvm_invoke_internal(llvm_state &s, const std::string &name, const std::vector<llvm::Value *> &args)
467 {
468     auto callee_f = s.module().getFunction(name);
469 
470     if (callee_f == nullptr) {
471         throw std::invalid_argument("Unknown internal function: '{}'"_format(name));
472     }
473 
474     if (callee_f->isDeclaration()) {
475         throw std::invalid_argument("The internal function '{}' cannot be just a "
476                                     "declaration, a definition is needed"_format(name));
477     }
478 
479     // Check the number of arguments.
480     if (callee_f->arg_size() != args.size()) {
481         throw std::invalid_argument(
482             "Incorrect # of arguments passed while calling the internal function '{}': {} are "
483             "expected, but {} were provided instead"_format(name, callee_f->arg_size(), args.size()));
484     }
485     // NOTE: perhaps in the future we should consider adding more checks here
486     // (e.g., argument types).
487 
488     // Create the function call.
489     auto r = s.builder().CreateCall(callee_f, args);
490     assert(r != nullptr);
491     // NOTE: we used to have r->setTailCall(true) here, but:
492     // - when optimising, the tail call attribute is automatically
493     //   added,
494     // - it is not 100% clear to me whether it is always safe to enable it:
495     // https://llvm.org/docs/CodeGenerator.html#tail-calls
496 
497     return r;
498 }
499 
500 // Create an LLVM for loop in the form:
501 //
502 // for (auto i = begin; i < end; i = next_cur(i)) {
503 //   body(i);
504 // }
505 //
506 // The default implementation of i = next_cur(i),
507 // if next_cur is not provided, is ++i.
508 //
509 // begin/end must be 32-bit unsigned integer values.
llvm_loop_u32(llvm_state & s,llvm::Value * begin,llvm::Value * end,const std::function<void (llvm::Value *)> & body,const std::function<llvm::Value * (llvm::Value *)> & next_cur)510 void llvm_loop_u32(llvm_state &s, llvm::Value *begin, llvm::Value *end, const std::function<void(llvm::Value *)> &body,
511                    const std::function<llvm::Value *(llvm::Value *)> &next_cur)
512 {
513     assert(body);
514     assert(begin->getType() == end->getType());
515     assert(begin->getType() == s.builder().getInt32Ty());
516 
517     auto &context = s.context();
518     auto &builder = s.builder();
519 
520     // Fetch the current function.
521     assert(builder.GetInsertBlock() != nullptr);
522     auto f = builder.GetInsertBlock()->getParent();
523     assert(f != nullptr);
524 
525     // Pre-create loop and afterloop blocks. Note that these have just
526     // been created, they have not been inserted yet in the IR.
527     auto *loop_bb = llvm::BasicBlock::Create(context);
528     auto *after_bb = llvm::BasicBlock::Create(context);
529 
530     // NOTE: we need a special case if the body of the loop is
531     // never to be executed (that is, begin >= end).
532     // In such a case, we will jump directly to after_bb.
533     // NOTE: unsigned integral comparison.
534     auto skip_cond = builder.CreateICmp(llvm::CmpInst::ICMP_UGE, begin, end);
535     builder.CreateCondBr(skip_cond, after_bb, loop_bb);
536 
537     // Get a reference to the current block for
538     // later usage in the phi node.
539     auto preheader_bb = builder.GetInsertBlock();
540 
541     // Add the loop block and start insertion into it.
542     f->getBasicBlockList().push_back(loop_bb);
543     builder.SetInsertPoint(loop_bb);
544 
545     // Create the phi node and add the first pair of arguments.
546     auto cur = builder.CreatePHI(builder.getInt32Ty(), 2);
547     cur->addIncoming(begin, preheader_bb);
548 
549     // Execute the loop body and the post-body code.
550     llvm::Value *next;
551     try {
552         body(cur);
553 
554         // Compute the next value of the iteration. Use the next_cur
555         // function if provided, otherwise, by default, increase cur by 1.
556         // NOTE: addition works regardless of integral signedness.
557         next = next_cur ? next_cur(cur) : builder.CreateAdd(cur, builder.getInt32(1));
558     } catch (...) {
559         // NOTE: at this point after_bb has not been
560         // inserted into any parent, and thus it will not
561         // be cleaned up automatically. Do it manually.
562         after_bb->deleteValue();
563 
564         throw;
565     }
566 
567     // Compute the end condition.
568     // NOTE: we use the unsigned less-than predicate.
569     auto end_cond = builder.CreateICmp(llvm::CmpInst::ICMP_ULT, next, end);
570 
571     // Get a reference to the current block for later use,
572     // and insert the "after loop" block.
573     auto loop_end_bb = builder.GetInsertBlock();
574     f->getBasicBlockList().push_back(after_bb);
575 
576     // Insert the conditional branch into the end of loop_end_bb.
577     builder.CreateCondBr(end_cond, loop_bb, after_bb);
578 
579     // Any new code will be inserted in after_bb.
580     builder.SetInsertPoint(after_bb);
581 
582     // Add a new entry to the PHI node for the backedge.
583     cur->addIncoming(next, loop_end_bb);
584 }
585 
586 // Given an input pointer value, return the
587 // pointed-to type.
pointee_type(llvm::Value * ptr)588 llvm::Type *pointee_type(llvm::Value *ptr)
589 {
590     assert(llvm::isa<llvm::PointerType>(ptr->getType())); // LCOV_EXCL_LINE
591 
592     return ptr->getType()->getPointerElementType();
593 }
594 
595 // Small helper to fetch a string representation
596 // of an LLVM type.
llvm_type_name(llvm::Type * t)597 std::string llvm_type_name(llvm::Type *t)
598 {
599     assert(t != nullptr);
600 
601     std::string retval;
602     llvm::raw_string_ostream ostr(retval);
603 
604     t->print(ostr, false, true);
605 
606     return ostr.str();
607 }
608 
609 // This function will return true if:
610 //
611 // - the return type of f is ret, and
612 // - the argument types of f are the same as in 'args'.
613 //
614 // Otherwise, the function will return false.
compare_function_signature(llvm::Function * f,llvm::Type * ret,const std::vector<llvm::Type * > & args)615 bool compare_function_signature(llvm::Function *f, llvm::Type *ret, const std::vector<llvm::Type *> &args)
616 {
617     assert(f != nullptr);
618     assert(ret != nullptr);
619 
620     if (ret != f->getReturnType()) {
621         // Mismatched return types.
622         return false;
623     }
624 
625     auto it = f->arg_begin();
626     for (auto arg_type : args) {
627         if (it == f->arg_end() || it->getType() != arg_type) {
628             // f has fewer arguments than args, or the current
629             // arguments' types do not match.
630             return false;
631         }
632         ++it;
633     }
634 
635     // In order for the signatures to match,
636     // we must be at the end of f's arguments list
637     // (otherwise f has more arguments than args).
638     return it == f->arg_end();
639 }
640 
641 // Create an LLVM if statement in the form:
642 // if (cond) {
643 //   then_f();
644 // } else {
645 //   else_f();
646 // }
llvm_if_then_else(llvm_state & s,llvm::Value * cond,const std::function<void ()> & then_f,const std::function<void ()> & else_f)647 void llvm_if_then_else(llvm_state &s, llvm::Value *cond, const std::function<void()> &then_f,
648                        const std::function<void()> &else_f)
649 {
650     auto &context = s.context();
651     auto &builder = s.builder();
652 
653     assert(cond->getType() == builder.getInt1Ty());
654 
655     // Fetch the current function.
656     assert(builder.GetInsertBlock() != nullptr);
657     auto f = builder.GetInsertBlock()->getParent();
658     assert(f != nullptr);
659 
660     // Create and insert the "then" block.
661     auto *then_bb = llvm::BasicBlock::Create(context, "", f);
662 
663     // Create but do not insert the "else" and merge blocks.
664     auto *else_bb = llvm::BasicBlock::Create(context);
665     auto *merge_bb = llvm::BasicBlock::Create(context);
666 
667     // Create the conditional jump.
668     builder.CreateCondBr(cond, then_bb, else_bb);
669 
670     // Emit the code for the "then" branch.
671     builder.SetInsertPoint(then_bb);
672     try {
673         then_f();
674     } catch (...) {
675         // NOTE: else_bb and merge_bb have not been
676         // inserted into any parent yet, clean them
677         // up manually.
678         else_bb->deleteValue();
679         merge_bb->deleteValue();
680 
681         throw;
682     }
683 
684     // Jump to the merge block.
685     builder.CreateBr(merge_bb);
686 
687     // Emit the "else" block.
688     f->getBasicBlockList().push_back(else_bb);
689     builder.SetInsertPoint(else_bb);
690     try {
691         else_f();
692     } catch (...) {
693         // NOTE: merge_bb has not been
694         // inserted into any parent yet, clean it
695         // up manually.
696         merge_bb->deleteValue();
697 
698         throw;
699     }
700 
701     // Jump to the merge block.
702     builder.CreateBr(merge_bb);
703 
704     // Emit the merge block.
705     f->getBasicBlockList().push_back(merge_bb);
706     builder.SetInsertPoint(merge_bb);
707 }
708 
709 // Helper to create a global zero-inited array variable in the module m
710 // with type t. The array is mutable and with internal linkage.
make_global_zero_array(llvm::Module & m,llvm::ArrayType * t)711 llvm::Value *make_global_zero_array(llvm::Module &m, llvm::ArrayType *t)
712 {
713     assert(t != nullptr);
714 
715     // Make the global array.
716     auto gl_arr = new llvm::GlobalVariable(m, t, false, llvm::GlobalVariable::InternalLinkage,
717                                            llvm::ConstantAggregateZero::get(t));
718 
719     // Return it.
720     return gl_arr;
721 }
722 
723 // Helper to invoke an external function on a vector argument.
724 // The function will be called on each element of the vector separately,
725 // and the return will be re-assembled as a vector.
call_extern_vec(llvm_state & s,llvm::Value * arg,const std::string & fname)726 llvm::Value *call_extern_vec(llvm_state &s, llvm::Value *arg, const std::string &fname)
727 {
728     assert(arg != nullptr);
729 
730     auto &builder = s.builder();
731 
732     // Decompose the argument into scalars.
733     auto scalars = vector_to_scalars(builder, arg);
734 
735     // Invoke the function on each scalar.
736     std::vector<llvm::Value *> retvals;
737     for (auto scal : scalars) {
738         retvals.push_back(llvm_invoke_external(
739             s, fname, scal->getType(), {scal},
740             // NOTE: in theory we may add ReadNone here as well,
741             // but for some reason, at least up to LLVM 10,
742             // this causes strange codegen issues. Revisit
743             // in the future.
744             {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn}));
745     }
746 
747     // Build a vector with the results.
748     return scalars_to_vector(builder, retvals);
749 }
750 
751 // Helper to invoke an external function on 2 vector arguments.
752 // The function will be called on each elements of the vectors separately,
753 // and the return will be re-assembled as a vector.
call_extern_vec(llvm_state & s,llvm::Value * arg0,llvm::Value * arg1,const std::string & fname)754 llvm::Value *call_extern_vec(llvm_state &s, llvm::Value *arg0, llvm::Value *arg1, const std::string &fname)
755 {
756     assert(arg0 != nullptr);
757     assert(arg1 != nullptr);
758     assert(arg0->getType() == arg1->getType());
759 
760     auto &builder = s.builder();
761 
762     // Decompose the arguments into scalars.
763     auto scalars0 = vector_to_scalars(builder, arg0);
764     auto scalars1 = vector_to_scalars(builder, arg1);
765 
766     // Invoke the function on each pair of scalars.
767     std::vector<llvm::Value *> retvals;
768     for (decltype(scalars0.size()) i = 0; i < scalars0.size(); ++i) {
769         retvals.push_back(llvm_invoke_external(
770             s, fname, scalars0[i]->getType(), {scalars0[i], scalars1[i]},
771             // NOTE: in theory we may add ReadNone here as well,
772             // but for some reason, at least up to LLVM 10,
773             // this causes strange codegen issues. Revisit
774             // in the future.
775             {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn}));
776     }
777 
778     // Build a vector with the results.
779     return scalars_to_vector(builder, retvals);
780 }
781 
782 // Create an LLVM for loop in the form:
783 //
784 // while (cond()) {
785 //   body();
786 // }
llvm_while_loop(llvm_state & s,const std::function<llvm::Value * ()> & cond,const std::function<void ()> & body)787 void llvm_while_loop(llvm_state &s, const std::function<llvm::Value *()> &cond, const std::function<void()> &body)
788 {
789     assert(body);
790     assert(cond);
791 
792     auto &context = s.context();
793     auto &builder = s.builder();
794 
795     // Fetch the current function.
796     assert(builder.GetInsertBlock() != nullptr);
797     auto f = builder.GetInsertBlock()->getParent();
798     assert(f != nullptr);
799 
800     // Do a first evaluation of cond.
801     // NOTE: if this throws, we have not created any block
802     // yet, no need for manual cleanup.
803     auto cmp = cond();
804     assert(cmp != nullptr);
805     assert(cmp->getType() == builder.getInt1Ty());
806 
807     // Pre-create loop and afterloop blocks. Note that these have just
808     // been created, they have not been inserted yet in the IR.
809     auto *loop_bb = llvm::BasicBlock::Create(context);
810     auto *after_bb = llvm::BasicBlock::Create(context);
811 
812     // NOTE: we need a special case if the body of the loop is
813     // never to be executed (that is, cond returns false).
814     // In such a case, we will jump directly to after_bb.
815     builder.CreateCondBr(builder.CreateNot(cmp), after_bb, loop_bb);
816 
817     // Get a reference to the current block for
818     // later usage in the phi node.
819     auto preheader_bb = builder.GetInsertBlock();
820 
821     // Add the loop block and start insertion into it.
822     f->getBasicBlockList().push_back(loop_bb);
823     builder.SetInsertPoint(loop_bb);
824 
825     // Create the phi node and add the first pair of arguments.
826     auto cur = builder.CreatePHI(builder.getInt1Ty(), 2);
827     cur->addIncoming(cmp, preheader_bb);
828 
829     // Execute the loop body and the post-body code.
830     try {
831         body();
832 
833         // Compute the end condition.
834         cmp = cond();
835         assert(cmp != nullptr);
836         assert(cmp->getType() == builder.getInt1Ty());
837     } catch (...) {
838         // NOTE: at this point after_bb has not been
839         // inserted into any parent, and thus it will not
840         // be cleaned up automatically. Do it manually.
841         after_bb->deleteValue();
842 
843         throw;
844     }
845 
846     // Get a reference to the current block for later use,
847     // and insert the "after loop" block.
848     auto loop_end_bb = builder.GetInsertBlock();
849     f->getBasicBlockList().push_back(after_bb);
850 
851     // Insert the conditional branch into the end of loop_end_bb.
852     builder.CreateCondBr(cmp, loop_bb, after_bb);
853 
854     // Any new code will be inserted in after_bb.
855     builder.SetInsertPoint(after_bb);
856 
857     // Add a new entry to the PHI node for the backedge.
858     cur->addIncoming(cmp, loop_end_bb);
859 }
860 
861 // Helper to compute sin and cos simultaneously.
llvm_sincos(llvm_state & s,llvm::Value * x)862 std::pair<llvm::Value *, llvm::Value *> llvm_sincos(llvm_state &s, llvm::Value *x)
863 {
864     auto &context = s.context();
865 
866 #if defined(HEYOKA_HAVE_REAL128)
867     // Determine the scalar type of the vector arguments.
868     auto *x_t = x->getType()->getScalarType();
869 
870     if (x_t == llvm::Type::getFP128Ty(context)) {
871         // NOTE: for __float128 we cannot use the intrinsics, we need
872         // to call an external function.
873         auto &builder = s.builder();
874 
875         // Convert the vector argument to scalars.
876         auto x_scalars = vector_to_scalars(builder, x);
877 
878         // Execute the sincosq() function on the scalar values and store
879         // the results in res_scalars.
880         // NOTE: need temp storage because sincosq uses pointers
881         // for output values.
882         auto s_all = builder.CreateAlloca(x_t);
883         auto c_all = builder.CreateAlloca(x_t);
884         std::vector<llvm::Value *> res_sin, res_cos;
885         for (decltype(x_scalars.size()) i = 0; i < x_scalars.size(); ++i) {
886             llvm_invoke_external(
887                 s, "sincosq", builder.getVoidTy(), {x_scalars[i], s_all, c_all},
888                 {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn});
889 
890             res_sin.emplace_back(builder.CreateLoad(s_all));
891             res_cos.emplace_back(builder.CreateLoad(c_all));
892         }
893 
894         // Reconstruct the return value as a vector.
895         return {scalars_to_vector(builder, res_sin), scalars_to_vector(builder, res_cos)};
896     } else {
897 #endif
898         if (auto vec_t = llvm::dyn_cast<llvm_vector_type>(x->getType())) {
899             // NOTE: although there exists a SLEEF function for computing sin/cos
900             // at the same time, we cannot use it directly because it returns a pair
901             // of SIMD vectors rather than a single one and that does not play
902             // well with the calling conventions. In theory we could write a wrapper
903             // for these sincos functions using pointers for output values,
904             // but compiling such a wrapper requires correctly
905             // setting up the SIMD compilation flags. Perhaps we can consider this in the
906             // future to improve performance.
907             const auto sfn_sin = sleef_function_name(context, "sin", vec_t->getElementType(),
908                                                      boost::numeric_cast<std::uint32_t>(vec_t->getNumElements()));
909             const auto sfn_cos = sleef_function_name(context, "cos", vec_t->getElementType(),
910                                                      boost::numeric_cast<std::uint32_t>(vec_t->getNumElements()));
911 
912             if (!sfn_sin.empty() && !sfn_cos.empty()) {
913                 auto ret_sin = llvm_invoke_external(
914                     s, sfn_sin, vec_t, {x},
915                     // NOTE: in theory we may add ReadNone here as well,
916                     // but for some reason, at least up to LLVM 10,
917                     // this causes strange codegen issues. Revisit
918                     // in the future.
919                     {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn});
920 
921                 auto ret_cos = llvm_invoke_external(
922                     s, sfn_cos, vec_t, {x},
923                     // NOTE: in theory we may add ReadNone here as well,
924                     // but for some reason, at least up to LLVM 10,
925                     // this causes strange codegen issues. Revisit
926                     // in the future.
927                     {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn});
928 
929                 return {ret_sin, ret_cos};
930             }
931         }
932 
933         // Compute sin and cos via intrinsics.
934         auto *sin_x = llvm_invoke_intrinsic(s, "llvm.sin", {x->getType()}, {x});
935         auto *cos_x = llvm_invoke_intrinsic(s, "llvm.cos", {x->getType()}, {x});
936 
937         return {sin_x, cos_x};
938 #if defined(HEYOKA_HAVE_REAL128)
939     }
940 #endif
941 }
942 
943 // Helper to compute abs(x_v).
llvm_abs(llvm_state & s,llvm::Value * x_v)944 llvm::Value *llvm_abs(llvm_state &s, llvm::Value *x_v)
945 {
946 #if defined(HEYOKA_HAVE_REAL128)
947     // Determine the scalar type of the vector argument.
948     auto *x_t = x_v->getType()->getScalarType();
949 
950     if (x_t == llvm::Type::getFP128Ty(s.context())) {
951         return call_extern_vec(s, x_v, "fabsq");
952     } else {
953 #endif
954         return llvm_invoke_intrinsic(s, "llvm.fabs", {x_v->getType()}, {x_v});
955 #if defined(HEYOKA_HAVE_REAL128)
956     }
957 #endif
958 }
959 
960 // Helper to reduce x modulo y, that is, to compute:
961 // x - y * floor(x / y).
llvm_modulus(llvm_state & s,llvm::Value * x,llvm::Value * y)962 llvm::Value *llvm_modulus(llvm_state &s, llvm::Value *x, llvm::Value *y)
963 {
964     auto &builder = s.builder();
965 
966 #if defined(HEYOKA_HAVE_REAL128)
967     // Determine the scalar type of the vector arguments.
968     auto *x_t = x->getType()->getScalarType();
969 
970     auto &context = s.context();
971 
972     if (x_t == llvm::Type::getFP128Ty(context)) {
973         return call_extern_vec(s, x, y, "heyoka_modulus128");
974     } else {
975 #endif
976         auto quo = builder.CreateFDiv(x, y);
977         auto fl_quo = llvm_invoke_intrinsic(s, "llvm.floor", {quo->getType()}, {quo});
978 
979         return builder.CreateFSub(x, builder.CreateFMul(y, fl_quo));
980 #if defined(HEYOKA_HAVE_REAL128)
981     }
982 #endif
983 }
984 
985 // Minimum value, floating-point arguments. Implemented as std::min():
986 // return (b < a) ? b : a;
llvm_min(llvm_state & s,llvm::Value * a,llvm::Value * b)987 llvm::Value *llvm_min(llvm_state &s, llvm::Value *a, llvm::Value *b)
988 {
989     auto &builder = s.builder();
990 
991     return builder.CreateSelect(builder.CreateFCmpOLT(b, a), b, a);
992 }
993 
994 // Maximum value, floating-point arguments. Implemented as std::max():
995 // return (a < b) ? b : a;
llvm_max(llvm_state & s,llvm::Value * a,llvm::Value * b)996 llvm::Value *llvm_max(llvm_state &s, llvm::Value *a, llvm::Value *b)
997 {
998     auto &builder = s.builder();
999 
1000     return builder.CreateSelect(builder.CreateFCmpOLT(a, b), b, a);
1001 }
1002 
1003 // Branchless sign function.
1004 // NOTE: requires FP value.
llvm_sgn(llvm_state & s,llvm::Value * val)1005 llvm::Value *llvm_sgn(llvm_state &s, llvm::Value *val)
1006 {
1007     assert(val != nullptr);
1008     assert(val->getType()->getScalarType()->isFloatingPointTy());
1009 
1010     auto &builder = s.builder();
1011 
1012     // Build the zero constant.
1013     auto zero = llvm::Constant::getNullValue(val->getType());
1014 
1015     // Run the comparisons.
1016     auto cmp0 = builder.CreateFCmpOLT(zero, val);
1017     auto cmp1 = builder.CreateFCmpOLT(val, zero);
1018 
1019     // Convert to int32.
1020     llvm::Type *int_type;
1021     if (auto *v_t = llvm::dyn_cast<llvm_vector_type>(cmp0->getType())) {
1022         int_type = make_vector_type(builder.getInt32Ty(), boost::numeric_cast<std::uint32_t>(v_t->getNumElements()));
1023     } else {
1024         int_type = builder.getInt32Ty();
1025     }
1026     auto icmp0 = builder.CreateZExt(cmp0, int_type);
1027     auto icmp1 = builder.CreateZExt(cmp1, int_type);
1028 
1029     // Compute and return the result.
1030     return builder.CreateSub(icmp0, icmp1);
1031 }
1032 
1033 // Two-argument arctan.
1034 // NOTE: requires FP values of the same type.
llvm_atan2(llvm_state & s,llvm::Value * y,llvm::Value * x)1035 llvm::Value *llvm_atan2(llvm_state &s, llvm::Value *y, llvm::Value *x)
1036 {
1037     assert(y != nullptr);
1038     assert(x != nullptr);
1039     assert(y->getType() == x->getType());
1040     assert(y->getType()->getScalarType()->isFloatingPointTy());
1041 
1042     auto &context = s.context();
1043 
1044     // Determine the scalar type of the vector arguments.
1045     auto *x_t = x->getType()->getScalarType();
1046 
1047 #if defined(HEYOKA_HAVE_REAL128)
1048     if (x_t == llvm::Type::getFP128Ty(context)) {
1049         return call_extern_vec(s, y, x, "atan2q");
1050     } else {
1051 #endif
1052         if (x_t == to_llvm_type<double>(context)) {
1053             if (auto vec_t = llvm::dyn_cast<llvm_vector_type>(x->getType())) {
1054                 if (const auto sfn = sleef_function_name(context, "atan2", vec_t->getElementType(),
1055                                                          boost::numeric_cast<std::uint32_t>(vec_t->getNumElements()));
1056                     !sfn.empty()) {
1057                     return llvm_invoke_external(
1058                         s, sfn, vec_t, {y, x},
1059                         // NOTE: in theory we may add ReadNone here as well,
1060                         // but for some reason, at least up to LLVM 10,
1061                         // this causes strange codegen issues. Revisit
1062                         // in the future.
1063                         {llvm::Attribute::NoUnwind, llvm::Attribute::Speculatable, llvm::Attribute::WillReturn});
1064                 }
1065             }
1066 
1067             return call_extern_vec(s, y, x, "atan2");
1068         } else if (x_t == to_llvm_type<long double>(context)) {
1069             return call_extern_vec(s, y, x,
1070 #if defined(_MSC_VER)
1071                                    // NOTE: it seems like the MSVC stdlib does not have an atan2l function,
1072                                    // because LLVM complains about the symbol "atan2l" not being
1073                                    // defined. Hence, use our own wrapper instead.
1074                                    "heyoka_atan2l"
1075 #else
1076                                "atan2l"
1077 #endif
1078             );
1079             // LCOV_EXCL_START
1080         } else {
1081             throw std::invalid_argument(
1082                 "Invalid floating-point type encountered in the LLVM implementation of atan2()");
1083         }
1084         // LCOV_EXCL_STOP
1085 #if defined(HEYOKA_HAVE_REAL128)
1086     }
1087 #endif
1088 }
1089 
1090 namespace
1091 {
1092 
1093 // Add a function to count the number of sign changes in the coefficients
1094 // of a polynomial of degree n. The coefficients are SIMD vectors of size batch_size
1095 // and scalar type T.
1096 template <typename T>
llvm_add_csc_impl(llvm_state & s,std::uint32_t n,std::uint32_t batch_size)1097 llvm::Function *llvm_add_csc_impl(llvm_state &s, std::uint32_t n, std::uint32_t batch_size)
1098 {
1099     assert(batch_size > 0u);
1100 
1101     // Overflow check: we need to be able to index
1102     // into the array of coefficients.
1103     // LCOV_EXCL_START
1104     if (n == std::numeric_limits<std::uint32_t>::max()
1105         || batch_size > std::numeric_limits<std::uint32_t>::max() / (n + 1u)) {
1106         throw std::overflow_error("Overflow detected while adding a sign changes counter function");
1107     }
1108     // LCOV_EXCL_STOP
1109 
1110     auto &md = s.module();
1111     auto &builder = s.builder();
1112     auto &context = s.context();
1113 
1114     // Fetch the floating-point type.
1115     auto tp = to_llvm_vector_type<T>(context, batch_size);
1116 
1117     // Fetch the function name.
1118     const auto fname = "heyoka_csc_degree_{}_{}"_format(n, llvm_mangle_type(tp));
1119 
1120     // The function arguments:
1121     // - pointer to the return value,
1122     // - pointer to the array of coefficients.
1123     // NOTE: both pointers are to the scalar counterparts
1124     // of the vector types, so that we can call this from regular
1125     // C++ code.
1126     std::vector<llvm::Type *> fargs{llvm::PointerType::getUnqual(builder.getInt32Ty()),
1127                                     llvm::PointerType::getUnqual(to_llvm_type<T>(context))};
1128 
1129     // Try to see if we already created the function.
1130     auto f = md.getFunction(fname);
1131 
1132     if (f == nullptr) {
1133         // The function was not created before, do it now.
1134 
1135         // Fetch the current insertion block.
1136         auto orig_bb = builder.GetInsertBlock();
1137 
1138         // The return type is void.
1139         auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
1140         // Create the function
1141         f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, fname, &md);
1142         assert(f != nullptr);
1143 
1144         // Fetch the necessary function arguments.
1145         auto out_ptr = f->args().begin();
1146         out_ptr->setName("out_ptr");
1147         out_ptr->addAttr(llvm::Attribute::NoCapture);
1148         out_ptr->addAttr(llvm::Attribute::NoAlias);
1149         out_ptr->addAttr(llvm::Attribute::WriteOnly);
1150 
1151         auto cf_ptr = f->args().begin() + 1;
1152         cf_ptr->setName("cf_ptr");
1153         cf_ptr->addAttr(llvm::Attribute::NoCapture);
1154         cf_ptr->addAttr(llvm::Attribute::NoAlias);
1155         cf_ptr->addAttr(llvm::Attribute::ReadOnly);
1156 
1157         // Create a new basic block to start insertion into.
1158         builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", f));
1159 
1160         // Fetch the type for storing the last_nz_idx variable.
1161         auto last_nz_idx_t = make_vector_type(builder.getInt32Ty(), batch_size);
1162 
1163         // The initial last nz idx is zero for all batch elements.
1164         auto last_nz_idx = builder.CreateAlloca(last_nz_idx_t);
1165         builder.CreateStore(llvm::Constant::getNullValue(last_nz_idx_t), last_nz_idx);
1166 
1167         // NOTE: last_nz_idx is an index into the poly coefficient vector. Thus, in batch
1168         // mode, when loading from a vector of indices, we will have to apply an offset.
1169         // For instance, for batch_size = 4 and last_nz_idx = [0, 1, 1, 2], the actual
1170         // memory indices to load the scalar coefficients from are:
1171         // - 0 * 4 + 0 = 0
1172         // - 1 * 4 + 1 = 5
1173         // - 1 * 4 + 2 = 6
1174         // - 2 * 4 + 3 = 11.
1175         // That is, last_nz_idx * batch_size + offset, where offset is [0, 1, 2, 3].
1176         llvm::Value *offset;
1177         if (batch_size == 1u) {
1178             // In scalar mode the offset is simply zero.
1179             offset = builder.getInt32(0);
1180         } else {
1181             offset = llvm::UndefValue::get(make_vector_type(builder.getInt32Ty(), batch_size));
1182             for (std::uint32_t i = 0; i < batch_size; ++i) {
1183                 offset = builder.CreateInsertElement(offset, builder.getInt32(i), i);
1184             }
1185         }
1186 
1187         // Init the vector of coefficient pointers with the base pointer value.
1188         auto cf_ptr_v = vector_splat(builder, cf_ptr, batch_size);
1189 
1190         // Init the return value with zero.
1191         auto retval = builder.CreateAlloca(last_nz_idx_t);
1192         builder.CreateStore(llvm::Constant::getNullValue(last_nz_idx_t), retval);
1193 
1194         // The iteration range is [1, n].
1195         llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(n + 1u), [&](llvm::Value *cur_n) {
1196             // Load the current poly coefficient(s).
1197             auto cur_cf = load_vector_from_memory(
1198                 builder, builder.CreateInBoundsGEP(cf_ptr, {builder.CreateMul(cur_n, builder.getInt32(batch_size))}),
1199                 batch_size);
1200 
1201             // Load the last nonzero coefficient(s).
1202             auto last_nz_ptr_idx = builder.CreateAdd(
1203                 offset, builder.CreateMul(builder.CreateLoad(last_nz_idx),
1204                                           vector_splat(builder, builder.getInt32(batch_size), batch_size)));
1205             auto last_nz_ptr = builder.CreateInBoundsGEP(cf_ptr_v, {last_nz_ptr_idx});
1206             auto last_nz_cf = batch_size > 1u
1207                                   ? gather_vector_from_memory(builder, cur_cf->getType(), last_nz_ptr, alignof(T))
1208                                   : static_cast<llvm::Value *>(builder.CreateLoad(last_nz_ptr));
1209 
1210             // Compute the sign of the current coefficient(s).
1211             auto cur_sgn = llvm_sgn(s, cur_cf);
1212 
1213             // Compute the sign of the last nonzero coefficient(s).
1214             auto last_nz_sgn = llvm_sgn(s, last_nz_cf);
1215 
1216             // Add them and check if the result is zero (this indicates a sign change).
1217             auto cmp = builder.CreateICmpEQ(builder.CreateAdd(cur_sgn, last_nz_sgn),
1218                                             llvm::Constant::getNullValue(cur_sgn->getType()));
1219 
1220             // We also need to check if last_nz_sgn is zero. If that is the case, it means
1221             // we haven't found any nonzero coefficient yet for the polynomial and we must
1222             // not modify retval yet.
1223             auto zero_cmp = builder.CreateICmpEQ(last_nz_sgn, llvm::Constant::getNullValue(last_nz_sgn->getType()));
1224             cmp = builder.CreateSelect(zero_cmp, llvm::Constant::getNullValue(cmp->getType()), cmp);
1225 
1226             // Update retval.
1227             builder.CreateStore(builder.CreateAdd(builder.CreateLoad(retval), builder.CreateZExt(cmp, last_nz_idx_t)),
1228                                 retval);
1229 
1230             // Update last_nz_idx.
1231             builder.CreateStore(
1232                 builder.CreateSelect(builder.CreateICmpEQ(cur_sgn, llvm::Constant::getNullValue(cur_sgn->getType())),
1233                                      builder.CreateLoad(last_nz_idx), vector_splat(builder, cur_n, batch_size)),
1234                 last_nz_idx);
1235         });
1236 
1237         // Store the result.
1238         store_vector_to_memory(builder, out_ptr, builder.CreateLoad(retval));
1239 
1240         // Return.
1241         builder.CreateRetVoid();
1242 
1243         // Verify.
1244         s.verify_function(f);
1245 
1246         // Restore the original insertion block.
1247         builder.SetInsertPoint(orig_bb);
1248     } else {
1249         // LCOV_EXCL_START
1250         // The function was created before. Check if the signatures match.
1251         if (!compare_function_signature(f, builder.getVoidTy(), fargs)) {
1252             throw std::invalid_argument(
1253                 "Inconsistent function signature for the sign changes counter function detected");
1254         }
1255         // LCOV_EXCL_STOP
1256     }
1257 
1258     return f;
1259 }
1260 
1261 } // namespace
1262 
llvm_add_csc_dbl(llvm_state & s,std::uint32_t n,std::uint32_t batch_size)1263 llvm::Function *llvm_add_csc_dbl(llvm_state &s, std::uint32_t n, std::uint32_t batch_size)
1264 {
1265     return llvm_add_csc_impl<double>(s, n, batch_size);
1266 }
1267 
llvm_add_csc_ldbl(llvm_state & s,std::uint32_t n,std::uint32_t batch_size)1268 llvm::Function *llvm_add_csc_ldbl(llvm_state &s, std::uint32_t n, std::uint32_t batch_size)
1269 {
1270     return llvm_add_csc_impl<long double>(s, n, batch_size);
1271 }
1272 
1273 #if defined(HEYOKA_HAVE_REAL128)
1274 
llvm_add_csc_f128(llvm_state & s,std::uint32_t n,std::uint32_t batch_size)1275 llvm::Function *llvm_add_csc_f128(llvm_state &s, std::uint32_t n, std::uint32_t batch_size)
1276 {
1277     return llvm_add_csc_impl<mppp::real128>(s, n, batch_size);
1278 }
1279 
1280 #endif
1281 
1282 namespace
1283 {
1284 
1285 // Variable template for the constant pi at different levels of precision.
1286 template <typename T>
1287 const auto inv_kep_E_pi = boost::math::constants::pi<T>();
1288 
1289 #if defined(HEYOKA_HAVE_REAL128)
1290 
1291 template <>
1292 const mppp::real128 inv_kep_E_pi<mppp::real128> = mppp::pi_128;
1293 
1294 #endif
1295 
1296 // Implementation of the inverse Kepler equation.
1297 template <typename T>
llvm_add_inv_kep_E_impl(llvm_state & s,std::uint32_t batch_size)1298 llvm::Function *llvm_add_inv_kep_E_impl(llvm_state &s, std::uint32_t batch_size)
1299 {
1300     using std::nextafter;
1301 
1302     assert(batch_size > 0u);
1303 
1304     auto &md = s.module();
1305     auto &builder = s.builder();
1306     auto &context = s.context();
1307 
1308     // Fetch the floating-point type.
1309     auto tp = to_llvm_vector_type<T>(context, batch_size);
1310 
1311     // Fetch the function name.
1312     const auto fname = "heyoka_inv_kep_E_{}"_format(llvm_mangle_type(tp));
1313 
1314     // The function arguments:
1315     // - eccentricity,
1316     // - mean anomaly.
1317     std::vector<llvm::Type *> fargs{tp, tp};
1318 
1319     // Try to see if we already created the function.
1320     auto f = md.getFunction(fname);
1321 
1322     if (f == nullptr) {
1323         // The function was not created before, do it now.
1324 
1325         // Fetch the current insertion block.
1326         auto orig_bb = builder.GetInsertBlock();
1327 
1328         // The return type is tp.
1329         auto *ft = llvm::FunctionType::get(tp, fargs, false);
1330         // Create the function
1331         f = llvm::Function::Create(ft, llvm::Function::InternalLinkage, fname, &md);
1332         assert(f != nullptr);
1333 
1334         // Fetch the necessary function arguments.
1335         auto ecc = f->args().begin();
1336         auto M_arg = f->args().begin() + 1;
1337 
1338         // Create a new basic block to start insertion into.
1339         builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", f));
1340 
1341         // Create the return value.
1342         auto retval = builder.CreateAlloca(tp);
1343 
1344         // Reduce M modulo 2*pi.
1345         auto M = llvm_modulus(s, M_arg, vector_splat(builder, codegen<T>(s, number{2 * inv_kep_E_pi<T>}), batch_size));
1346 
1347         // Compute the initial guess from the usual elliptic expansion
1348         // to the third order in eccentricities:
1349         // E = M + e*sin(M) + e**2*sin(M)*cos(M) + e**3*sin(M)*(3/2*cos(M)**2 - 1/2) + ...
1350         auto [sin_M, cos_M] = llvm_sincos(s, M);
1351         // e*sin(M).
1352         auto e_sin_M = builder.CreateFMul(ecc, sin_M);
1353         // e*cos(M).
1354         auto e_cos_M = builder.CreateFMul(ecc, cos_M);
1355         // e**2.
1356         auto e2 = builder.CreateFMul(ecc, ecc);
1357         // cos(M)**2.
1358         auto cos_M_2 = builder.CreateFMul(cos_M, cos_M);
1359 
1360         // 3/2 and 1/2 constants.
1361         auto c_3_2 = vector_splat(builder, codegen<T>(s, number{T(3) / 2}), batch_size);
1362         auto c_1_2 = vector_splat(builder, codegen<T>(s, number{T(1) / 2}), batch_size);
1363 
1364         // M + e*sin(M).
1365         auto tmp1 = builder.CreateFAdd(M, e_sin_M);
1366         // e**2*sin(M)*cos(M).
1367         auto tmp2 = builder.CreateFMul(e_sin_M, e_cos_M);
1368         // e**3*sin(M).
1369         auto tmp3 = builder.CreateFMul(e2, e_sin_M);
1370         // 3/2*cos(M)**2 - 1/2.
1371         auto tmp4 = builder.CreateFSub(builder.CreateFMul(c_3_2, cos_M_2), c_1_2);
1372 
1373         // Put it together.
1374         auto ig1 = builder.CreateFAdd(tmp1, tmp2);
1375         auto ig2 = builder.CreateFMul(tmp3, tmp4);
1376         auto ig = builder.CreateFAdd(ig1, ig2);
1377 
1378         // Make extra sure the initial guess is in the [0, 2*pi) range.
1379         auto lb = vector_splat(builder, codegen<T>(s, number{0.}), batch_size);
1380         auto ub = vector_splat(builder, codegen<T>(s, number{nextafter(2 * inv_kep_E_pi<T>, T(0))}), batch_size);
1381         ig = llvm_max(s, ig, lb);
1382         ig = llvm_min(s, ig, ub);
1383 
1384         // Store it.
1385         builder.CreateStore(ig, retval);
1386 
1387         // Create the counter.
1388         auto *counter = builder.CreateAlloca(builder.getInt32Ty());
1389         builder.CreateStore(builder.getInt32(0), counter);
1390 
1391         // Variables to store sin(E) and cos(E).
1392         auto sin_E = builder.CreateAlloca(tp);
1393         auto cos_E = builder.CreateAlloca(tp);
1394 
1395         // Write the initial values for sin_E and cos_E.
1396         auto sin_cos_E = llvm_sincos(s, builder.CreateLoad(retval));
1397         builder.CreateStore(sin_cos_E.first, sin_E);
1398         builder.CreateStore(sin_cos_E.second, cos_E);
1399 
1400         // Variable to hold the value of f(E) = E - e*sin(E) - M.
1401         auto fE = builder.CreateAlloca(tp);
1402         // Helper to compute f(E).
1403         auto fE_compute = [&]() {
1404             auto ret = builder.CreateFMul(ecc, builder.CreateLoad(sin_E));
1405             ret = builder.CreateFSub(builder.CreateLoad(retval), ret);
1406             return builder.CreateFSub(ret, M);
1407         };
1408         // Compute and store the initial value of f(E).
1409         builder.CreateStore(fE_compute(), fE);
1410 
1411         // Define the stopping condition functor.
1412         // NOTE: hard-code this for the time being.
1413         auto max_iter = builder.getInt32(50);
1414         auto loop_cond = [&,
1415                           // NOTE: tolerance is 4 * eps.
1416                           tol = vector_splat(builder, codegen<T>(s, number{std::numeric_limits<T>::epsilon() * 4}),
1417                                              batch_size)]() -> llvm::Value * {
1418             auto c_cond = builder.CreateICmpULT(builder.CreateLoad(counter), max_iter);
1419 
1420             // Keep on iterating as long as abs(f(E)) > tol.
1421             // NOTE: need reduction only in batch mode.
1422             auto tol_check = builder.CreateFCmpOGT(llvm_abs(s, builder.CreateLoad(fE)), tol);
1423             auto tol_cond = (batch_size == 1u) ? tol_check : builder.CreateOrReduce(tol_check);
1424 
1425             // NOTE: this is a way of creating a logical AND.
1426             return builder.CreateSelect(c_cond, tol_cond, llvm::Constant::getNullValue(tol_cond->getType()));
1427         };
1428 
1429         // Run the loop.
1430         llvm_while_loop(s, loop_cond, [&, one_c = vector_splat(builder, codegen<T>(s, number{1.}), batch_size)]() {
1431             // Compute the new value.
1432             auto old_val = builder.CreateLoad(retval);
1433             auto new_val = builder.CreateFDiv(
1434                 builder.CreateLoad(fE), builder.CreateFSub(one_c, builder.CreateFMul(ecc, builder.CreateLoad(cos_E))));
1435             new_val = builder.CreateFSub(old_val, new_val);
1436 
1437             // Bisect if new_val > ub.
1438             // NOTE: '>' is fine here, ub is the maximum allowed value.
1439             auto bcheck = builder.CreateFCmpOGT(new_val, ub);
1440             new_val = builder.CreateSelect(
1441                 bcheck,
1442                 builder.CreateFMul(vector_splat(builder, codegen<T>(s, number{T(1) / 2}), batch_size),
1443                                    builder.CreateFAdd(old_val, ub)),
1444                 new_val);
1445 
1446             // Bisect if new_val < lb.
1447             bcheck = builder.CreateFCmpOLT(new_val, lb);
1448             new_val = builder.CreateSelect(
1449                 bcheck,
1450                 builder.CreateFMul(vector_splat(builder, codegen<T>(s, number{T(1) / 2}), batch_size),
1451                                    builder.CreateFAdd(old_val, lb)),
1452                 new_val);
1453 
1454             // Store the new value.
1455             builder.CreateStore(new_val, retval);
1456 
1457             // Update sin_E/cos_E.
1458             sin_cos_E = llvm_sincos(s, new_val);
1459             builder.CreateStore(sin_cos_E.first, sin_E);
1460             builder.CreateStore(sin_cos_E.second, cos_E);
1461 
1462             // Update f(E).
1463             builder.CreateStore(fE_compute(), fE);
1464 
1465             // Update the counter.
1466             builder.CreateStore(builder.CreateAdd(builder.CreateLoad(counter), builder.getInt32(1)), counter);
1467         });
1468 
1469         // Check the counter.
1470         llvm_if_then_else(
1471             s, builder.CreateICmpEQ(builder.CreateLoad(counter), max_iter),
1472             [&]() {
1473                 llvm_invoke_external(s, "heyoka_inv_kep_E_max_iter", builder.getVoidTy(), {},
1474                                      {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn});
1475             },
1476             []() {});
1477 
1478         // Return the result.
1479         builder.CreateRet(builder.CreateLoad(retval));
1480 
1481         // Verify.
1482         s.verify_function(f);
1483 
1484         // Restore the original insertion block.
1485         builder.SetInsertPoint(orig_bb);
1486     } else {
1487         // The function was created before. Check if the signatures match.
1488         if (!compare_function_signature(f, tp, fargs)) {
1489             throw std::invalid_argument("Inconsistent function signature for the inverse Kepler equation detected");
1490         }
1491     }
1492 
1493     return f;
1494 }
1495 
1496 } // namespace
1497 
llvm_add_inv_kep_E_dbl(llvm_state & s,std::uint32_t batch_size)1498 llvm::Function *llvm_add_inv_kep_E_dbl(llvm_state &s, std::uint32_t batch_size)
1499 {
1500     return llvm_add_inv_kep_E_impl<double>(s, batch_size);
1501 }
1502 
llvm_add_inv_kep_E_ldbl(llvm_state & s,std::uint32_t batch_size)1503 llvm::Function *llvm_add_inv_kep_E_ldbl(llvm_state &s, std::uint32_t batch_size)
1504 {
1505     return llvm_add_inv_kep_E_impl<long double>(s, batch_size);
1506 }
1507 
1508 #if defined(HEYOKA_HAVE_REAL128)
1509 
llvm_add_inv_kep_E_f128(llvm_state & s,std::uint32_t batch_size)1510 llvm::Function *llvm_add_inv_kep_E_f128(llvm_state &s, std::uint32_t batch_size)
1511 {
1512     return llvm_add_inv_kep_E_impl<mppp::real128>(s, batch_size);
1513 }
1514 
1515 #endif
1516 
1517 namespace
1518 {
1519 
1520 // Helper to create a global const array containing
1521 // all binomial coefficients up to (n, n). The coefficients are stored
1522 // as scalars and the return value is a pointer to the first coefficient.
1523 // The array has shape (n + 1, n + 1) and it is stored in row-major format.
1524 template <typename T>
llvm_add_bc_array_impl(llvm_state & s,std::uint32_t n)1525 llvm::Value *llvm_add_bc_array_impl(llvm_state &s, std::uint32_t n)
1526 {
1527     // Overflow check.
1528     // LCOV_EXCL_START
1529     if (n == std::numeric_limits<std::uint32_t>::max()
1530         || (n + 1u) > std::numeric_limits<std::uint32_t>::max() / (n + 1u)) {
1531         throw std::overflow_error("Overflow detected while adding an array of binomial coefficients");
1532     }
1533     // LCOV_EXCL_STOP
1534 
1535     auto &md = s.module();
1536     auto &builder = s.builder();
1537     auto &context = s.context();
1538 
1539     // Fetch the array type.
1540     auto *arr_type
1541         = llvm::ArrayType::get(to_llvm_type<T>(context), boost::numeric_cast<std::uint64_t>((n + 1u) * (n + 1u)));
1542 
1543     // Generate the binomials as constants.
1544     std::vector<llvm::Constant *> bc_const;
1545     for (std::uint32_t i = 0; i <= n; ++i) {
1546         for (std::uint32_t j = 0; j <= n; ++j) {
1547             // NOTE: the Boost implementation requires j <= i. We don't care about
1548             // j > i anyway.
1549             const auto val = (j <= i) ? binomial<T>(i, j) : T(0);
1550             bc_const.push_back(llvm::cast<llvm::Constant>(codegen<T>(s, number{val})));
1551         }
1552     }
1553 
1554     // Create the global array.
1555     auto *bc_const_arr = llvm::ConstantArray::get(arr_type, bc_const);
1556     auto *g_bc_const_arr = new llvm::GlobalVariable(md, bc_const_arr->getType(), true,
1557                                                     llvm::GlobalVariable::InternalLinkage, bc_const_arr);
1558 
1559     // Get out a pointer to the beginning of the array.
1560     return builder.CreateInBoundsGEP(g_bc_const_arr, {builder.getInt32(0), builder.getInt32(0)});
1561 }
1562 
1563 } // namespace
1564 
llvm_add_bc_array_dbl(llvm_state & s,std::uint32_t n)1565 llvm::Value *llvm_add_bc_array_dbl(llvm_state &s, std::uint32_t n)
1566 {
1567     return llvm_add_bc_array_impl<double>(s, n);
1568 }
1569 
llvm_add_bc_array_ldbl(llvm_state & s,std::uint32_t n)1570 llvm::Value *llvm_add_bc_array_ldbl(llvm_state &s, std::uint32_t n)
1571 {
1572     return llvm_add_bc_array_impl<long double>(s, n);
1573 }
1574 
1575 #if defined(HEYOKA_HAVE_REAL128)
1576 
llvm_add_bc_array_f128(llvm_state & s,std::uint32_t n)1577 llvm::Value *llvm_add_bc_array_f128(llvm_state &s, std::uint32_t n)
1578 {
1579     return llvm_add_bc_array_impl<mppp::real128>(s, n);
1580 }
1581 
1582 #endif
1583 
1584 namespace
1585 {
1586 
1587 // RAII helper to temporarily disable fast
1588 // math flags in a builder.
1589 class fmf_disabler
1590 {
1591     ir_builder &m_builder;
1592     llvm::FastMathFlags m_orig_fmf;
1593 
1594 public:
fmf_disabler(ir_builder & b)1595     explicit fmf_disabler(ir_builder &b) : m_builder(b), m_orig_fmf(m_builder.getFastMathFlags())
1596     {
1597         // Reset the fast math flags.
1598         m_builder.setFastMathFlags(llvm::FastMathFlags{});
1599     }
~fmf_disabler()1600     ~fmf_disabler()
1601     {
1602         // Restore the original fast math flags.
1603         m_builder.setFastMathFlags(m_orig_fmf);
1604     }
1605 
1606     fmf_disabler(const fmf_disabler &) = delete;
1607     fmf_disabler(fmf_disabler &&) = delete;
1608 
1609     fmf_disabler &operator=(const fmf_disabler &) = delete;
1610     fmf_disabler &operator=(fmf_disabler &&) = delete;
1611 };
1612 
1613 } // namespace
1614 
1615 // NOTE: see the code in dfloat.hpp for the double-length primitives.
1616 
1617 // Addition.
llvm_dl_add(llvm_state & state,llvm::Value * x_hi,llvm::Value * x_lo,llvm::Value * y_hi,llvm::Value * y_lo)1618 std::pair<llvm::Value *, llvm::Value *> llvm_dl_add(llvm_state &state, llvm::Value *x_hi, llvm::Value *x_lo,
1619                                                     llvm::Value *y_hi, llvm::Value *y_lo)
1620 {
1621     auto &builder = state.builder();
1622 
1623     // Temporarily disable the fast math flags.
1624     fmf_disabler fd(builder);
1625 
1626     auto S = builder.CreateFAdd(x_hi, y_hi);
1627     auto T = builder.CreateFAdd(x_lo, y_lo);
1628     auto e = builder.CreateFSub(S, x_hi);
1629     auto f = builder.CreateFSub(T, x_lo);
1630 
1631     auto t1 = builder.CreateFSub(S, e);
1632     t1 = builder.CreateFSub(x_hi, t1);
1633     auto s = builder.CreateFSub(y_hi, e);
1634     s = builder.CreateFAdd(s, t1);
1635 
1636     t1 = builder.CreateFSub(T, f);
1637     t1 = builder.CreateFSub(x_lo, t1);
1638     auto t = builder.CreateFSub(y_lo, f);
1639     t = builder.CreateFAdd(t, t1);
1640 
1641     s = builder.CreateFAdd(s, T);
1642     auto H = builder.CreateFAdd(S, s);
1643     auto h = builder.CreateFSub(S, H);
1644     h = builder.CreateFAdd(h, s);
1645 
1646     h = builder.CreateFAdd(h, t);
1647     e = builder.CreateFAdd(H, h);
1648     f = builder.CreateFSub(H, e);
1649     f = builder.CreateFAdd(f, h);
1650 
1651     return {e, f};
1652 }
1653 
1654 // Less-than.
llvm_dl_lt(llvm_state & state,llvm::Value * x_hi,llvm::Value * x_lo,llvm::Value * y_hi,llvm::Value * y_lo)1655 llvm::Value *llvm_dl_lt(llvm_state &state, llvm::Value *x_hi, llvm::Value *x_lo, llvm::Value *y_hi, llvm::Value *y_lo)
1656 {
1657     auto &builder = state.builder();
1658 
1659     // Temporarily disable the fast math flags.
1660     fmf_disabler fd(builder);
1661 
1662     auto cond1 = builder.CreateFCmpOLT(x_hi, y_hi);
1663     auto cond2 = builder.CreateFCmpOEQ(x_hi, y_hi);
1664     auto cond3 = builder.CreateFCmpOLT(x_lo, y_lo);
1665     // NOTE: this is a logical AND.
1666     auto cond4 = builder.CreateSelect(cond2, cond3, llvm::ConstantInt::getNullValue(cond3->getType()));
1667     // NOTE: this is a logical OR.
1668     auto cond = builder.CreateSelect(cond1, llvm::ConstantInt::getAllOnesValue(cond4->getType()), cond4);
1669 
1670     return cond;
1671 }
1672 
1673 // Greater-than.
llvm_dl_gt(llvm_state & state,llvm::Value * x_hi,llvm::Value * x_lo,llvm::Value * y_hi,llvm::Value * y_lo)1674 llvm::Value *llvm_dl_gt(llvm_state &state, llvm::Value *x_hi, llvm::Value *x_lo, llvm::Value *y_hi, llvm::Value *y_lo)
1675 {
1676     auto &builder = state.builder();
1677 
1678     // Temporarily disable the fast math flags.
1679     fmf_disabler fd(builder);
1680 
1681     auto cond1 = builder.CreateFCmpOGT(x_hi, y_hi);
1682     auto cond2 = builder.CreateFCmpOEQ(x_hi, y_hi);
1683     auto cond3 = builder.CreateFCmpOGT(x_lo, y_lo);
1684     // NOTE: this is a logical AND.
1685     auto cond4 = builder.CreateSelect(cond2, cond3, llvm::ConstantInt::getNullValue(cond3->getType()));
1686     // NOTE: this is a logical OR.
1687     auto cond = builder.CreateSelect(cond1, llvm::ConstantInt::getAllOnesValue(cond4->getType()), cond4);
1688 
1689     return cond;
1690 }
1691 
1692 // NOTE: this will check that a pointer ptr passed to
1693 // a GEP instruction points, after the removal of vector,
1694 // to a value of type tp. This how the deprecated CreateInBoundsGEP()
1695 // function is implemented.
llvm_depr_GEP_type_check(llvm::Value * ptr,llvm::Type * tp)1696 bool llvm_depr_GEP_type_check(llvm::Value *ptr, llvm::Type *tp)
1697 {
1698     assert(llvm::isa<llvm::PointerType>(ptr->getType())); // LCOV_EXCL_LINE
1699 
1700     return ptr->getType()->getScalarType()->getPointerElementType() == tp;
1701 }
1702 
1703 } // namespace heyoka::detail
1704 
1705 // NOTE: this function will be called by the LLVM implementation
1706 // of the inverse Kepler function when the maximum number of iterations
1707 // is exceeded.
heyoka_inv_kep_E_max_iter()1708 extern "C" HEYOKA_DLL_PUBLIC void heyoka_inv_kep_E_max_iter() noexcept
1709 {
1710     heyoka::detail::get_logger()->warn("iteration limit exceeded while solving the elliptic inverse Kepler equation");
1711 }
1712