1 // Big Number arithmetic.                               A C Norman, 2019-21
2 
3 // To use this, go "#include "arithlib.hpp".
4 
5 #ifndef __arithlib_hpp
6 #define __arithlib_hpp 1
7 
8 // To do:
9 //    Write full documentation! [Meanwhile there is a reasonably extended
10 //     commentary included as comments here, and a file arithtest.cpp that
11 //     can accompany it and illustrate its use]
12 
13 /**************************************************************************
14  * Copyright (C) 2019-21, Codemist.                      A C Norman       *
15  *                                                                        *
16  * Redistribution and use in source and binary forms, with or without     *
17  * modification, are permitted provided that the following conditions are *
18  * met:                                                                   *
19  *                                                                        *
20  *     * Redistributions of source code must retain the relevant          *
21  *       copyright notice, this list of conditions and the following      *
22  *       disclaimer.                                                      *
23  *     * Redistributions in binary form must reproduce the above          *
24  *       copyright notice, this list of conditions and the following      *
25  *       disclaimer in the documentation and/or other materials provided  *
26  *       with the distribution.                                           *
27  *                                                                        *
28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    *
29  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      *
30  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS      *
31  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE         *
32  * COPYRIGHT OWNERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,   *
33  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,   *
34  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS  *
35  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
36  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR  *
37  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF     *
38  * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH   *
39  * DAMAGE.                                                                *
40  *************************************************************************/
41 
42 
43 
44 // There are quite a lot of bignumber packages out there on the web,
45 // but none of them seemed to be such that I could readily use them
46 // for arithmetic within a Lisp at all easily, for instance because of
// the storage management arrangements used.
48 //
49 // This code is for use as a header-only library, so just "#include" it
50 // to be able to use it. All the use of the word "inline" is so that if
51 // you #include this from several source files there will not be trouble when
52 // you link them together: you should still only end up with one copy of
53 // each function in memory. Note that until a few years ago the expectation
54 // was that tagging a function as "inline" was advice to the compiler to
55 // merge its definition into the body of functions that called it. When
56 // that is to be done one wants to put the function definition in a header
57 // file so that all compilation units gain access to it. If it was made
// "static" there would risk being a compiled copy included along with
// every compilation unit, while if it was "extern" it could seem to be
60 // multiply defined. Use of "inline" resolves that, leaving just one
61 // copy in the eventual linked binary. From C++17 onwards data as well as
62 // code can be tagged "inline" in this sense. Until I can be confident that
63 // everybody will use a compiler that supports C++17 I need to go through
64 // quite odd-looking steps so that when this header is included from several
65 // compilation units I do not end up with multiply-defined entities.
66 //
67 // This code uses 64-bit digits and a 2s complement representation for
68 // negative numbers. This means it will work best on 64-bit platforms
69 // (which by now are by far the most important), and it provides bitwise
70 // logical operations (logand and logor) as well as arithmetic. It will work
71 // best where the C++ compiler supports a 128-bit integral type, but does not
72 // need that. It should work on 32-bit systems as well, although one should
73 // expect performance there to be lower.
74 //
75 // The code here tried to arrange that any operations that might overflow are
76 // done using unsigned types, because in C++ overflow in signed arithmetic
77 // yields undefined results - ie on some machines the values delivered could
78 // be quite unrelated to the desired ones. This means that I do plenty of
79 // arithmetic rather as
80 //     int c = (int)((unsigned)a + (unsigned)b);
81 // and I rely on the result being as would be seen with natural 2s complement
82 // working. From C++20 onwards this is likely to be guaranteed by the standard,
// but at present it is not, so although this code will work on almost all
// known machines, judged against the standard it is at best relying on
85 // implementation defined behaviour and avoiding undefined behaviour.
86 //
87 // If "softfloat_h" is defined I will suppose that there is a type "float128"
88 // available and I will support conversions to and from that. In part because
89 // this type is not supported in any standard way I will be assuming that the
90 // version I use is as provided by the SoftFloat IEEE Floating-Point Arithmetic
91 // package released by John R. Hauser. That does everything in software so
92 // absolute speed may be modest, but it is nicely portable. For more
93 // information see http://www.jhauser.us/arithmetic/SoftFloat.html.
94 //
95 // I will provide two levels of access and abstraction. At the low level
// a big number is represented as an array of uint64_t digits along with
97 // a size_t value that indicates the number of digits in use. The most
98 // significant digit in any number lives in memory with type uint64_t but
99 // is treated as signed (ie int64_t) in the arithmetic. For the purposes
100 // of the bitwise operations (and, or, xor and not) negative values are
101 // processed as if they had an infinite number of 1 bits above their
102 // most significant stored digit.
103 // If a positive value has a top digit whose unsigned value has its top bit
104 // set then an additional zero digit is used ahead of that, and equivalently
105 // for negative values.
106 //
107 // Vectors to represent numbers are allocated using a function reserve()
108 // which takes an argument indicating how long the number might be. It will
109 // often be necessary to reserve memory in a conservative manner, ie to
110 // allocate more memory than will eventually prove to be needed.
111 // At the end of an arithmetic operation a reserved block of memory can be
112 // passed to abandon() when it is no longer required, or there can be a
113 // call to confirm_size() to establish the exact size that is to be retained.
114 // A variant call confirm_size_x() is used when the vector whose size is being
115 // confirmed is not the one that was most recently allocated: the intent
116 // there was that under some schemes discarding or shortening the most
117 // recently allocated item might be especially easy to implement well.
118 // confirm_size() returns a handle for the vector, not the vector itself.
119 // Depending on build options it is also possible that small integers will be
120 // represented almost directly: such cases will be referred to as fixnums.
121 // So for the benefit of higher levels of abstraction every number is stored
// using a "handle", where the handle can be tested to see if it holds the
123 // value of the number within itself as a fixnum or whether it is a pointer
124 // to a vector of digits. While the code here does not absolutely mandate it,
125 // the expectation is that all vectors will be allocated at addresses that are
126 // a multiple of sizeof(uint64_t) and that means that some low bits in a
127 // handle are available to tag fixnums.
128 //
129 // In addition to numbers I will generate strings (I have code to make a
130 // string representation of a number, with hex, decimal, octal and binary
131 // variants). Space for the string will be allocated using reserve_string()
132 // and finalized using confirm_size_string(), with both of those indicating
133 // sizes in bytes. Note that when you reserve or confirm string size the
134 // length quoted is the number of characters that will be present excluding
135 // any header or termination markers - reserve_string() will allow for the
136 // needs of suchlike.
137 //
138 // A higher level packaging represents numbers using a class Bignum. This
139 // has just one field which will hold a potentially encoded version of a
140 // pointer to a vector that is the number (ie a handle). When the handle
141 // identified a vector the first item in the vector will be a header word
142 // containing length information. That is followed by the uint64_t digits
143 // representing the numeric value. The representation of the header and the
144 // encoding of handles can be configured in one of several ways, these being
145 // intended to provide models of the implementation intended for different
146 // use cases.
147 //
148 // Overall the code has conditional compilation providing for 3 prototype
149 // arrangements. These are MALLOC, NEW and LISP. It is envisaged that some
150 // users of this code will need to modify it to allow it to interface with the
151 // rest of their software, and these three schemes give at least sketches of
152 // various possibilites. The short explanation is that MALLOC uses malloc()
153 // and free() for memory management and does not use fixnums, so that all
154 // numbers (however small) are stored as vectors. This is perhaps the simplest
155 // scheme, if not the highest performance. NEW exploits many more C++ features.
156 // Storage management uses "new" and "delete" at the lowest level, but the
157 // code keeps its own lists of previously used memory blocks in a manner that
158 // greatly reduces the call on C++ memory management work. This version stores
159 // handles that refer to vectors as even numbers and ones that are fixnums
160 // with their bottom bit set, so fixnums are 63 bits wide. The C++ class
161 // Bignum and a range of operator overloads lead to this being a simple
162 // version to use for casual C++ code, and it is the default version built.
163 // LISP is the version that originally motivated me to implement this. It has
164 // a subsidiary configuration option that allows for systems where garbage
165 // collection is or is not conservative. This could be a good starting point
166 // for a bignum system to be used as part of the run-time system for any
167 // language, not just Lisp. However the interface code here is liable to need
168 // detailed review and revision since it mediates between the data structures
169 // used here and whatever is present in the Lisp (or whatever!) that will
170 // use it. I initially developed and tested this using a Lisp called "vsl"
171 // and intend to migrate it for use in "csl". Both of these use low-bit
172 // tagging of data and the precise values for tag bits and their layout
173 // within header words has to be adhered to here, as has the Lisp's ideas
174 // about the way that header words are stored.
175 //
176 // Here is some more information about each scheme:
177 //
178 // MALLOC:
179 //   A bignum with n digits is held in a vector of length n+1, and the
180 //   "encoded pointer" to it is a native pointer to the second element.
181 //   If this pointer is p then the number of words (n) is kept at p[-1]
182 //   and the least significant digit of the number is at p[0]. reserve()
183 //   uses malloc() to obtain space. confirm_size() uses realloc() to trim
184 //   the allocated space, and abandon() maps onto use of free(). This
185 //   uses C rather than C++ memory management because it wants to use realloc
186 //   which is not supported in the tidy C++ world. Performance of the code
187 //   as a whole will be sensitive to the malloc/realloc/free implementation
188 //   on the platform that is in use. To allow for a user who wished to
189 //   customize allocation, all calls to the basic memory allocation primitives
190 //   are made indirectly so that alternative equivalents can be plugged in.
//   Strings are allocated using malloc() and returned as simple nul-terminated
192 //   C strings. They must be released using free() after use.
193 //
194 // NEW:
195 //   A bignum with n digits will be stored in a vector whose size is the
196 //   next power of 2 strictly larger than n. As with the MALLOC case
197 //   the numeric data starts one word into this block and the initial word
198 //   of the block contains a header of length information. Here the header
199 //   is split into two 32-bit parts. One contains the length of the number
200 //   as before (but note that in general that will not fill up the entire
201 //   memory block), the other contains log2(block_size), ie it is a compact
202 //   indication of the size of the block. There will be free-chains for
203 //   blocks of size 2,4,8,... so that abandon() just pushes the released
204 //   memory onto one and reserve() can often merely retrieve a previously
205 //   used block. In most cases confirm_size just needs to write the actual
206 //   length of a number into the header word. When two large numbers are
207 //   subtracted the number of digits in the result may be much smaller than
208 //   the size of block that had to have been reserved. To allow for that sort
209 //   of situation confirm_size() reserves the right to notice cases where used
210 //   size in a block is much smaller than the capacity, and in such cases
211 //   allocate a fresh smaller block and copy information into it, allowing it
212 //   to abandon the overlarge chunk of memory.
213 //   The reference to the vector of digits is held using type intptr_t and
214 //   can be cast to obtain the address of the least significant digit of the
215 //   value. But so that this scheme as a whole provides better performance
216 //   for general users, small integer values will be handled specially. If
//   the "encoded pointer" has its bottom bit set then it represents a 63-bit
218 //   signed value. The intent here is that the class Bignum, by containing
219 //   just one integer-sized field, can be stored and passed around really
220 //   efficiently, and if in its use most arithmetic remains on values that
221 //   fit within 63 bits it will not do much storage allocation at all. If this
222 //   works well it should represent a reasonably convenient and tolerably
223 //   efficient C++ facility for general use.
224 //   Strings live in store allocated using "new char[nnn]" and are returned
225 //   as C style strings that must be disposed of using "delete". The use of
226 //   C rather than C++ style strings because I hope that makes storage
227 //   management interaction clearer.
228 //   In this case there is an extra option DEBUG_OVERRUN which enables some
229 //   simple checks for memory block overflow. reserve() always arranges that
230 //   there will be a "spare" word just beyond the top used word in a vector,
231 //   and it initializes this to a slightly unlikely value. When confirm_size
232 //   or abandon() are used it can then verify that this guard word has not
//   been corrupted. This may not be 100% foolproof but it nevertheless helps
234 //   while developing or maintaining the library!
235 //
236 // LISP:
237 //   The arrangements here are based on the arrangements I have in my VSL
238 //   and CSL Lisp implementations. I still hope that between the above options
239 //   and this one the code can be adapted reasonably easily. As before the
240 //   basic representation of a number with n digits is a vector of length
241 //   n+1, with the initial word containing a header. In VSL/CSL a header word
//   contains some tag bits identifying it as a header, then some type
//   bits that here will indicate that it is a header of a big number. Finally
244 //   it contains a length field. The exact bit-patterns and packing here will
245 //   be specific to the particular Lisp (obviously!). A reference to a big
246 //   number will be the address of the header word of this vector plus some
247 //   tag bits in the bottom 3 bits. This "low tagging" relies on all block
248 //   of memory being aligned to 8-byte boundaries (even on 32-bit platforms).
249 //   On a 32-bit system the header will only occupy the first 32-bits of the
250 //   initial 64-bit memory unit, and the second 32-bit region is spare and
251 //   would be best set to zero.
252 //   There are two expectations about memory management. The first is that
253 //   garbage collection has left a large block of contiguous memory within
254 //   which new material can be allocated linearly. Under this supposition the
255 //   most recently allocated block of memory can be shrunk or discarded by
256 //   simply resetting a heap-fringe pointer. The second is that it will
257 //   at least occasionally be desirable to perform linear scans of all memory.
258 //   To support that when a block that is not the most recently allocated one
259 //   is shrunk or discarded a header word is placed in the released space
260 //   leaving a valid but dummy Lisp item there.
//   Those issues motivate the distinction between confirm_size and
262 //   confirm_size_x. [Note that the implementation may not (yet) do all that
263 //   I intended in that respect!]
264 //   Usually calls to memory allocation primitives are made without any special
265 //   concern for garbage collector safety of other pointers, and so in its
266 //   current form this code insists on running in a context where the garbage
267 //   collector is conservative, so that for instance the untagged pointers to
268 //   raw vectors of digits are safe. This aspect of the code may well limit
//   its direct usefulness. So to allow for a system that uses a precise
270 //   garbage collector I allow for a "#define PRECISE_GC 1" option and in
271 //   that case whenever calls to memory allocation are performed within the
272 //   low-level code I will use functions "push()" and "pop()" and
273 //   expect that they save values somewhere that the garbage collector can
274 //   find. Note that this scheme does not automate keeping large bignum
275 //   calculations expressed via operator overloading safe! It is mostly aimed
276 //   at making the low level code usable. A typical case where push() and
277 //   pop() are needed will be the code to multiply two big numbers. That can
278 //   work out how large its result will be and then needs to call reserve()
279 //   to get space for it. Across the call to reserve it will need to push
280 //   its two arguments, because a copying garbage collector might relocate
281 //   them.
282 //   Further though: having "push and pop" suggests that a potentially large
283 //   number of items might need to be saved. I suspect that the most I can
284 //   ever encounter here will be perhaps 3. So having a fixed number of
285 //   dedicated list basis to stash things in might be nicer.
286 //   In Lisp mode it is anticipated that as well as a tagged representation
//   of small integers that the generic arithmetic will need to support
//   floating point numbers (and possibly multiple widths of floating point
//   values, plus ratios and complex numbers) and so the dispatch on tagged
//   numbers needs to live at a higher level within the Lisp than just this
291 //   code. Thus while the big-number functions here are set up so they can
292 //   return fixnum results and while there are entrypoints for performing
293 //   arithmetic between bignums and fixnums  (ie between uint64_t * and
294 //   int64_t values) it is the responsibility of somebody else to decide which
295 //   functions to call when.
296 //   Strings are allocated using reserve_string() and finalized using
297 //   confirm_size_string. For Lisp purposes they will need to have a header
298 //   word containing the string length.
299 //
300 // It might be helpful if I provide my own thoughts about when you might decide
301 // to use this code and when you will probably not. Wikipedia lists rather
302 // a large number of arbitrary precision arithmetic packages on the web page
303 // en.wikipedia.org/wiki/List_of_arbitrary-precision_arithmetic_software.
// As well as free-standing libraries a range of programming languages feature
305 // big-number arithmetic as a standard feature. It may be fair to suggest
306 // that for use from C++ the most visible option is GMP with some users liking
307 // to use it via Boost. Given a view that GMP is the market leader I will
308 // set out some points comparing arithlib with it.
309 // First GMP is well established, it aims for top performance, it has fast
310 // algorithms for huge arithmetic as well as for sane-sized numbers. In
311 // contrast arithlib is new and neither well established nor truly heavily
312 // tested. It does not even try to provide algorithms that will only become
313 // useful for arithmetic on numbers that are many many thousands of digits
314 // (eg FFT-style multiplication). It can thus be expected to be generally
315 // slower than GMP.
316 // However potential advantages of arithlib are
317 // (1) It is subject to a permissive BSD license, while GMP is dual licensed
318 //     under LGPL3 or GPL2. For some users or some projects this may matter!
319 // (2) Rather than having assembly code versions for a wide but finite range
320 //     of particular hosts, arithlib follows the "Trust your Compiler" policy
321 //     and expects that a sufficiently modern C++ compiler will manage to
322 //     generate code close to the performance of all the hand-optimised
323 //     assembly code that GMP uses. This reduces the total size of the
324 //     package substantially and makes building/installing/using it especially
325 //     easy even when a packaged version is not instantly available for
326 //     your machine.
327 // (3) By being a header-only library, arithlib imposes a cost at program
328 //     build time as it all has to be processed by the compiler - but these
329 //     days compile-times are pretty short anyway. And by having all of
330 //     its souce code available when code that uses it is built there are
331 //     potential whole-program optimisations that can be made.
332 // (4) arithlib is coded in C++ not C, and this allows it to leverage features
333 //     of C++11. For instance it can rely on the random number generation
334 //     facilities that C++ provides rather than needing to implement its
335 //     own. There are places within it where template code leads to a neater
336 //     implementation, and the operator overloading scheme that various other
337 //     C++ arithmetic packages provide fits in especially naturally.
338 // (5) My initial motivation for creating arithlib was a need for a big
339 //     arithmetic package to form part of the run-time system of a language
340 //     implementation. arithlib was built with a view to keeping much of the
341 //     memory allocation and management somewhere else, probably supported
342 //     by garbage collection. I found it much harder to see how to arrange
343 //     that the garbage collector in the rest of my run-time system could
344 //     track the memory usage within GMP.
345 // (6) While arithlib is not a totally tiny body of code it is smaller and
346 //     simpler than GMP. When its capabilities cover what is needed and when
347 //     its speed is sufficient I would suggest that "small and tidy is good".
348 //
349 // A key use-case that arithlib is NOT set up to support is arithmetic on
350 // long but fixed precision numbers - this is liable to mean that it will
// not be the technology of choice for a range of cryptography applications!
// The code here has been tested and runs on both 32 and 64-bit machines,
// however its internal workings are almost all expressed in terms of the
354 // type "uint64_t". This may result in there being significant scope for
355 // better specialization for code to run on 32-bit targets.
356 //
357 // What about thread safety? Well the code should be thread-safe.
358 // with the C++ "NEW" option I provide several options and you need to
359 // configure one (at compile time, and by editing this file or adding
360 // overriding predefined symbols), choosing the one you like or that
361 // will run fastest on your platform. Search for "CONCURRENCY SUPPORT" to
362 // find the commentary.
363 //
364 // I have run some VERY SIMPLISTIC benchmark comparisons between this code
365 // and gmp. All that has been tested is the low-level code for multiplying
366 // a pair on N word unsigned integers to get a 2N word result.
367 // On x86_64 and compiling with "g++ -O3" I believe that this test suggests
368 // that up to around 100 words (ie 6400 bits, 2000 decimal digits) the speed
369 // ratio between gmp and arithlib is in the range 1 to 2 (on Ubuntu Linux or
370 // Windows 10 (64-bit)).
371 // Beyond that the use of "TOOM3" by gmp leads to it gradually picking
372 // up advantages, reaching about a factor somewhat over 3 at around 1500
373 // words (100000 bits, 30000 decimal digits).
374 // This benchmark only tests multiplication of equally sized numbers and
// its results will vary noticeably across platforms, and so it is not liable
376 // to be representative of overall results including mixes of all the
377 // operations on mixed-size numbers, but at least it shows something!
378 // For large enough inputs I believe I see multiplication being about 4 times
379 // as costly on a 32-bit platform as on a 64-bit one, a result that is perhaps
380 // no cause for great astonishment!
381 
382 // Those concerned with programming style may be minded to complain about the
383 // fairly large numbers of casts in this code. So here is an attempt to
384 // explain some major causes.
385 // First, in C++ signed arithmetic overflow is undefined behaviour. However I
386 // want to be able to tell when it would arise, and I want to be able to do
387 // a great deal of my big-number implementation using all 64-bits of a wide
388 // integer. Thus both overflow tests and a great deal of the inner arithmetic
// are done using unsigned arithmetic, but both when dealing with negative
// bignums and when interacting with the user I need to have signed integers.
// I frequently cast between signed and unsigned 64-bit values in these
392 // contexts. The code ASSUMES that arithmetic is twos complement. Note that
393 // from C++20 onwards this ceases to be an assumption and is guaranteed by
394 // the standard!
395 // Secondly in many cases I will represent a big number as an object whose
// sole data member is of type std::uintptr_t. This will be interpreted as
397 // a value with its low bit or few bits as tag information and upper bits
398 // as either a pointer (to a vector of digits) or an immediate integer value.
399 // I am ASSUMING that one or more low bits in the representation of a pointer
400 // to an aligned item will be zero in the natural pointer, and that I can
401 // force information into them for tagging. Even though this is not entirely
402 // proper I feel that eg std::align() would not make sense if this was not
403 // going to work! Whatever the risks, using this sort of representation leads
// to many casts between std::uint64_t* and std::uintptr_t, and the
405 // unpacking of a signed value from the immediate integer case leads to
406 // further casts with signed integers, ie std::intptr_t.
// Finally I need at times to generate a bit-field using code such as
408 // (1<<n). If the literal "1" is not introduced using a wide enough type
409 // this can overflow. I might try (1LLU<<n) but I have no absolute guarantee
410 // that "LL" makes a 64-bit integer. I could use UINT64_C(1) and that is not
411 // too bad, but in many places I will in fact write the rather wordy but very
412 // explicit (static_cast<std::uint64_t>(1)<<n).
413 
414 
415 // I provide a default configuration, but by predefining one of the
416 // symbols allow other versions to be built.
417 
// Default configuration: when the user has not pre-selected one of the
// three storage schemes (MALLOC, NEW or LISP) build the version that
// allocates memory via C++ "new", and enable the cheap overrun checks.

#if !defined(MALLOC) && !defined(NEW) && !defined(LISP)
#define NEW           1
#define DEBUG_OVERRUN 1   // Modest cost, and spotting block overruns is good.
#endif
426 
// Before C++17 the only way to place a constant in a header file was as
// something like "static const int VAR = VAL;". That lets the compiler
// substitute the value throughout each compilation unit and, with luck,
// discard the storage, but it tends to provoke a stream of "static
// variable defined but not used" warnings, and if storage is kept there
// will be one copy per compilation unit.
// C++17 introduces inline variables, announced via the mandated
// feature-test macro __cpp_inline_variables, so that writing
//             inline const int VAR = VAL;
// guarantees at most one instance of the variable in the final program
// and should not trigger complaints about unreferenced names. I only use
// this idiom for things that are at least "almost" constant, so that if a
// pre-C++17 build ends up with a distinct static copy in each compilation
// unit no confusion should result.

#if defined __cpp_inline_variables
#define INLINE_VAR       inline
#define HAVE_INLINE_VARS 1
#else
#define INLINE_VAR       static
#endif
451 
452 
453 // Another useful C++17 feature.... with a fallback to a GNU-specific
454 // way of achieving the same through use of C++11 annotations. And a final
455 // fall back to just not worrying.
456 
// Another useful C++17 feature.... with a fallback to a GNU-specific
// way of achieving the same through use of C++11 annotations. And a final
// fall back to just not worrying.
// The attribute must be probed with the function-like macro
// __has_cpp_attribute(maybe_unused), itself queried only after checking
// that __has_cpp_attribute exists: there is no predefined object-like
// macro called "__has_cpp_attribute_maybe_unused", so testing that name
// with #ifdef could never succeed and the [[maybe_unused]] branch would
// have been unreachable.

#ifndef MAYBE_UNUSED
#ifdef __has_cpp_attribute
#if __has_cpp_attribute(maybe_unused)
#define MAYBE_UNUSED [[maybe_unused]]
#endif
#endif
#endif

#ifndef MAYBE_UNUSED
#ifdef __GNUC__
#define MAYBE_UNUSED [[gnu::unused]]
#else
#define MAYBE_UNUSED
#endif
#endif
466 
467 #include <cstdio>
468 #include <cstring>
469 #include <cstdint>
470 #include <cmath>
471 #include <cfloat>
472 #include <cctype>
473 #include <cinttypes>
474 #include <cstdlib>
475 #include <cstdarg>
476 #include <cassert>
477 #include <random>
478 #include <iostream>
479 #include <iomanip>
480 #include <thread>
481 #include <ctime>
482 #include <chrono>
483 #include <utility>
484 #include <string>
485 #include <chrono>
486 #include <mutex>
487 #include <atomic>
488 #include <vector>
489 #include <type_traits>
490 
491 namespace arithlib_implementation
492 {
493 
494 #ifdef CSL
495 // For use within CSL I will provide a single thread-local pointer that
496 // can be accessed fast even on Windows. All value that are to be
497 // thread_local within this library can (eventually) be migrated to live in
498 // a chunk of memory referenced by this.
499 //
500 // The data involved will be
501 // (1) Information related to the worker threads for Karatsuba multiplication.
502 // (2) Information about the modulus used with modular arithmetic.
503 // Note that the allocation of memory when that is done within this library
// rather than outside it may also need to be thread local if the library is
505 // used from a threaded application, but that does not use the special
506 // treatment indicated just here.
507 
508 // While this pointer is defined here it is not at present used! It is for
509 // future work.
510 
511 #define TL_arithlibData 48
512 DEFINE_INLINE_THREAD_LOCAL(void *, arithlibData);
513 #endif // CSL
514 
515 // A scheme "arithlib_assert" lets me write in my own code to print the
516 // diagnostics. To use this version you must include arithlib.hpp after
// any other header files that define assert or abort macros.
518 
519 // I implement things here using #define because I want to exploit
520 // __FILE__ and __LINE__ to report where issues arose.
521 
522 #define STRINGIFY1(x) #x
523 #define STRINGIFY(x) STRINGIFY1(x)
524 
525 // arithlib_abort() mainly exists so I can set a breakpoint on it! Setting one
526 // on the system abort() function sometimes does not give me as much help
527 // as I might have hoped on at least some platforms, while a break-point
528 // on abort() does what I expect. But also the version I have here explains
529 // where it was called from nicely. The location gets passed in as a string
530 // and in a multi-file multi-thread world that could be "interesting", and
531 // so I will have mutex protection for the failure message (getting two
532 // aborts at once would be horrid, and I will be happy if ONE of the reports
// comes out neatly!). The function diagnostic_mutex returns a reference
// to a mutex that can be used to protect things, and there will be just
// one of it. It also sets its argument to a reference to a const char *
536 // pointer that will be used for transmitting the location information.
537 
#ifdef HAVE_INLINE_VARS
// With C++17 inline variables there is exactly one shared instance of
// each of these across the whole program, however many compilation units
// include this header.
inline std::mutex my_diagnostic_mutex;
// BUG FIX: this was declared "inline static", but "static" gives the
// variable internal linkage and hence a separate copy per compilation
// unit, defeating the purpose of the inline variable. Plain "inline"
// matches the mutex above and gives one shared location string.
inline const char *my_diagnostic_location;
#endif // HAVE_INLINE_VARS
542 
// Return a reference to the single mutex that serialises diagnostic
// output, and set *where to the address of the shared const char *
// pointer through which arithlib_abort() passes source-location text
// to abort1().
inline std::mutex &diagnostic_mutex(const char ***where)
{
#ifndef HAVE_INLINE_VARS
// C++11 guarantees that even if this header is included from many
// compilation units there will be a single unique mutex here. It guarantees
// that the mutex will have been constructed (ie initialized) by the time
// an execution path flows past its definition. However there can be
// overhead since the C++ run-time system may protect itself from risk of
// multiple threads triggering initialization at the same time. In doing so
// it may be that the initialization involved not just a simple boolean flag
// but some synchronization primitives. If we have C++17 then inline
// variable declarations achieve pretty well just what I want without this
// mess, so I will use it.
    static std::mutex my_diagnostic_mutex;
    static const char *my_diagnostic_location;
#endif // !HAVE_INLINE_VARS
    *where = &my_diagnostic_location;
    return my_diagnostic_mutex;
}
562 
abort1(const char * msg)563 [[noreturn]] inline void abort1(const char *msg)
564 {   const char **where;
565 // The call to diagnostic_mutex here is just to retrieve the location of the
566 // const char * variable that holds the location. I am already within
567 // the scope of a mutex.
568     static_cast<void>(diagnostic_mutex(&where));
569     std::cout << std::endl << "About to abort at " << *where << ": "
570               << msg << std::endl;
571     std::abort();
572 }
573 
abort1()574 [[noreturn]] inline void abort1()
575 {   const char **where;
576     static_cast<void>(diagnostic_mutex(&where));
577     std::cout << std::endl << "About to abort at " << *where << std::endl;
578     std::abort();
579 }
580 
// This variation on an abort() macro establishes a lock_guard so that only
582 // one part of the code can be aborting at any one time, and sets up
583 // information about the file and line where trouble arose. It could cope
584 // with arbitrary overloads of abort1() and the fact that the location
585 // information is not passed as an extra argument to abort1() is because
586 // of limitations in __VA_ARGS__ in portable code until C++2a.
587 
// because arithlib_abort is a macro it does not live in any particular
589 // namespace
590 
591 #define arithlib_abort(...)                                                \
592     {   const char **where;                                                \
593         std::lock_guard<std::mutex> lock(                                  \
594             arithlib_implementation::diagnostic_mutex(&where));            \
595         *where = __FILE__ " line " STRINGIFY(__LINE__);                    \
596         arithlib_implementation::abort1(__VA_ARGS__);                      \
597     }
598 
599 // The following variable (well constant) enabled "assert" checking. The
600 // effect might be a (probably rather modest) slowdown. However the predefined
601 // macro __OPTIMIZE__ will be set up if any g++ optimizations are in force, so
602 // here I only activate assertions in the case of compilation subject to
603 // "-O0" which will often be associated with "-g".
604 //
605 // However the user can override this by predefining FORCE_DEBUG_ARITH to
// encourage debugging or FORCE_NO_DEBUG_ARITH to discourage it.
607 
608 // Note I make the flags static not inline in case compilation flags for
609 // different files in a multi-file project differ.
610 
611 #if (defined __OPTIMIZE__ && !defined FORCE_DEBUG_ARITH) || \
612     defined FORCE_NO_DEBUG_ARITH
613 static const bool debug_arith = false;
614 #else  // __OPTIMIZE__
615 static const bool debug_arith = true;
616 #endif // __OPTIMIZE__
617 
618 template <typename F>
assert1(bool ok,const char * why,F && action,const char * location)619 inline void assert1(bool ok, const char *why,
620                     F&& action, const char *location)
621 {
622 // Use this as in:
623 //     arithlib_assert(predicate, [&]{...});
624 // where the "..." is an arbitrary sequence of actions to be taken
625 // if the assertion fails. The action will typically be to display
626 // extra information about what went wrong.
627     if (debug_arith && !ok)
628     {   action();
629         const char **where;
630         std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
631         *where = location;
632         abort1();
633     }
634 }
635 
assert1(bool ok,const char * why,const char * location)636 inline void assert1(bool ok, const char *why, const char *location)
637 {
638 // For simple use where a customised message is not required:
639 //     arithlib_assert(predicate);
640     if (debug_arith && !ok)
641     {   const char **where;
642         std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
643         *where = location;
644         abort1(why);
645     }
646 }
647 
648 #define arithlib_assert(...)                                 \
649     arithlib_implementation::assert1(__VA_ARGS__,            \
650                       "arithlib_assert(" #__VA_ARGS__ ")",   \
651                       __FILE__ " line " STRINGIFY(__LINE__))
652 
653 
654 // At times during development it is useful to be able to send messages
655 // to a log file.... This should not be used in final production code
656 // but still may help while debugging.
657 
658 // Making this "inline" avoids warning messages if it is not
659 // used. So even though this may somewhat waste space when it is used,
660 // I like this option.
661 
logprintf(const char * fmt,...)662 inline void logprintf(const char *fmt, ...)
663 {
664 // I use a fixed name for the log file. This is another respect in which
665 // this has to be seen as code only suitable for temporary use. I use static
666 // variables that are local to this function so that even if the header is
667 // included from multiple source files there will not be multiple-definition
668 // clashes, and since this is just for diagnostics I am not worried about
669 // any costs that this imposes.
670     static std::FILE *logfile = NULL;
671     const char **where;
672     std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
673     if (logfile == NULL) logfile = std::fopen("arith.log", "w");
674     std::va_list args;
675     va_start(args, fmt);
676     std::vfprintf(logfile, fmt, args);
677     va_end(args);
678     std::fflush(logfile);
679 }
680 
traceprintf(const char * fmt,...)681 inline void traceprintf(const char *fmt, ...)
682 {   const char **where;
683     std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
684     std::va_list args;
685     va_start(args, fmt);
686     std::vprintf(fmt, args);
687     va_end(args);
688     std::fflush(stdout);
689 }
690 
691 // The C++ rules about shifts are not always very comfortable, in particular
692 // right shifts on signed values may or may not propagate the sign bit
693 // and a left shift on signed values might cause trouble in overflow cases.
694 // So here are versions that should behave consistently across all
695 // architectures. Well it is probable that from C++20 onwards the language
696 // specification will arrange that right shifts on arithmetic types
697 // propagate the sign in the way that I want, and it is also liable to be
698 // the case that g++ and clang on all the computers that interest me do things
699 // the "obvious way" already, so the code here is something of an exercise
700 // in pedantry.
701 
// Arithmetic (sign-propagating) right shift of a 32-bit signed value,
// implemented entirely with unsigned shifts so the behaviour is fully
// defined whatever the compiler does with ">>" on negative operands.
// Shift amounts outside [0,31] are treated as zero.
inline std::int32_t ASR(std::int32_t a, std::int64_t n)
{   constexpr int bits = 8*static_cast<int>(sizeof(std::int32_t));
    if (n < 0 || n >= bits) n = 0;
    std::uint32_t u = static_cast<std::uint32_t>(a);
    std::uint32_t r = u >> n;
// Replicate the top bit into the positions vacated by the shift.
    if (n != 0)
    {   std::uint32_t signbit = u >> (bits - 1);
        r |= (-signbit) << (bits - n);
    }
    return static_cast<std::int32_t>(r);
}
710 
// Arithmetic right shift of a 64-bit signed value with fully defined
// behaviour: the sign bit is propagated into the vacated positions, and
// shift amounts outside [0,63] are treated as zero.
inline std::int64_t ASR(std::int64_t a, std::int64_t n)
{   constexpr int bits = 8*static_cast<int>(sizeof(std::int64_t));
    if (n < 0 || n >= bits) n = 0;
    std::uint64_t u = static_cast<std::uint64_t>(a);
    std::uint64_t r = u >> n;
// Smear the sign bit across the top of the result when shifting.
    if (n != 0)
    {   std::uint64_t signbit = u >> (bits - 1);
        r |= (-signbit) << (bits - n);
    }
    return static_cast<std::int64_t>(r);
}
719 
// The unsigned variant reinterprets its argument as signed so that the
// top bit gets propagated, then converts the result back.
inline std::uint64_t ASR(std::uint64_t a, std::int64_t n)
{   std::int64_t sa = static_cast<std::int64_t>(a);
    return static_cast<std::uint64_t>(ASR(sa, n));
}
723 
724 // The behaviour of left shifts on negative (signed) values seems to be
725 // labelled as undefined in C/C++, so any time I am going to do a left shift
726 // I need to work in an unsigned type. Well at some stage in the future it
727 // may be that C++ will insist that signed integers are handled in 2s
728 // complement form and shifts on them behave "as naively expected" but at
729 // present that can not be relied upon.
730 
731 // In the comparisons I case the sizeof value to an int so that I can compare
732 // it against the signed amount n.
733 
// Left shift of a signed 32-bit value performed in an unsigned type so
// that the overflow cases have defined behaviour. Shift amounts outside
// [0,31] are treated as zero.
inline std::int32_t ASL(std::int32_t a, std::int64_t n)
{   if (n < 0 || n >= 8*static_cast<int>(sizeof(std::uint32_t))) n = 0;
    std::uint32_t u = static_cast<std::uint32_t>(a);
    return static_cast<std::int32_t>(u << n);
}
738 
// 64-bit companion to the 32-bit ASL above: unsigned shift internally,
// out-of-range shift amounts treated as zero.
inline std::int64_t ASL(std::int64_t a, std::int64_t n)
{   if (n < 0 || n >= 8*static_cast<int>(sizeof(std::uint64_t))) n = 0;
    std::uint64_t u = static_cast<std::uint64_t>(a);
    return static_cast<std::int64_t>(u << n);
}
743 
744 //=========================================================================
745 //=========================================================================
746 // I want to make C++ output using the "<<" operator on an ostream cope
747 // with big numbers. Doing so makes their use much smoother. The particular
// aspect of this addressed here is the provision of an IO manipulator
// called "std::bin" that selects binary display of bignums (but not of
// other integer types).
751 //=========================================================================
752 //=========================================================================
753 
// Support for a "binary" output radix. The standard iostream flags only
// cover dec/oct/hex, so an extra per-stream word (allocated once via
// std::ios_base::xalloc) records whether binary output of bignums has
// been requested. Binary display is only produced when none of the
// standard base flags is set, so selecting binary clears all three, and
// restoring any one of them suppresses binary again. A stream on which
// binary has never been requested never shows binary output.
struct radix
{
public:
    static void set_binary_output(std::ios_base &s)
    {   flag(s) = 1;
// Clear every standard base flag: the "no base selected" state is what
// the bignum printing code interprets as a request for binary.
        s.unsetf(std::ios_base::dec);
        s.unsetf(std::ios_base::oct);
        s.unsetf(std::ios_base::hex);
    }
    static bool is_binary_output(std::ios_base &s)
    {   return flag(s) != 0;
    }
private:
// Each stream carries an expandable array of long values for library
// use; xalloc() hands out an index into it, and the function-local
// static ensures the same index is used for every stream.
    static long & flag(std::ios_base &s)
    {   static int n = std::ios_base::xalloc();
        return s.iword(n);
    }
};
778 
779 } // temporary end of namespace arithlib
780 
781 // I want a new io manipulator "std::bin" to select binary mode output.
782 // This will be used much as std::oct, std::dec and std::hex.
783 
// "std::bin" is an io manipulator selecting binary display for bignums,
// used just like std::oct, std::dec and std::hex.
// NOTE(review): adding new declarations to namespace std is formally
// undefined behaviour in standard C++; it is done here only so that the
// manipulator can be spelled std::bin like the built-in ones.
namespace std
{   inline std::ostream &bin(std::ostream &os)
    {   arithlib_implementation::radix::set_binary_output(os);
        return os;
    }
}
790 
791 
792 
793 namespace arithlib_implementation
794 {
795 
796 // When I get to big-integer multiplication I will use two worker threads
797 // so that elapsed times for really large multiplications are reduced
798 // somewhat. Well ideally by a factor approaching 3. I set up a framework
799 // of support for the threads here. Each main program thread will want
800 // its own pair of worker threads here. Each worker thread gets passed a
801 // nice object called "worker_data" that encapsulates the way it receives
802 // data from the caller and passes results back.
803 
804 // Each worker thread needs some data that it shares with the main thread.
805 // this structure encapsulates that.
806 
807 // Probably the most "official" way to coordinate threads would be to use
808 // condition variables, but doing so involves several synchronization
809 // primitives for each step of the transaction. For the simple level of
// coordination I need here it would be more costly than necessary. I can
// manage here with a scheme that when thread A wants to allow thread B to
812 // proceed it unlocks a mutex that thread B was waiting on. There is some
813 // mess associated with ensuring that the main thread waits for results and
814 // that there are no race situations where all threads compete for a single
815 // mutex.
816 //
817 // There are 4 mutexes for each worker thread, but each synchronization step
818 // just involves a single mutex, transferring ownership between main and worker
// thread. Here is the pattern of transfer and subsequent ownership, with "?"
// marking a mutex that has been waiting and the ">n" or <n" in the middle
// also showing which mutex has just been activated:
822 //       X  X  .  .         ?  .  X  X    Idle state. Worker waiting on mutex 0
823 // To start a transaction the main thread sets up data and unlocks mutex 0.
824 // That allows the worker to proceed and pick up the data.
825 //       .  X  .  .   >0    ?X .  X  X    first transaction
826 // The main thread must not alter data until the worker is finished. It waits
827 // on mutex 1 until the worker tells it that a result is available.
828 //       .  X ?X  .   <2    X  .  .  X
// The main thread is now entitled to start using the results of the activity
// just completed and setting up data for the next one. It can not release
// mutex 0 to restart the worker because the worker already owns that. And even
// though it owns mutex 2 it had better not release that, because for that
// to make sense the worker would need to be waiting on it, and that would mean
// the worker had just done m3.unlock(); m3.lock() in quick succession, which
835 // might have led it to grab m3 rather than the main program managing to. So
836 // use the third mutex, which the worker must be waiting on.
837 //       .  .  X  .   >1    X ?X  .  X    second transaction
838 // When it has finished its task the worker now unlocks mutex 3. This leaves
839 // a situation symmetric with the initial one
840 //       .  .  X ?X   <3    X  X  .  .
841 //       .  .  .  X   >2    X  X  ?X .    third transaction
842 //       ?X .  .  X   <0    .  X  X  .
843 //       X  .  .  .   >3    .  X  X ?X    fourth transaction
844 //       X ?X  .  .   <1    .  .  X  X    back in initial configuration!
845 //
846 // The pattern is thus that the master goes
847 //  [initially lock 0 and 1]
848 //  unlock 0  wait 2
849 //  unlock 1  wait 3
850 //  unlock 2  wait 0
851 //  unlock 3  wait 1
852 // while the worker goes
853 //  [initially lock 2 and 3]
854 //  wait 0    unlock 2
855 //  wait 1    unlock 3
856 //  wait 2    unlock 0
857 //  wait 3    unlock 1
858 // Observe that I can use (n^2) to map between the mutex number in the first
859 // and second columns here. That counting is what send_count and
860 // receive_count are doing.
861 
862 // In a nice world I would use just the C++ std::mutex scheme for
863 // synchronization, however here I am performance critical and to save
864 // a bit when performing medium-sized multiplications I will use the
865 // Microsoft version of mutexes directly on that platform.
866 
867 #if defined __CYGWIN__ || defined __MINGW32__
868 #define USE_MICROSOFT_MUTEX 1
869 
870 extern "C"
871 {   struct SecApp
872     {   std::uintptr_t nLength;
873         void *lpSecurityDescriptor;
874         int bINheritHandle;
875     };
876 
877     extern __declspec(dllimport)  void *
878     CreateMutexA(SecApp *, std::uintptr_t, const char *);
879     extern __declspec(dllimport) int CloseHandle(void *h);
880     extern __declspec(dllimport) int ReleaseMutex(void *m);
881     extern __declspec(dllimport) void *
882     WaitForSingleObject(void *, std::uintptr_t);
883 };
884 
885 #endif // __CYGWIN__ or __MINGW32__
886 
// State shared between the main thread and one worker thread. The four
// mutexes implement the baton-passing protocol described in the comment
// above; "ready" lets the main thread busy-wait until the worker has
// claimed its initial pair of mutexes at start-up.
class Worker_data
{
public:
    std::atomic<bool> ready;
#ifdef USE_MICROSOFT_MUTEX
    void *mutex[4];
#else
    std::mutex mutex[4];
#endif
// Set (by the owning DriverData destructor) to ask the worker to exit.
    bool quit_flag;
// Operands for a task: inputs a (lena digits) and b (lenb digits),
// output buffer c and workspace w.
    const std::uint64_t *a;
    std::size_t lena;
    const std::uint64_t *b;
    std::size_t lenb;
    std::uint64_t *c;
    std::uint64_t *w;

// When I construct an instance of Worker_data I set its quit_flag to
// false and lock two of the mutexes. That sets things up so that when
// it is passed to a new worker thread that thread behaves in the way I
// want it to.
    Worker_data()
    {   ready = false;
        quit_flag = false;
#ifdef USE_MICROSOFT_MUTEX
// The second argument to CreateMutexA asks for mutexes 0 and 1 to be
// created already owned (ie locked), matching the std::mutex branch.
        mutex[0] = CreateMutexA(NULL, 1, NULL);
        mutex[1] = CreateMutexA(NULL, 1, NULL);
        mutex[2] = CreateMutexA(NULL, 0, NULL);
        mutex[3] = CreateMutexA(NULL, 0, NULL);
#else
// The next two must be locked by the main thread.
        mutex[0].lock();
        mutex[1].lock();
#endif
    }
#ifdef USE_MICROSOFT_MUTEX
// Native Windows mutex handles must be closed explicitly; std::mutex
// cleans up after itself so no destructor is needed in that case.
    ~Worker_data()
    {   CloseHandle(mutex[0]);
        CloseHandle(mutex[1]);
        CloseHandle(mutex[2]);
        CloseHandle(mutex[3]);
    }
#endif
};
931 
932 inline void worker_thread(Worker_data *w);
933 
// Then each main thread will have a structure that encapsulates the
// two worker threads that it ends up with and the data it sets up for
// them and that they then access. When this structure is created it will
937 // cause the worker threads and the data block they need to be constructed.
938 
939 class DriverData
940 {
941 public:
942     int         send_count = 0;
943     Worker_data wd_0,
944                 wd_1;
945 // When an instance of DriverData is created the two sets of Worker_data
946 // get constructed with two of their mutexes locked. This will mean that when
947 // worker threads are created and start running they will politely wait for
948 // work.
949 
950     std::thread w_0, w_1;
DriverData()951     DriverData()
952     {   w_0 = std::thread(worker_thread, &wd_0),
953         w_1 = std::thread(worker_thread, &wd_1);
954 // I busy-wait until the two threads have both claimed the mutexes that they
955 // need to own at the start! Without this the main thread may post a
956 // multiplication, so its part of the work and try to check that the worker
957 // has finished (by claiming one of these mutexes) before the worker thread
958 // has got started up and has claimed them. This feels clumsy, but it only
959 // happens at system-startup.
960         while (!wd_0.ready.load() && !wd_1.ready.load())
961             std::this_thread::sleep_for(std::chrono::microseconds(1));
962     }
963 
964 // When the DriverData object is to be destroyed it must arrange to
965 // stop and then join the two threads that it set up. This code that sends
966 // a "quit" message to each thread will be executed before the thread object
967 // is deleted, and the destructor of the thread object should be activated
968 // before that of the Worker_data and the mutexes within that.
969 
~DriverData()970     ~DriverData()
971     {   wd_0.quit_flag = wd_1.quit_flag = true;
972         release_workers();
973         w_0.join();
974         w_1.join();
975     }
976 
977 // Using the worker threads is then rather easy: one sets up data in
978 // the Worker_data structures and then call release_workers(). Then
979 // you can do your own thing in parallel with the two workers picking up
980 // the tasks that they had been given. When you are ready you call
981 // wait_for_workers() which does what one might imagine, and the workers
982 // are expected to have left their results in the Worker_data object so
983 // you can find it.
984 
release_workers()985     void release_workers()
986     {
987 #ifdef USE_MICROSOFT_MUTEX
988         ReleaseMutex(wd_0.mutex[send_count]);
989         ReleaseMutex(wd_1.mutex[send_count]);
990 #else
991         wd_0.mutex[send_count].unlock();
992         wd_1.mutex[send_count].unlock();
993 #endif
994     }
995 
wait_for_workers()996     void wait_for_workers()
997     {
998 #ifdef USE_MICROSOFT_MUTEX
999         WaitForSingleObject(wd_0.mutex[send_count^2], 0xffffffff);
1000         WaitForSingleObject(wd_1.mutex[send_count^2], 0xffffffff);
1001 #else
1002         wd_0.mutex[send_count^2].lock();
1003         wd_1.mutex[send_count^2].lock();
1004 #endif
1005         send_count = (send_count+1)&3;
1006     }
1007 
1008 };
1009 
1010 // I encapsulate the driver data in this function, which will ensure that
1011 // exactly one copy gets made for each top-level thread that calls this,
1012 // ie that uses a huge multiplication.
1013 
getDriverData()1014 inline DriverData *getDriverData()
1015 {   static thread_local DriverData dd;
1016     return &dd;
1017 }
1018 
1019 // Declare a number of functions that might usefully be used elsewhere. If
1020 // I declare them "inline" then it will be OK even if I include this header
1021 // from multiple source files because only one copy should end up in the
1022 // eventually-linked executable.
1023 
1024 inline std::uint64_t *reserve(std::size_t n);
1025 inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
1026                                   std::size_t final);
1027 inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
1028                                     std::size_t final);
1029 inline void abandon(std::uint64_t *p);
1030 inline void abandon(std::intptr_t h);
1031 
1032 #if defined LISP && !defined ZAPPA
1033 typedef std::intptr_t string_handle;
1034 #else
1035 typedef char *string_handle;
1036 #endif
1037 
1038 inline string_handle confirm_size_string(char *p, std::size_t n,
1039         std::size_t final);
1040 inline void abandon_string(string_handle);
1041 
1042 inline std::intptr_t vector_to_handle(std::uint64_t *p);
1043 inline std::uint64_t *vector_of_handle(std::intptr_t n);
1044 inline std::size_t number_size(std::uint64_t *p);
1045 
1046 inline bool fits_into_fixnum(std::int64_t n);
1047 inline std::intptr_t int_to_handle(std::int64_t n);
1048 constexpr inline std::int64_t int_of_handle(std::intptr_t n);
1049 
1050 inline std::intptr_t string_to_bignum(const char *s);
1051 inline std::intptr_t int_to_bignum(std::int64_t n);
1052 inline std::intptr_t unsigned_int_to_bignum(std::uint64_t n);
1053 inline std::intptr_t round_double_to_int(double d);
1054 inline std::intptr_t trunc_double_to_int(double d);
1055 inline std::intptr_t floor_double_to_int(double d);
1056 inline std::intptr_t ceiling_double_to_int(double d);
1057 #ifdef softfloat_h
1058 inline std::intptr_t round_float128_to_int(float128_t d);
1059 inline std::intptr_t trunc_float128_to_int(float128_t d);
1060 inline std::intptr_t floor_float128_to_int(float128_t d);
1061 inline std::intptr_t ceiling_float128_to_int(float128_t d);
1062 #endif // softfloat_h
1063 inline std::intptr_t uniform_positive(std::size_t n);
1064 inline std::intptr_t uniform_signed(std::size_t n);
1065 inline std::intptr_t uniform_upto(std::intptr_t a);
1066 inline std::intptr_t random_upto_bits(std::size_t bits);
1067 inline std::intptr_t fudge_distribution(std::intptr_t, int);
1068 
1069 #ifdef PRECISE_GC
1070 inline void push(std::intptr_t w);
1071 inline void pop(std::intptr_t& w);
1072 
1073 // In many cases where I want to save things I will have a reference to
1074 // an array of uint64_t objects, so when I push it I must convert it back to
1075 // a Lisp-friendly form.
// Push a digit-vector pointer for the precise garbage collector by
// first converting it to a Lisp-friendly handle.
inline void push(std::uint64_t *p)
{   push(vector_to_handle(p));
}
// Pop a previously pushed value and convert the handle back into a
// pointer to the digit vector.
inline void pop(std::uint64_t *&p)
{   std::intptr_t w;
    pop(w);
    p = vector_of_handle(w);
}
1084 #else
1085 // In cases where these are not required I will just defined them as
1086 // empty procedures and hope that the C++ compiler will inline them and
1087 // hence lead to them not adding any overhead at all.
// Without PRECISE_GC these are deliberate no-ops, expected to be
// inlined away so they add no overhead at all.
inline void push(std::intptr_t p)
{}
inline void pop(std::intptr_t& p)
{}

inline void push(std::uint64_t *p)
{}
inline void pop(std::uint64_t *&p)
{}

inline void push(const std::uint64_t *p)
{}
inline void pop(const std::uint64_t *&p)
{}
1102 
1103 #endif
1104 
1105 #if defined MALLOC
1106 
1107 //=========================================================================
1108 //=========================================================================
1109 // The MALLOC option is perhaps the simplest! It uses C style
1110 // malloc/realloc and free functions and the vector if turned into a
1111 // handle by just casting it to an intptr_t value.
1112 //=========================================================================
1113 //=========================================================================
1114 
1115 
1116 // The following are provided so that a user can update malloc_function,
1117 // realloc_function and free_function to refer to their own choice of
1118 // allocation technology. Well it is a bit uglier than that! If you compile
1119 // using C++17 then the function pointers here are per_thread across every
1120 // compilation unit and things behave "as you might expect". If you have an
1121 // earlier C++ standard in play than each compilation unit gets its own static
1122 // set of variables that can be used to divert memory allocation, and hence
1123 // each compilation unit that includes this header is liable to need to reset
1124 // them all.
1125 // I view the ability to replace the allocation functions as somewhat
1126 // specialist and to be used by people who will have the skill to modify this
1127 // code as necessary, so this slight oddity does not worry me.
1128 
1129 typedef void *malloc_t(size_t n);
1130 typedef void *realloc_t(void *, std::size_t);
1131 typedef void free_t(void *);
1132 
1133 INLINE_VAR malloc_t  *malloc_function = std::malloc;
1134 INLINE_VAR realloc_t *realloc_function = std::realloc;
1135 INLINE_VAR free_t    *free_function   = std::free;
1136 
reserve(std::size_t n)1137 inline std::uint64_t *reserve(std::size_t n)
1138 {   arithlib_assert(n>0);
1139     std::uint64_t *r = reinterpret_cast<std::uint64_t *>
1140                        (*malloc_function)((
1141                                n+1)*sizeof(std::uint64_t));
1142     arithlib_assert(r != NULL);
1143     return &r[1];
1144 }
1145 
confirm_size(std::uint64_t * p,std::size_t n,std::size_t final)1146 inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
1147                                   std::size_t final)
1148 {   artithlib_assert(final>0 && n>=final);
1149     p = reinterpret_cast<std::uint64_t *>()
1150         (*realloc_function)((void *)&p[-1],
1151                             (final_n+1)*sizeof(std::uint64_t));
1152     arithlib_assert(p != NULL);
1153     p[0] = final_n;
1154     return vector_to_handle(&p[1]);
1155 }
1156 
1157 // In this model confirm_size_x() is just the same as confirm_size().
1158 
confirm_size_x(std::uint64_t * p,std::size_t n,std::size_t final)1159 inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
1160                                     std::size_t final)
1161 {   arithlib_assert(final>0 && n>=final);
1162     confirm_size(p, n, final);
1163 }
1164 
abandon(std::uint64_t * p)1165 inline void abandon(std::uint64_t *p)
1166 {   (*free_function)((void *)&p[-1]);
1167 }
1168 
1169 // Note that I allocate space for the string data plus a NUL terminating byte.
1170 
reserve_string(std::size_t n)1171 inline char *reserve_string(std::size_t n)
1172 {   char *r = reinterpret_cast<char *>(*malloc_function)(n+1);
1173     arithlib_assert(r != NULL);
1174     return r;
1175 }
1176 
1177 // When I confirm the size of a string all I do is to write a NUL byte
1178 // to terminate it. I expect the code to have reserved an amount of memory
1179 // that is not much longer than the amount that I will need, so using
1180 // realloc to shrink things to the exact size that is filled would not
1181 // be a good bargain.
1182 
confirm_size_string(char * p,std::size_t n,std::size_t final)1183 inline char *confirm_size_string(char *p, std::size_t n,
1184                                  std::size_t final)
1185 {   arithlib_assert(final>0 && (n+9)>final);
1186     r[final] = 0;
1187     return r;
1188 }
1189 
abandon_string(char * s)1190 inline void abandon_string(char *s)
1191 {   (*free_function)(s);
1192 }
1193 
1194 // In the C/malloc model I will represent a number by the intptr_t style
1195 // integer that is obtained from a pointer to the first digit of the bignum.
1196 
// In the C/malloc model a handle is simply the bit-pattern of the
// pointer to the first digit of the bignum.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{   std::intptr_t h = reinterpret_cast<std::intptr_t>(p);
    return h;
}
1200 
// Inverse of vector_to_handle(): recover the digit-vector pointer from
// a handle.
inline std::uint64_t *vector_of_handle(std::intptr_t n)
{   std::uint64_t *p = reinterpret_cast<std::uint64_t *>(n);
    return p;
}
1204 
number_size(std::uint64_t * p)1205 inline std::size_t number_size(std::uint64_t *p)
1206 {   arithlib_assert(p[-1]!=0);
1207     return p[-1];
1208 }
1209 
1210 // When I use Bignums that are allocated using malloc() and operated on
1211 // via C++ overloaded operators I often need to copy the data. However when
1212 // memory management uses garbage collection I can allow multiple references
1213 // to the same object and so copying is not needed as much. I have two
1214 // copying functions. One only copies if the system is using MALLOC or
1215 // NEW (but leaves data sharable on systems with garbage collection) while
1216 // the other unconditionally copies.
1217 
always_copy_bignum(std::uint64_t * p)1218 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
1219 {   std::size_t n = p[-1];
1220     push(p);
1221     std::uint64_t *r = reserve(n);
1222     pop(p);
1223     std::memcpy(r, p, n*sizeof(std::uint64_t));
1224     return confirm_size(r, n, n);
1225 }
1226 
copy_if_no_garbage_collector(std::uint64_t * p)1227 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
1228 {   std::size_t n = p[-1];
1229     push(p);
1230     std::uint64_t *r = reserve(n);
1231     pop(p);
1232     std::memcpy(r, p, n*sizeof(std::uint64_t));
1233     return confirm_size(r, n, n);
1234 }
1235 
copy_if_no_garbage_collector(std::intptr_t pp)1236 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
1237 {   if (stored_as_fixnum(pp)) return pp;
1238     std::uint64_t *p = vector_of_handle(pp);
1239     std::size_t n = number_size(p);
1240     push(p);
1241     std::uint64_t *r = reserve(n);
1242     pop(p);
1243     std::memcpy(r, p, n*sizeof(std::uint64_t));
1244     return confirm_size(r, n, n);
1245 }
1246 
1247 #elif defined NEW
1248 
1249 //=========================================================================
1250 //=========================================================================
1251 // The NEW code is intended to be a reasonably sensible implementation for
1252 // use of this code free-standing within C++. Memory is allocated in units
1253 // whose size is a power of 2, and I keep track of memory that I have used
1254 // and discarded so that I do not need to go back to the system-provided
1255 // allocator too often.
1256 //=========================================================================
1257 //=========================================================================
1258 
1259 
1260 inline unsigned int log_next_power_of_2(std::size_t n);
1261 
1262 // There is a serious issue here as regards thread safety. And a subsidiary
1263 // one about C++17.
1264 //
1265 // As things stand if you use C++ memory allocation the local allocation
1266 // and reallocation here is not thread safe. The result could be a disaster
1267 // if multiple threads used big-numbers via the C++ type Bignum. Note that
1268 // if you do not have "NEW" defined but instead use MALLOC or LISP you are
1269 // safe.
1270 
1271 // CONCURRENCY SUPPORT:
1272 // I provide three options, and the selection as to which is used
1273 // can be made by predefining a symbol.
1274 //
1275 // ARITHLIB_MUTEX. Use a mutex to protect memory allocation. One hope
1276 //                 is that in situations where there is low contention the
1277 //                 overhead will be small, and so this is the default.
1278 // ARITHLIB_THREAD_LOCAL. Have a separate memory pool for use by each thread.
1279 //                 This can use more memory than a scheme that uses shared
1280 //                 allocation.
1281 // ARITHLIB_NO_THREADS. Assume that no concurrent use of arithlib will
1282 //                 arise and so no extra complication or overhead is needed.
1283 //                 I do not make this the default because I can imagine
1284 //                 people extending a program to use threads and then not
1285 //                 looking here!
1286 //
1287 // I looked into having an ARITHLIB_LOCK_FREE to use compare-and-swap
1288 // operations to maintain the freestore pool, but support via gcc on x86_64
1289 // is uncertain (in part because not all variants on x86_64 CPUs implement
1290 // the CMPXCHG16B instruction (though soon the ones that do not will count as
1291 // museum pieces), and also because reports on the performance impact
// do not seem quite clear-cut. I also see various unspecific comments about
1293 // patents, but I have not followed them up.
1294 // I could fairly happily imagine replacing the discard() code with a lock
1295 // free "push" operation, which is simple to implement and does not suffer
1296 // from the "ABA" issue. allocate() seems to be harder to implement and
1297 // especially harder to implement in a way that will give reliably high
// performance across all platforms while avoiding machine dependent
1299 // components and especially in-line assembly code.
1300 
// If no option has been pre-defined I will default to ARITHLIB_MUTEX
1302 #if !defined ARITHLIB_MUTEX &&        \
1303     !defined ARITHLIB_THREAD_LOCAL && \
1304     !defined ARITHLIB_NO_THREADS
1305 #define ARITHLIB_MUTEX 1
1306 #endif
1307 
// Attempts to select more than one option at once get detected and moaned
1309 // about.
1310 
1311 #if (defined ARITHLIB_MUTEX && defined ARITHLIB_THREAD_LOCAL) ||      \
1312     (defined ARITHLIB_THREAD_LOCAL && defined ARITHLIB_NO_THREADS) || \
1313     (defined ARITHLIB_NO_THREADS && defined ARITHLIB_MUTEX)
1314 #error Only one thread-support policy can be selected.
1315 #endif
1316 
1317 #ifdef ARITHLIB_MUTEX
// Accessor for the single mutex that guards the shared freechain table.
// A function-local static gives safe once-only construction even from
// multiple threads.
inline std::mutex& freechain_mutex()
{   static std::mutex the_mutex;
    return the_mutex;
}
1322 #endif
1323 
// Entry i of a freechain table heads a chain of discarded blocks each of
// 2^i words; the second word of a chained block links to the next one.
typedef std::uint64_t *freechain_table_type[64];

#ifdef ARITHLIB_THREAD_LOCAL
// One table per thread: no locking needed, at the cost of extra memory.
inline thread_local freechain_table_type freechain_table;
#else // ARITHLIB_THREAD_LOCAL
#ifdef __cpp_inline_variables
// With C++17 inline variables a single shared table can live directly in
// this header without ODR problems.
class freechain_table
{   static inline freechain_table_type freechain_table_Var;
public:
    static freechain_table_type& get()
    {   return freechain_table_Var;
    }
};
#else // __cpp_inline_variables
// Pre-C++17: a function-local static ("Meyers singleton") provides the
// single shared table.
class freechain_table
{
public:
    static freechain_table_type& get()
    {   static freechain_table_type freechain_table_Var;
        return freechain_table_Var;
    }
};
#endif // __cpp_inline_variables
#endif // ARITHLIB_THREAD_LOCAL
1348 
1349 class Freechains
1350 {
1351 private:
1352 // The following obscure line ensures that when I make an instance
1353 // of Freechains it forces the standard streams into existence. Having
1354 // that initilization happening early helps a LITTLE bit to reassure me
1355 // that the standard streams should still be alive during the destructor
1356 // for this class.
1357     std::ios_base::Init force_iostreams_initialization;
1358 
1359 public:
Freechains()1360     Freechains()
1361     {   for (int i=0; i<64; i++) (freechain_table::get())[i] = NULL;
1362     }
1363 
~Freechains()1364     ~Freechains()
1365     {   for (std::size_t i=0; i<64; i++)
1366         {   std::uint64_t *f = (freechain_table::get())[i];
1367             if (debug_arith)
1368 // Report how many entries there are in the freechain.
1369             {   std::size_t n = 0;
1370 // To arrange that double-free mistakes are detected I arrange to put -1
1371 // in the initial word of any deleted block, so that all blocks on the
1372 // freechains here should show that. I set and test for that in the other
1373 // bits of code that allocate or release memory.
1374                 for (std::uint64_t *b=f; b!=NULL; b = (std::uint64_t *)b[1])
1375                 {   arithlib_assert(b[0] == -static_cast<std::uint64_t>(1));
1376                     n++;
1377                 }
1378                 if (n != 0)
1379                     std::cout << "Freechain " << i << " length: "
1380                           << n << " " << ((static_cast<std::size_t>(1))<<i)
1381                           << std::endl;
1382             }
1383             while (f != NULL)
1384             {   std::uint64_t w = f[1];
1385                 delete [] f;
1386                 f = reinterpret_cast<std::uint64_t *>(w);
1387             }
1388             (freechain_table::get())[i] = NULL;
1389         }
1390     }
1391 
allocate(std::size_t n)1392     static std::uint64_t *allocate(std::size_t n)
1393     {
1394 #ifdef DEBUG_OVERRUN
1395 // If I am debugging I can allocate an extra word that will lie
1396 // just beyond what would have been the end of my block, and I will
1397 // fill everything with a pattern that might let me spot some cases where
1398 // I write beyond the proper size of data. This option is only supported
1399 // when storage allocation is using NEW.
1400         int bits = log_next_power_of_2(n+1);
1401 #else
1402 // Finding the number of bits in n is achieved by counting the leading
1403 // zeros in the representation of n, and on many platforms an intrinsic
1404 // function will compile this into a single machine code instruction.
1405         int bits = log_next_power_of_2(n);
1406 #endif
1407         arithlib_assert(n<=((static_cast<std::size_t>(1))<<bits) && n>0);
1408         std::uint64_t *r;
1409 #if defined ARITHLIB_THREAD_LOCAL || ARITHLIB_NO_THREADS
1410         r = (freechain_table::get())[bits];
1411         if (r != NULL)
1412             (freechain_table::get())[bits] = reinterpret_cast<std::uint64_t *>
1413                                              (r)[1];
1414 #elif defined ARITHLIB_MUTEX
1415         {   std::lock_guard<std::mutex> lock(freechain_mutex());
1416             r = (freechain_table::get())[bits];
1417             if (r != NULL)
1418                 (freechain_table::get())[bits] =
1419                     reinterpret_cast<std::uint64_t *>(r[1]);
1420         }
1421 #else
1422 #error Internal inconsistency in arithlib.hpp: memory allocation strategy.
1423 #endif
1424 // If no memory had been found on the freechain I need to allocate some
1425 // more.
1426         if (r == NULL)
1427         {   r = new std::uint64_t[(static_cast<std::size_t>(1))<<bits];
1428             if (r == NULL) arithlib_abort("Run out of memory");
1429         }
1430 #ifdef DEBUG_OVERRUN
1431 // When debugging I tend to initialize all memory to a known pattern
1432 // when it is allocated and check that the final word has not been
1433 // overwritten when I release it. This is not foolproof but it helps me
1434 // by being able to detect many possible cases of overrun.
1435         if (debug_arith)
1436         {   std::memset(r, 0xaa,
1437                 (static_cast<std::size_t>(1)<<bits)*sizeof(std::uint64_t));
1438             r[0] = 0;
1439         }
1440 #endif
1441 // Just the first 32-bits of the header word record the block capacity.
1442 // The casts here look (and indeed are) ugly, but when I store data into
1443 // memory as a 32-bit value that is how I will read it later on, and the
1444 // messy notation here does not correspond to complicated computation.
1445         reinterpret_cast<std::uint32_t *>(r)[0] = bits;
1446         return r;
1447     }
1448 
1449 // When I abandon a memory block I will push it onto a relevant free chain.
1450 
abandon(std::uint64_t * p)1451     static void abandon(std::uint64_t *p)
1452     {   arithlib_assert(p[0] != -static_cast<std::uint64_t>(1));
1453         int bits = reinterpret_cast<std::uint32_t *>(p)[0];
1454         arithlib_assert(bits>0 && bits<48);
1455 // Here I assume that sizeof(uint64_t) >= sizeof(intptr_t) so I am not
1456 // going to lose information here on any platform I can consider.
1457         if (debug_arith) p[0] = -static_cast<std::uint64_t>(1);
1458 #ifdef ARITHLIB_ATOMIC
1459         lockfree_push(p, freechain_table::get(), bits);
1460 #else
1461 #ifdef ARITHLIB_MUTEX
1462         std::lock_guard<std::mutex> lock(freechain_mutex());
1463 #endif
1464         p[1] = reinterpret_cast<std::uint64_t>(freechain_table::get()[bits]);
1465         (freechain_table::get())[bits] = p;
1466 #endif
1467     }
1468 
1469 };
1470 
#ifdef ARITHLIB_THREAD_LOCAL
// One Freechains pool per thread; each thread's destructor returns that
// thread's blocks to the system when the thread ends.
inline thread_local Freechains freechain;
#else // ARITHLIB_THREAD_LOCAL
#ifdef __cpp_inline_variables
// C++17: a single shared pool held as an inline static data member.
class freechains
{   static inline Freechains freechains_Var;
public:
    static Freechains& get()
    {   return freechains_Var;
    }
};
#else // __cpp_inline_variables
// Pre-C++17: Meyers-singleton access to the single shared pool.
class freechains
{
public:
    static Freechains& get()
    {   static Freechains freechains_Var;
        return freechains_Var;
    }
};
#endif // __cpp_inline_variables
#endif // ARITHLIB_THREAD_LOCAL
1493 
reserve(std::size_t n)1494 inline std::uint64_t *reserve(std::size_t n)
1495 {   arithlib_assert(n>0);
1496     return &(freechains::get().allocate(n+1))[1];
1497 }
1498 
// Fix the used length of a vector reserved with room for n digits at
// final digits and return the resulting handle. A 1-digit value that
// fits becomes an immediate fixnum (and the vector is recycled); a block
// much larger than needed is replaced by a tighter power-of-2 block.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{   arithlib_assert(final>0 && n>=final);
// Verify that the word just beyond where anything should have been
// stored has not been clobbered.
#ifdef DEBUG_OVERRUN
    if (debug_arith) arithlib_assert(p[n] == 0xaaaaaaaaaaaaaaaaU);
#endif
    if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
    {   std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
        freechains::get().abandon(&p[-1]);
        return r;
    }
// I compare the final size with the capacity and if it is a LOT smaller
// I allocate a new smaller block and copy the data across.
// That situation can most plausibly arise when two similar-values big
// numbers are subtracted.
    int bits = reinterpret_cast<std::uint32_t *>(&p[-1])[0];
    std::size_t capacity = (static_cast<std::size_t>(1))<<bits;
    if (capacity > 4*final)
    {   std::uint64_t *w =
            freechains::get().allocate(
                (static_cast<std::size_t>(1))<<log_next_power_of_2(final+1));
        std::memcpy(&w[1], p, final*sizeof(std::uint64_t));
        freechains::get().abandon(&p[-1]);
        p = &w[1];
    }
// The used length lives in the second 32-bit half of the header word;
// the first half holds the log2 block capacity.
    reinterpret_cast<std::uint32_t *>(&p[-1])[1] = final;
    return vector_to_handle(p);
}
1529 
confirm_size_x(std::uint64_t * p,std::size_t n,std::size_t final)1530 inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
1531                                     std::size_t final)
1532 {   arithlib_assert(final>0 && n>=final);
1533     return confirm_size(p, n, final);
1534 }
1535 
number_size(std::uint64_t * p)1536 inline std::size_t number_size(std::uint64_t *p)
1537 {   std::size_t r = reinterpret_cast<std::uint32_t *>(&p[-1])[1];
1538     arithlib_assert(r>0);
1539     return reinterpret_cast<std::uint32_t *>(&p[-1])[1];
1540 }
1541 
// In the NEW world a bignum handle is just the pointer to the first
// digit reinterpreted as an integer (its bottom bit is clear because the
// data is 8-byte aligned).
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{   std::intptr_t h = reinterpret_cast<std::intptr_t>(p);
    return h;
}
1545 
// Inverse of vector_to_handle(): turn a handle back into a digit pointer.
inline std::uint64_t *vector_of_handle(std::intptr_t n)
{   std::uint64_t *p = reinterpret_cast<std::uint64_t *>(n);
    return p;
}
1549 
// A handle denotes an immediate fixnum when its bottom (tag) bit is set;
// pointers to vectors are 8-byte aligned and so have it clear.
inline bool stored_as_fixnum(std::intptr_t a)
{   return (a & 1) == 1;
}
1553 
// Recover the signed integer encoded in a fixnum handle: mask away the
// tag bit and halve. Dividing an even value by 2 preserves the sign, so
// negative fixnums decode correctly.
constexpr inline std::int64_t int_of_handle(std::intptr_t a)
{   return (static_cast<std::int64_t>(a) &
            ~static_cast<std::int64_t>(1)) / 2;
}
1558 
1559 // This function should only ever be called in situations where the
1560 // arithmetic indicated will not overflow.
1561 
// Encode a small integer as a fixnum handle: doubled, with the bottom
// tag bit set. Callers guarantee 2*n+1 does not overflow.
inline std::intptr_t int_to_handle(std::int64_t n)
{   std::int64_t tagged = 2*n + 1;
    return static_cast<std::intptr_t>(tagged);
}
1565 
1566 // The following two lines are slighly delicate in that INTPTR_MIN and _MAX
1567 // may not have the low tag bits to be proper fixnums. But if I implement
1568 // int_of_handle so that it ignores tag bits that will be OK.
1569 
// The extreme intptr_t values, decoded with the tag bits ignored, give
// the smallest and largest integers representable as fixnums.
INLINE_VAR const std::int64_t MIN_FIXNUM = int_of_handle(INTPTR_MIN);
INLINE_VAR const std::int64_t MAX_FIXNUM = int_of_handle(INTPTR_MAX);
1572 
fits_into_fixnum(std::int64_t a)1573 inline bool fits_into_fixnum(std::int64_t a)
1574 {   return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
1575 }
1576 
abandon(std::uint64_t * p)1577 inline void abandon(std::uint64_t *p)
1578 {   freechains::get().abandon(&p[-1]);
1579 }
1580 
abandon(std::intptr_t p)1581 inline void abandon(std::intptr_t p)
1582 {   if (!stored_as_fixnum(p) && p!=0)
1583     {   std::uint64_t *pp = vector_of_handle(p);
1584         freechains::get().abandon(&pp[-1]);
1585     }
1586 }
1587 
// Allocate a buffer for an n character string plus its NUL terminator.
inline char *reserve_string(std::size_t n)
{   char *s = new char[n + 1];
    return s;
}
1591 
confirm_size_string(char * p,std::size_t n,std::size_t final)1592 inline char *confirm_size_string(char *p, std::size_t n,
1593                                  std::size_t final)
1594 {   arithlib_assert(final>0 && (n+9)>final);
1595     p[final] = 0;
1596     return p;
1597 }
1598 
// Release a string buffer from reserve_string(). In the NEW world
// strings are plain new[]-allocated char arrays, so delete[] suffices.
inline void abandon_string(char *s)
{   delete [] s;
}
1602 
1603 // In the NEW case I will want to make all operations cope with both
1604 // shorter integers (up to 63 bits) stored as the "handle", or genuine
1605 // big numbers where the handle yields a pointer to a vector of 64-bit
1606 // words. I do this by having a dispatch scheme that can activate code
1607 // for each of the mixtures of representations.
1608 //
1609 
1610 // Dispatch as between mixed bignum and fixnum representations using some
1611 // template stuff and classes.
1612 
1613 template <class OP,class RES>
op_dispatch1(std::intptr_t a1)1614 inline RES op_dispatch1(std::intptr_t a1)
1615 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
1616     else return OP::op(vector_of_handle(a1));
1617 }
1618 
1619 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::int64_t n)1620 inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
1621 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1622     else return OP::op(vector_of_handle(a1), n);
1623 }
1624 
1625 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::uint64_t * n)1626 inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
1627 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1628     else return OP::op(vector_of_handle(a1), n);
1629 }
1630 
1631 template <class OP,class RES>
op_dispatch2(std::intptr_t a1,std::intptr_t a2)1632 inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
1633 {   if (stored_as_fixnum(a1))
1634     {   if (stored_as_fixnum(a2))
1635             return OP::op(int_of_handle(a1), int_of_handle(a2));
1636         else return OP::op(int_of_handle(a1), vector_of_handle(a2));
1637     }
1638     else
1639     {   if (stored_as_fixnum(a2))
1640             return OP::op(vector_of_handle(a1), int_of_handle(a2));
1641         else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
1642     }
1643 }
1644 
always_copy_bignum(std::uint64_t * p)1645 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
1646 {   std::size_t n = reinterpret_cast<std::uint32_t *>(&p[-1])[1];
1647     push(p);
1648     std::uint64_t *r = reserve(n);
1649     pop(p);
1650     std::memcpy(r, p, n*sizeof(std::uint64_t));
1651     return confirm_size(r, n, n);
1652 }
1653 
copy_if_no_garbage_collector(std::uint64_t * p)1654 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
1655 {   std::size_t n = number_size(p);
1656     push(p);
1657     std::uint64_t *r = reserve(n);
1658     pop(p);
1659     std::memcpy(r, p, n*sizeof(std::uint64_t));
1660     return confirm_size(r, n, n);
1661 }
1662 
copy_if_no_garbage_collector(std::intptr_t pp)1663 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
1664 {   if (stored_as_fixnum(pp)) return pp;
1665     std::uint64_t *p = vector_of_handle(pp);
1666     std::size_t n = number_size(p);
1667     push(p);
1668     std::uint64_t *r = reserve(n);
1669     pop(p);
1670     std::memcpy(r, p, n*sizeof(std::uint64_t));
1671     return confirm_size(r, n, n);
1672 }
1673 
1674 #elif defined LISP
1675 
1676 //=========================================================================
1677 //=========================================================================
1678 // The LISP code is for incorporation in VSL or CSL or Zappa
1679 //=========================================================================
1680 //=========================================================================
1681 
1682 // The code in this region needs to be adapted to work with whatever
1683 // language implementation you are going to use the arithmetic library from.
1684 // In my case this will be one of two fairly closely related Lisp systems,
1685 // where VSL is a "small" one and CSL is larger (and more complicated).
1686 // The code here is left visible and available since it may provide some
1687 // sort of model for anybody wanting to use this code in their own
1688 // project.
1689 
1690 #if defined CSL
1691 
1692 // The code here can only make sense in the context of the CSL sources,
1693 // and it is assumed that all the relevant CSL header files have already
1694 // been #included.
1695 
// Allocate room for a bignum of n 64-bit digits inside the CSL heap and
// return a pointer to the first digit (8 bytes beyond the object header).
inline std::uint64_t *reserve(std::size_t n)
{   arithlib_assert(n>0);
// During a transition period I will use TYPE_NEW_BIGNUM rather than
// TYPE_BIGNUM.
    LispObject a = get_basic_vector(TAG_NUMBERS, TYPE_NEW_BIGNUM,
                                    n*sizeof(std::uint64_t)+8);
    return reinterpret_cast<std::uint64_t *>(a + 8 - TAG_NUMBERS);
}
1704 
// Record the final digit count of a freshly built bignum in its CSL
// header and return its handle; a 1-digit value that fits is returned as
// an immediate fixnum instead.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{   arithlib_assert(final>0 && n>=final);
    if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
    {   std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
        return r;
    }
// Note that pack_hdrlength() takes its argument measured in units of
// 32-bit words. That is because the way the length field is packed into
// an object header supported just that resolution (and special treatment is
// given to halfword, byte and bit-vectors to allow for their finer grain).
// The length also includes the size of a header-word, and on 32-bit platforms
// it has to allow for padding the data to allow the array of 64-bit digits
// to be properly aligned in memory.
    ((LispObject *)&p[-1])[0] =
        TAG_HDR_IMMED + TYPE_NEW_BIGNUM +
        pack_hdrlength((final+1)*sizeof(std::uint64_t)/sizeof(std::int32_t));
// If I am on a 32-bit system the data for a bignum is 8 bit aligned and
// that leaves a 4-byte gap after the header. In such a case I will write
// in a zero just to keep memory tidy.
    if (sizeof(LispObject) == 4)
        ((LispObject *)&p[-1])[1] = 0;
// Here I should reset fringe down by (final-n) perhaps. Think about that
// later!
    return vector_to_handle(p);
}
1731 
confirm_size_x(std::uint64_t * p,std::size_t n,std::size_t final)1732 inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
1733                                     std::size_t final)
1734 {   arithlib_assert(final>0 && n>=final);
1735 // Here I might need to write a nice dummy object into the gap left by
1736 // shrinking the object.
1737     return confirm_size(p, n, final);
1738 }
1739 
// The digit pointer addresses memory 8 bytes past the object header, and
// a CSL handle is the header address with TAG_NUMBERS in its low bits.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{   return reinterpret_cast<std::intptr_t>(p) - 8 + TAG_NUMBERS;
}
1743 
// Inverse of vector_to_handle(): strip the tag and step past the header
// to the first digit.
inline std::uint64_t *vector_of_handle(std::intptr_t n)
{   return reinterpret_cast<std::uint64_t *>(n + 8 - TAG_NUMBERS);
}
1747 
// Number of 64-bit digits in a CSL bignum, decoded from the Lisp object
// header that precedes the digit array.
inline std::size_t number_size(std::uint64_t *p)
{
// The odd looking cast here is because in arithlib I am passing around
// arrays of explicitly 64-bit values, however in the underlying Lisp
// I expect to be modelling memory as made up of intptr-sized items
// that I arrange to have aligned on 8-byte boundaries. So to show some
// though about strict aliasing and the like I will access memory as
// an array of LispObject things when I access the header of an item.
    Header h = (Header)*(LispObject *)&p[-1];
    std::size_t r = length_of_header(h);
// On 32-bit systems a bignum will have a wasted 32-bit word after the
// header and before the digits, so that the digits are properly aligned
// in memory. The result will be that the bignum is laid out as follows
//      |     hdr64     | digit64 | digit64 | ... |    (64-bit world)
//      | hdr32 | gap32 | digit64 | digit64 | ... |    (32-bit world)
// The length value packed into the header is the length of the vector
// including its header.
    r = (r-8)/sizeof(std::uint64_t);
    arithlib_assert(r>0);
    return r;
}
1769 
// True when the handle encodes a small integer immediately rather than
// pointing at heap data; delegates to the CSL predicate.
inline bool stored_as_fixnum(std::intptr_t a)
{   return is_fixnum(a);
}
1773 
// Extract the signed value from a fixnum handle via CSL's decoder.
constexpr inline std::int64_t int_of_handle(std::intptr_t a)
{   return int_of_fixnum(a);
}
1777 
// Encode a small integer as a CSL fixnum handle; callers guarantee the
// value is in fixnum range.
inline std::intptr_t int_to_handle(std::int64_t n)
{   return fixnum_of_int(n);
}
1781 
// Fixnum range limits, derived from the extreme intptr_t values with
// int_of_handle ignoring the tag bits.
INLINE_VAR const std::int64_t MIN_FIXNUM = int_of_handle(INTPTR_MIN);
INLINE_VAR const std::int64_t MAX_FIXNUM = int_of_handle(INTPTR_MAX);
1784 
// True when a can be represented as an immediate fixnum handle.
inline bool fits_into_fixnum(std::int64_t a)
{   return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
}
1788 
// Discarding a bignum is a no-op here: CSL memory is garbage collected.
inline void abandon(std::uint64_t *p)
{   // No need to do anything! But MIGHT reset fringe pointer?
}
1792 
abandon(std::intptr_t p)1793 inline void abandon(std::intptr_t p)
1794 {   if (!stored_as_fixnum(p) && p!=0)
1795     {   std::uint64_t *pp = vector_of_handle(p);
1796         abandon(pp);
1797     }
1798 }
1799 
// Allocate a CSL string object with room for n characters and return a
// pointer to its character data (just past the header cell).
inline char *reserve_string(std::size_t n)
{   LispObject a = get_basic_vector(TAG_VECTOR, TYPE_STRING_4,
                                    CELL+n);
    return reinterpret_cast<char *>(a - TAG_VECTOR + sizeof(LispObject));
}
1805 
// Rewrite the string header so its recorded length is final bytes and
// return the tagged Lisp string. The reserved size n is unused here.
inline LispObject confirm_size_string(char *p, std::size_t n,
                                      std::size_t final)
{   LispObject *a = (LispObject *)(p - sizeof(LispObject));
// Pack the byte length into the header; Tw is a CSL tag-width constant.
    *a = TAG_HDR_IMMED + TYPE_STRING_4 + (final<<(Tw+5));
    return (LispObject)a + TAG_VECTOR;
}
1812 
// Strings are garbage collected in CSL, so discarding one needs no work.
inline void abandon_string(string_handle s)
{   // Do nothing.
}
1816 
1817 template <class OP,class RES>
op_dispatch1(std::intptr_t a1)1818 inline RES op_dispatch1(std::intptr_t a1)
1819 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
1820     else return OP::op(vector_of_handle(a1));
1821 }
1822 
1823 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::int64_t n)1824 inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
1825 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1826     else return OP::op(vector_of_handle(a1), n);
1827 }
1828 
1829 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::uint64_t * n)1830 inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
1831 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1832     else return OP::op(vector_of_handle(a1), n);
1833 }
1834 
1835 template <class OP,class RES>
op_dispatch2(std::intptr_t a1,std::intptr_t a2)1836 inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
1837 {   if (stored_as_fixnum(a1))
1838     {   if (stored_as_fixnum(a2))
1839             return OP::op(int_of_handle(a1), int_of_handle(a2));
1840         else return OP::op(int_of_handle(a1), vector_of_handle(a2));
1841     }
1842     else
1843     {   if (stored_as_fixnum(a2))
1844             return OP::op(vector_of_handle(a1), int_of_handle(a2));
1845         else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
1846     }
1847 }
1848 
always_copy_bignum(std::uint64_t * p)1849 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
1850 {   std::size_t n = number_size(p);
1851     push(p);
1852     std::uint64_t *r = reserve(n);
1853     pop(p);
1854     std::memcpy(r, p, n*sizeof(std::uint64_t));
1855     return confirm_size(r, n, n);
1856 }
1857 
copy_if_no_garbage_collector(std::uint64_t * p)1858 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
1859 {   std::size_t n = number_size(p);
1860     push(p);
1861     std::uint64_t *r = reserve(n);
1862     pop(p);
1863     std::memcpy(r, p, n*sizeof(std::uint64_t));
1864     return confirm_size(r, n, n);
1865 }
1866 
copy_if_no_garbage_collector(std::intptr_t pp)1867 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
1868 {   if (stored_as_fixnum(pp)) return pp;
1869     std::uint64_t *p = vector_of_handle(pp);
1870     std::size_t n = number_size(p);
1871     push(p);
1872     std::uint64_t *r = reserve(n);
1873     pop(p);
1874     std::memcpy(r, p, n*sizeof(std::uint64_t));
1875     return confirm_size(r, n, n);
1876 }
1877 
1878 
1879 #elif defined ZAPPA // and end of CSL
1880 
1881 // The code here can only make sense in the context of the Zappa sources,
1882 // and it is assumed that all the relevant Zappa header files have already
1883 // been #included.
1884 
// Allocate room for a bignum of n digits in the Zappa heap; one extra
// word is taken for the header and the returned pointer addresses the
// first digit beyond it.
inline std::uint64_t *reserve(std::size_t n)
{   arithlib_assert(n>0);
    std::uint64_t* a = binaryAllocate(n+1);
    *a = n;      // Record length of object in first word.
    return a+1;
}
1891 
// Record the final digit count of a freshly built bignum and return its
// handle; a 1-digit value that fits becomes an immediate fixnum.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{   arithlib_assert(final>0 && n>=final);
    if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
    {   std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
        return r;
    }
// I put the length of a bignum in stored as 2 times the number of words
// (not including the header word). This leaves valid headers even, and so
// odd values can be used for forwarding addresses.
    p[-1] = 2*final;
    return vector_to_handle(p);
}
1905 
confirm_size_x(std::uint64_t * p,std::size_t n,std::size_t final)1906 inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
1907                                     std::size_t final)
1908 {   arithlib_assert(final>0 && n>=final);
1909     return confirm_size(p, n, final);
1910 }
1911 
// A Zappa handle is the header address (8 bytes before the first digit)
// with the BigInteger tag in its low bits.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{   return reinterpret_cast<std::intptr_t>(p) - 8 + BigInteger;
}
1915 
// Inverse of vector_to_handle(): strip the tag and step past the header.
inline std::uint64_t *vector_of_handle(std::intptr_t n)
{   return reinterpret_cast<std::uint64_t *>(n + 8 - BigInteger);
}
1919 
number_size(std::uint64_t * p)1920 inline std::size_t number_size(std::uint64_t *p)
1921 {   std::uint64_t h = p[-1]/2;
1922     arithlib_assert(h>0);
1923     return h;
1924 }
1925 
// A handle is an immediate small integer when its tag bits (selected by
// tagXMask) equal SmallInteger.
inline bool stored_as_fixnum(std::intptr_t a)
{   return (a & tagXMask) == SmallInteger;
}
1929 
int_of_handle(std::intptr_t a)1930 constexpr inline std::int64_t int_of_handle(std::intptr_t a)
1931 {   return static_cast<std::int64_t>(
1932                (a & ~static_cast<std::int64_t>(tagXMask)) /
1933                static_cast<std::int64_t>(valXMult));
1934 }
1935 
int_to_handle(std::int64_t n)1936 inline std::intptr_t int_to_handle(std::int64_t n)
1937 {   return static_cast<std::intptr_t>(n*valXMult + SmallInteger);
1938 }
1939 
1940 INLINE_VAR const std::int64_t MIN_FIXNUM = int_of_handle(INTPTR_MIN);
1941 INLINE_VAR const std::int64_t MAX_FIXNUM = int_of_handle(INTPTR_MAX);
1942 
fits_into_fixnum(std::int64_t a)1943 inline bool fits_into_fixnum(std::int64_t a)
1944 {   return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
1945 }
1946 
// Discard a bignum work-space. With this allocator unused memory is
// reclaimed elsewhere, so abandoning is a no-op.
inline void abandon(std::uint64_t *p)
{   // No need to do anything!
}

inline void abandon(std::intptr_t p)
{   // Also do not do anything!
}

// Allocate space for a string of up to n characters. The leading 8 bytes
// hold a header word; the returned pointer addresses the character data.
// NOTE(review): (8+n+7)/8 words yields between n and n+7 payload bytes,
// and confirm_size_string() writes a terminating zero at p[final] -
// confirm that callers keep final strictly below n when n is a multiple
// of 8.
inline char *reserve_string(std::size_t n)
{   std::uint64_t* a = binaryAllocate((8+n+7)/8);
    return reinterpret_cast<char *>(a) + 8;
}
1959 
// A string size is measured in bytes not words.
// Finalise a string built with reserve_string: NUL-terminate it and fill
// in its header word. As with bignums the header stores twice the length
// so that valid headers are always even.
inline char* confirm_size_string(char *p, std::size_t n,
                                 std::size_t final)
{   p[final] = 0;
    reinterpret_cast<std::uint64_t*>(p)[-1] = 2*final;
    return p;
}

// Discard a string; nothing to do with this allocator.
inline void abandon_string(string_handle s)
{   // Do nothing.
}
1971 
1972 template <class OP,class RES>
op_dispatch1(std::intptr_t a1)1973 inline RES op_dispatch1(std::intptr_t a1)
1974 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
1975     else return OP::op(vector_of_handle(a1));
1976 }
1977 
1978 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::int64_t n)1979 inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
1980 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1981     else return OP::op(vector_of_handle(a1), n);
1982 }
1983 
1984 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::uint64_t * n)1985 inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
1986 {   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1987     else return OP::op(vector_of_handle(a1), n);
1988 }
1989 
1990 template <class OP,class RES>
op_dispatch2(std::intptr_t a1,std::intptr_t a2)1991 inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
1992 {   if (stored_as_fixnum(a1))
1993     {   if (stored_as_fixnum(a2))
1994             return OP::op(int_of_handle(a1), int_of_handle(a2));
1995         else return OP::op(int_of_handle(a1), vector_of_handle(a2));
1996     }
1997     else
1998     {   if (stored_as_fixnum(a2))
1999             return OP::op(vector_of_handle(a1), int_of_handle(a2));
2000         else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
2001     }
2002 }
2003 
// Make a fresh copy of the bignum p. The push/pop around reserve()
// protects p in case the allocation triggers a garbage collection that
// could move it.
inline std::intptr_t always_copy_bignum(std::uint64_t *p)
{   std::size_t n = number_size(p);
    push(p);
    std::uint64_t *r = reserve(n);
    pop(p);
    std::memcpy(r, p, n*sizeof(std::uint64_t));
    return confirm_size(r, n, n);
}

// Required by generic code; with this memory model it copies
// unconditionally, exactly as always_copy_bignum does.
inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
{   std::size_t n = number_size(p);
    push(p);
    std::uint64_t *r = reserve(n);
    pop(p);
    std::memcpy(r, p, n*sizeof(std::uint64_t));
    return confirm_size(r, n, n);
}

// Handle variant: immediate fixnums carry their value in the handle and
// can be returned unchanged; bignums are copied as above.
inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
{   if (stored_as_fixnum(pp)) return pp;
    std::uint64_t *p = vector_of_handle(pp);
    std::size_t n = number_size(p);
    push(p);
    std::uint64_t *r = reserve(n);
    pop(p);
    std::memcpy(r, p, n*sizeof(std::uint64_t));
    return confirm_size(r, n, n);
}
2032 
2033 
2034 
2035 
2036 #else // ZAPPA
2037 
// Allocate space for a bignum with n digits using the Lisp allocator.
// The size passed to allocateatom() is in bytes; the returned pointer
// addresses the first digit, 8 bytes beyond the (tagged) atom header.
inline std::uint64_t *reserve(std::size_t n)
{   arithlib_assert(n>0);
// I must allow for alignment padding on 32-bit platforms.
    if (sizeof(LispObject)==4) n = n*sizeof(std::uint64_t) + 4;
    else n = n*sizeof(std::uint64_t);
    LispObject a = allocateatom(n);
    return reinterpret_cast<std::uint64_t *>(a + 8 - tagATOM);
}
2046 
// Finalise a bignum built in a buffer of n words of which only "final"
// are used. A one-word value small enough for a fixnum is returned as an
// immediate handle; otherwise the Lisp header word is written and a
// tagged handle is returned.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{   arithlib_assert(final>0 && n>=final);
    if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
    {   std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
        return r;
    }
// Write the Lisp bignum header: tag bits plus the payload length in bytes
// (including the 4 padding bytes present on 32-bit platforms).
    ((LispObject *)&p[-1])[0] =
        tagHDR + typeBIGNUM + packlength(final*sizeof(std::uint64_t) +
                                         (sizeof(LispObject)==4 ? 4 : 0));
// If I am on a 32-bit system the data for a bignum is 8 byte aligned and
// that leaves a 4-byte gap after the header. In such a case I will write
// in a zero just to keep memory tidy.
    if (sizeof(LispObject) == 4)
        ((LispObject *)&p[-1])[1] = 0;
// Here I should reset fringe down by (final-n) perhaps. Think about that
// later!
    return vector_to_handle(p);
}

// Variant used when the shrunken value is not the most recent allocation.
inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
                                    std::size_t final)
{   arithlib_assert(final>0 && n>=final);
// Here I might need to write a nice dummy object into the gap left by
// shrinking the object.
    return confirm_size(p, n, final);
}
2074 
// Convert between a raw pointer to bignum digits and a tagged Lisp atom
// handle: digits sit 8 bytes beyond the (tagATOM-tagged) header.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{   return reinterpret_cast<std::intptr_t>(p) - 8 + tagATOM;
}

// Inverse of vector_to_handle.
inline std::uint64_t *vector_of_handle(std::intptr_t n)
{   return reinterpret_cast<std::uint64_t *>(n + 8 - tagATOM);
}
2082 
number_size(std::uint64_t * p)2083 inline std::size_t number_size(std::uint64_t *p)
2084 {
2085 // The odd looking cast here is because in arithlib I am passing around
2086 // arrays of explicitly 64-bit values, however in the underlying Lisp
2087 // I expect to be modelling memory as made up of intptr-sized items
2088 // that I arrange to have aligned on 8-byte boundaries. So to show some
2089 // though about strict aliasing and the like I will access memory as
2090 // an array of LispObject things when I access the header of an item.
2091     std::uintptr_t h = reinterpret_cast<std::uintptr_t>()*
2092                        (LispObject *)&p[-1];
2093     std::size_t r = veclength(h);
2094 // On 32-bit systems a bignum will have a wasted 32-bit word after the
2095 // header and before the digits, so that the digits are properly aligned
2096 // in memory. The result will be that the bignum is laid out as follows
2097 //      |     hdr64     | digit64 | digit64 | ... |    (64-bit world)
2098 //      | hdr32 | gap32 | digit64 | digit64 | ... |    (32-bit world)
2099 // The length value packed into the header is the length of the vector
2100 // excluding its header.
2101 //  if (sizeof(LispObject) == 4) r -= 4; { the remaindering does all I need! }
2102     r = r/sizeof(std::uint64_t);
2103     arithlib_assert(r>0);
2104     return r;
2105 }
2106 
// Fixnum handling delegates to the underlying Lisp's tagging scheme.
inline bool stored_as_fixnum(std::intptr_t a)
{   return isFIXNUM(a);
}

// Unpack the signed value stored in a Lisp fixnum.
constexpr inline std::int64_t int_of_handle(std::intptr_t a)
{   return qfixnum(a);
}

// Pack a small integer as a Lisp fixnum; only valid when the value is in
// [MIN_FIXNUM, MAX_FIXNUM].
inline std::intptr_t int_to_handle(std::int64_t n)
{   return packfixnum(n);
}

// True when a is representable as an immediate fixnum.
inline bool fits_into_fixnum(std::int64_t a)
{   return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
}
2122 
// Discard a bignum work-space; reclamation is left to the Lisp system.
inline void abandon(std::uint64_t *p)
{   // No need to do anything! But MIGHT reset fringe pointer?
}

// Handle variant: immediates need no action; vectors delegate to the
// pointer overload above (also a no-op at present).
inline void abandon(std::intptr_t p)
{   if (!stored_as_fixnum(p) && p!=0)
    {   std::uint64_t *pp = vector_of_handle(p);
        abandon(pp);
    }
}

// Allocate space for a string of n characters; the returned pointer is
// just past the header word of the freshly allocated atom.
inline char *reserve_string(std::size_t n)
{   LispObject a = allocateatom(n);
    return reinterpret_cast<char *>(a - tagATOM + sizeof(LispObject));
}
2138 
// Finalise a string built with reserve_string: write the Lisp string
// header just before the character data and return a tagged handle.
inline LispObject confirm_size_string(char *p, std::size_t n,
                                      std::size_t final)
{   LispObject *a = (LispObject *)(p - sizeof(LispObject));
// On 32-bit platforms I do not have a padder word before string data
// so things are simpler here than when confirming the size of a number.
    *a = tagHDR + typeSTRING + packlength(final);
    return (LispObject)a +tagATOM;
}

// Discard a string; nothing to do with this allocator.
inline void abandon_string(string_handle s)
{   // Do nothing.
}
2151 
// Dispatch helpers: unpack fixnum/bignum handle operands and forward to
// the matching overload of OP::op, returning a RES.
template <class OP,class RES>
inline RES op_dispatch1(std::intptr_t a1)
{   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
    else return OP::op(vector_of_handle(a1));
}

template <class OP,class RES>
inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
{   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
    else return OP::op(vector_of_handle(a1), n);
}

template <class OP,class RES>
inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
{   if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
    else return OP::op(vector_of_handle(a1), n);
}

// Two-operand dispatch: four cases covering each fixnum/bignum mixture.
template <class OP,class RES>
inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
{   if (stored_as_fixnum(a1))
    {   if (stored_as_fixnum(a2))
            return OP::op(int_of_handle(a1), int_of_handle(a2));
        else return OP::op(int_of_handle(a1), vector_of_handle(a2));
    }
    else
    {   if (stored_as_fixnum(a2))
            return OP::op(vector_of_handle(a1), int_of_handle(a2));
        else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
    }
}
2183 
// Make a fresh copy of the bignum p. The push/pop around reserve()
// protects p in case the allocation triggers a garbage collection.
inline std::intptr_t always_copy_bignum(std::uint64_t *p)
{   std::size_t n = number_size(p);
    push(p);
    std::uint64_t *r = reserve(n);
    pop(p);
    std::memcpy(r, p, n*sizeof(std::uint64_t));
    return confirm_size(r, n, n);
}

// Required by generic code; with this memory model it copies
// unconditionally, exactly as always_copy_bignum does.
inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
{   std::size_t n = number_size(p);
    push(p);
    std::uint64_t *r = reserve(n);
    pop(p);
    std::memcpy(r, p, n*sizeof(std::uint64_t));
    return confirm_size(r, n, n);
}

// Handle variant: immediate fixnums are returned unchanged; bignums are
// copied as above.
inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
{   if (stored_as_fixnum(pp)) return pp;
    std::uint64_t *p = vector_of_handle(pp);
    std::size_t n = number_size(p);
    push(p);
    std::uint64_t *r = reserve(n);
    pop(p);
    std::memcpy(r, p, n*sizeof(std::uint64_t));
    return confirm_size(r, n, n);
}
2212 
2213 #endif // !CSL
2214 
2215 #else // none if MALLOC, LISP or NEW specified.
2216 #error Unspecified memory model
2217 #endif
2218 
2219 // The main arithmetic operations are supported by code that can work on
2220 // Bignums stored as vectors of digits or on Fixnums represented as (tagged)
2221 // immediate data, or as mixtures. For each operation there is a class, and
2222 // methods called "op" within it deal with the various cases via overloading.
2223 
// Binary addition over any mixture of fixnum and bignum operands.
class Plus
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Add a signed 64-bit value to a number given by handle.
inline std::intptr_t bigplus_small(std::intptr_t, std::int64_t);

// Binary subtraction.
class Difference
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Subtraction with the operands taken in reverse order (presumably
// second-minus-first; confirm against the definitions later in the file).
class RevDifference
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Binary multiplication.
class Times
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};
2261 
// Integer quotient.
class Quotient
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Integer remainder.
class Remainder
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Division. The overloads with a trailing std::intptr_t& parameter
// deliver a second result through that reference (presumably the
// remainder alongside the quotient; see the definitions later on).
class Divide
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
    static std::intptr_t op(std::int64_t, std::int64_t, std::intptr_t &);
    static std::intptr_t op(std::int64_t, std::uint64_t *,
                            std::intptr_t &);
    static std::intptr_t op(std::uint64_t *, std::int64_t,
                            std::intptr_t &);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *,
                            std::intptr_t &);
};
2295 
// Greatest common divisor.
class Gcd
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Least common multiple.
class Lcm
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise AND.
class Logand
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise inclusive OR.
class Logor
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise exclusive OR.
class Logxor
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise equivalence (the complement of XOR).
class Logeqv
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};
2349 
// Unary numeric predicates, each with a fixnum and a bignum overload.
// Digit pointers are spelled std::uint64_t throughout for consistency
// with the rest of this header (several of these previously used the
// unqualified uint64_t).

// Is the value zero?
class Zerop
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Is the value one?
class Onep
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Is the value negative?
class Minusp
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Is the value even?
class Evenp
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Is the value odd?
class Oddp
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};
2384 
// Numeric equality (=) across fixnums, bignums and floating types.
class Eqn
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
// Even comparing a floating point number with an integer for equality
// turns out to be messier than one might have hoped!
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Numeric inequality: the negation of Eqn.
class Neqn
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
// Even comparing a floating point number with an integer for equality
// turns out to be messier than one might have hoped!
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};
2434 
// Ordering comparison: greater-than-or-equal.
class Geq
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
// Comparing a bignum against a floating point value has multiple cases
// to consider, but needs special implementation so that neither rounding
// nor overflow nor Infinities/NaNs lead to incorrect results.
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Ordering comparison: strictly greater-than.
class Greaterp
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Ordering comparison: less-than-or-equal.
class Leq
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Ordering comparison: strictly less-than.
class Lessp
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};
2529 
// Unary arithmetic operations, each with a fixnum and a bignum overload.
// Digit pointers are spelled std::uint64_t throughout for consistency
// with the rest of this header.

// Increment: the argument plus one.
class Add1
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Decrement: the argument minus one.
class Sub1
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Unary negation.
class Minus
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Absolute value.
class Abs
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Squaring.
class Square
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Integer square root (see the definition later in the file for the
// exact convention used).
class Isqrt
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Bitwise NOT of the twos-complement representation.
class Lognot
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};
2578 
// Exponentiation; the double overloads deliver a floating result.
class Pow
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
    static double op(std::int64_t, double);
    static double op(std::uint64_t *, double);
};

// Shift left by the amount given in the second operand.
class LeftShift
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Shift right by the amount given in the second operand.
class RightShift
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};
2607 
// Bit-level queries and narrowing conversions, each with a fixnum and a
// bignum overload. Digit pointers are spelled std::uint64_t throughout
// for consistency with the rest of this header.

// Number of significant bits in the argument.
class IntegerLength
{
public:
    static std::size_t op(std::int64_t w);
    static std::size_t op(std::uint64_t *w);
};

// Position of the lowest set bit (see the definition later in the file
// for the convention used on zero and negative inputs).
class Low_bit
{
public:
    static std::size_t op(std::int64_t w);
    static std::size_t op(std::uint64_t *w);
};

// Test the bit whose index is given by the second argument.
class Logbitp
{
public:
    static bool op(std::int64_t, std::size_t);
    static bool op(std::uint64_t *, std::size_t);
};

// Count of set bits (see the definition for the treatment of negative
// values).
class Logcount
{
public:
    static std::size_t op(std::int64_t w);
    static std::size_t op(std::uint64_t *w);
};

// Conversion to a native std::int64_t.
class Int64_t
{
public:
    static std::int64_t op(std::int64_t w);
    static std::int64_t op(std::uint64_t *w);
};

// Conversion to a native std::uint64_t.
class Uint64_t
{
public:
    static std::uint64_t op(std::int64_t w);
    static std::uint64_t op(std::uint64_t *w);
};

// Conversion to single-precision floating point.
class Float
{
public:
    static float op(std::int64_t w);
    static float op(std::uint64_t *w);
};

// Conversion to double-precision floating point.
class Double
{
public:
    static double op(std::int64_t w);
    static double op(std::uint64_t *w);
};
2663 
// frexp-style decomposition: the floating result is the fraction and the
// reference parameter receives the exponent. Digit pointers are spelled
// std::uint64_t for consistency with the rest of this header.
class Frexp
{
public:
    static double op(std::int64_t, std::int64_t &x);
    static double op(std::uint64_t *, std::int64_t &x);
};

#ifdef softfloat_h

// Conversion to 128-bit software floating point.
class Float128
{
public:
    static float128_t op(std::int64_t w);
    static float128_t op(std::uint64_t *w);
};

// frexp-style decomposition for 128-bit software floats.
class Frexp128
{
public:
    static float128_t op(std::int64_t, std::int64_t &x);
    static float128_t op(std::uint64_t *, std::int64_t &x);
};

#endif // softfloat_h
2688 
#ifdef CSL

// Modular arithmetic operations (the modulus is presumably the one
// established via SetModulus below; confirm against the definitions).
// Digit pointers are spelled std::uint64_t throughout for consistency
// with the rest of this header.

// Modular addition.
class ModularPlus
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Modular subtraction.
class ModularDifference
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Modular multiplication.
class ModularTimes
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Modular exponentiation.
class ModularExpt
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Modular division.
class ModularQuotient
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Modular negation.
class ModularMinus
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Modular multiplicative inverse.
class ModularReciprocal
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Reduce a number into the modular range.
class ModularNumber
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Establish the modulus used by the operations above.
class SetModulus
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

#endif // CSL
2765 
// Convert a number (given by handle) to its textual representation in
// decimal, hexadecimal, octal or binary.
inline string_handle bignum_to_string(std::intptr_t aa);
inline string_handle bignum_to_string_hex(std::intptr_t aa);
inline string_handle bignum_to_string_octal(std::intptr_t aa);
inline string_handle bignum_to_string_binary(std::intptr_t aa);

class Bignum;

// Debugging helpers: print a labelled rendering of a number given as raw
// digits, as a handle, or as a wrapped Bignum.
inline void display(const char *label, const std::uint64_t *a,
                    std::size_t lena);
inline void display(const char *label, std::intptr_t a);
inline void display(const char *label, const Bignum &a);
2777 
2778 
2779 //=========================================================================
2780 //=========================================================================
2781 // I have a class Bignum that wraps up the representation of a number
2782 // and then allows me to overload most operators so that big numbers can be
// used in C++ code almost as if they were a natural proper type. The main
2784 // big oddity will be that to denote a Bignum literal it will be necessary
2785 // to use a constructor, with obvious constructors accepting integers of up
2786 // to 64-bits and a perhaps less obvious one taking a string that is the
2787 // decimal denotation of the integer concerned.
2788 //=========================================================================
2789 //=========================================================================
2790 
2791 class Bignum
2792 {
2793 public:
// A Bignum has only one data field, and that is simple plain data.
2795     std::intptr_t val;
2796 
2797 
2798 // A default constructor build a Bignum with no stored data.
Bignum()2799     Bignum()
2800     {   val = 0;
2801     }
2802 // In the next constructor the boolean argument is not used at run time but
2803 // serves to indicate which constructor is wanted.
Bignum(bool set_val,std::intptr_t v)2804     Bignum(bool set_val, std::intptr_t v)
2805     {   val = v;
2806     }
~Bignum()2807     ~Bignum()
2808     {   abandon(val);
2809         val = 0;
2810     }
Bignum(std::uint64_t * p)2811     Bignum(std::uint64_t *p)
2812     {   val = vector_to_handle(p);
2813     }
2814 // The code here is more complicated than I would have liked. What I want is
2815 // that ANY sort of C++ integer can be converted to a Bignum. My first
2816 // attempts arranges that int32_t and int64_t could be, however there is
2817 // no guarantee that just coping with all the width-specified cases will
2818 // then cover mere "int" and "long". So what I now use is a template
2819 // definition filtered with magic that constrains it to only matching the
2820 // template parameter against some sort of integer. In some cases (such as
2821 // here) I provide one version to deal with all the signed integer cases
2822 // and another all the unsigned ones.
2823 // In general I will receive integers this way and cast them to 64-bit
2824 // values. This means that if the platform happens to provide intmax_t
2825 // that is wider than that then it will not be handled well!
2826     template <typename T,
2827               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
2828               typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
Bignum(T n)2829     Bignum(T n)
2830     {   val = int_to_bignum(static_cast<std::int64_t>(n));
2831     }
2832     template <typename T,
2833               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
2834               typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
Bignum(T n)2835     Bignum(T n)
2836     {   val = unsigned_int_to_bignum(static_cast<std::uint64_t>(n));
2837     }
Bignum(float d)2838     Bignum(float d)
2839     {   val = round_double_to_int(static_cast<double>(d));
2840     }
Bignum(double d)2841     Bignum(double d)
2842     {   val = round_double_to_int(d);
2843     }
2844 #ifdef softfloat_h
Bignum(float128_t d)2845     Bignum(float128_t d)
2846     {   val = round_float128_to_int(d);
2847     }
2848 #endif // softfloat_h
Bignum(const char * s)2849     Bignum(const char *s)
2850     {   val = string_to_bignum(s);
2851     }
Bignum(const Bignum & a)2852     Bignum(const Bignum &a)
2853     {   val = copy_if_no_garbage_collector(a.val);
2854     }
2855     template <typename T,
2856               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
2857               typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
operator T()2858     operator T()
2859     {   return static_cast<T>(op_dispatch1<Int64_t, std::int64_t>(val));
2860     }
2861     template <typename T,
2862               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
2863               typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
operator T()2864     operator T()
2865     {   return static_cast<T>(op_dispatch1<Uint64_t, std::uint64_t>(val));
2866     }
operator double()2867     operator double()
2868     {   return op_dispatch1<Double, double>(val);
2869     }
vec() const2870     std::uint64_t *vec() const
2871     {   return vector_of_handle(val);
2872     }
2873 
2874 // In a way that is BAD I make the result of an assignment void rather than
2875 // the value that is assigned. This is so I do not make gratuitous extra
2876 // copies of it in the common case where the value is not used, but it could
2877 // catch out the unwary.
operator =(const Bignum & x)2878     inline void operator = (const Bignum &x)
2879     {   if (this == &x) return; // assign to self - a silly case!
2880         abandon(val);
2881 // See comment in the copy constructor.
2882         val = copy_if_no_garbage_collector(x.val);
2883     }
2884 
2885     template <typename T,
2886               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
2887               typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
operator =(const T x)2888     inline void operator = (const T x)
2889     {   abandon(val);
2890         val = int_to_bignum(static_cast<std::int64_t>(x));
2891     }
2892 
2893     template <typename T,
2894               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
2895               typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
operator =(const T x)2896     inline void operator = (const T x)
2897     {   abandon(val);
2898         val = unsigned_int_to_bignum(static_cast<std::uint64_t>(x));
2899     }
2900 
operator =(const char * x)2901     inline void operator = (const char *x)
2902     {   abandon(val);
2903         val = string_to_bignum(x);
2904     }
2905 
operator +(const Bignum & x) const2906     inline Bignum operator +(const Bignum &x) const
2907     {   return Bignum(true, op_dispatch2<Plus,std::intptr_t>(val, x.val));
2908     }
2909 
2910     template <typename T,
2911               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator +(const T x) const2912     inline Bignum operator +(const T x) const
2913     {   return Bignum(true, op_dispatch2<Plus,std::intptr_t>(val,
2914                       Bignum(x).val));
2915     }
2916 
operator -(const Bignum & x) const2917     inline Bignum operator -(const Bignum &x) const
2918     {   return Bignum(true, op_dispatch2<Difference,std::intptr_t>(val,
2919                       x.val));
2920     }
2921 
2922     template <typename T,
2923               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator -(const T x) const2924     inline Bignum operator -(const T x) const
2925     {   return Bignum(true, op_dispatch2<Difference,std::intptr_t>(val,
2926                       Bignum(x).val));
2927     }
2928 
operator *(const Bignum & x) const2929     inline Bignum operator *(const Bignum &x) const
2930     {   return Bignum(true, op_dispatch2<Times,std::intptr_t>(val,
2931                       x.val));
2932     }
2933 
2934     template <typename T,
2935               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator *(const T x) const2936     inline Bignum operator *(const T x) const
2937     {   return Bignum(true, op_dispatch2<Times,std::intptr_t>(val,
2938                       Bignum(x).val));
2939     }
2940 
operator /(const Bignum & x) const2941     inline Bignum operator /(const Bignum &x) const
2942     {   return Bignum(true, op_dispatch2<Quotient,std::intptr_t>(val,
2943                       x.val));
2944     }
2945 
2946     template <typename T,
2947               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator /(const T x) const2948     inline Bignum operator /(const T x) const
2949     {   return Bignum(true, op_dispatch2<Quotient,std::intptr_t>(val,
2950                       Bignum(x).val));
2951     }
2952 
operator %(const Bignum & x) const2953     inline Bignum operator %(const Bignum &x) const
2954     {   return Bignum(true, op_dispatch2<Remainder,std::intptr_t>(val,
2955                       x.val));
2956     }
2957 
2958     template <typename T,
2959               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator %(const T x) const2960     inline Bignum operator %(const T x) const
2961     {   return Bignum(true, op_dispatch2<Remainder,std::intptr_t>(val,
2962                       Bignum(x).val));
2963     }
2964 
operator -() const2965     inline Bignum operator -() const
2966     {   return Bignum(true, op_dispatch1<Minus,std::intptr_t>(val));
2967     }
2968 
operator &(const Bignum & x) const2969     inline Bignum operator &(const Bignum &x) const
2970     {   return Bignum(true, op_dispatch2<Logand,std::intptr_t>(val,
2971                       x.val));
2972     }
2973 
2974     template <typename T,
2975               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator &(const T x) const2976     inline Bignum operator &(const T x) const
2977     {   return Bignum(true, op_dispatch2<Logand,std::intptr_t>(val,
2978                       Bignum(x).val));
2979     }
2980 
operator |(const Bignum & x) const2981     inline Bignum operator |(const Bignum &x) const
2982     {   return Bignum(true, op_dispatch2<Logor,std::intptr_t>(val,
2983                       x.val));
2984     }
2985 
2986     template <typename T,
2987               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator |(const T x) const2988     inline Bignum operator |(const T x) const
2989     {   return Bignum(true, op_dispatch2<Logor,std::intptr_t>(val,
2990                       Bignum(x).val));
2991     }
2992 
operator ^(const Bignum & x) const2993     inline Bignum operator ^(const Bignum &x) const
2994     {   return Bignum(true, op_dispatch2<Logxor,std::intptr_t>(val,
2995                       x.val));
2996     }
2997 
2998     template <typename T,
2999               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator ^(const T x) const3000     inline Bignum operator ^(const T x) const
3001     {   return Bignum(true, op_dispatch2<Logxor,std::intptr_t>(val,
3002                       Bignum(x).val));
3003     }
3004 
3005     template <typename T,
3006               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator <<(T n) const3007     inline Bignum operator <<(T n) const
3008     {   return Bignum(true, op_dispatch1<LeftShift,std::intptr_t>(val,
3009                       static_cast<std::int64_t>(n)));
3010     }
3011 
operator <<(Bignum n) const3012     inline Bignum operator <<(Bignum n) const
3013     {   return Bignum(true, op_dispatch1<LeftShift,std::intptr_t>(val,
3014                       static_cast<std::int64_t>(n)));
3015     }
3016 
3017     template <typename T,
3018               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator >>(T n) const3019     inline Bignum operator >>(T n) const
3020     {   return Bignum(true, op_dispatch1<RightShift,std::intptr_t>(val,
3021                       static_cast<std::int64_t>(n)));
3022     }
3023 
operator >>(Bignum n) const3024     inline Bignum operator >>(Bignum n) const
3025     {   return Bignum(true, op_dispatch1<RightShift,std::intptr_t>(val,
3026                       static_cast<std::int64_t>(n)));
3027     }
3028 
operator ~() const3029     inline Bignum operator ~() const
3030     {   return Bignum(true, op_dispatch1<Lognot,std::intptr_t>(val));
3031     }
3032 
operator ==(const Bignum & x) const3033     inline bool operator ==(const Bignum &x) const
3034     {   return op_dispatch2<Eqn,bool>(val, x.val);
3035     }
3036     template <typename T,
3037               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator ==(const T x) const3038     inline bool operator ==(const T x) const
3039     {   return op_dispatch2<Eqn,bool>(val, Bignum(x).val);
3040     }
3041 
operator !=(const Bignum & x) const3042     inline bool operator !=(const Bignum &x) const
3043     {   return !op_dispatch2<Eqn,bool>(val, x.val);
3044     }
3045     template <typename T,
3046               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator !=(const T x) const3047     inline bool operator !=(const T x) const
3048     {   return !op_dispatch2<Eqn,bool>(val, Bignum(x).val);
3049     }
3050 
operator >(const Bignum & x) const3051     inline bool operator >(const Bignum &x) const
3052     {   return op_dispatch2<Greaterp,bool>(val, x.val);
3053     }
3054     template <typename T,
3055               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator >(const T x) const3056     inline bool operator >(const T x) const
3057     {   return op_dispatch2<Greaterp,bool>(val, Bignum(x).val);
3058     }
3059 
operator >=(const Bignum & x) const3060     inline bool operator >=(const Bignum &x) const
3061     {   return op_dispatch2<Geq,bool>(val, x.val);
3062     }
3063     template <typename T,
3064               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator >=(const T x) const3065     inline bool operator >=(const T x) const
3066     {   return op_dispatch2<Geq,bool>(val, Bignum(x).val);
3067     }
3068 
operator <(const Bignum & x) const3069     inline bool operator <(const Bignum &x) const
3070     {   return op_dispatch2<Lessp,bool>(val, x.val);
3071     }
3072     template <typename T,
3073               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator <(const T x) const3074     inline bool operator <(const T x) const
3075     {   return op_dispatch2<Lessp,bool>(val, Bignum(x).val);
3076     }
3077 
operator <=(const Bignum & x) const3078     inline bool operator <=(const Bignum &x) const
3079     {   return op_dispatch2<Leq,bool>(val, x.val);
3080     }
3081     template <typename T,
3082               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator <=(const T x) const3083     inline bool operator <=(const T x) const
3084     {   return op_dispatch2<Leq,bool>(val, Bignum(x).val);
3085     }
3086 
operator +=(const Bignum & x)3087     inline void operator +=(const Bignum &x)
3088     {   std::intptr_t r = op_dispatch2<Plus,std::intptr_t>(val, x.val);
3089         abandon(val);
3090         val = r;
3091     }
3092 
3093     template <typename T,
3094               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator +=(T x)3095     inline void operator +=(T x)
3096     {   std::intptr_t r = op_dispatch2<Plus,std::intptr_t>(val,
3097                           Bignum(x).val);
3098         abandon(val);
3099         val = r;
3100     }
3101 
operator -=(const Bignum & x)3102     inline void operator -=(const Bignum &x)
3103     {   std::intptr_t r = op_dispatch2<Difference,std::intptr_t>(val,
3104                           x.val);
3105         abandon(val);
3106         val = r;
3107     }
3108 
3109     template <typename T,
3110               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator -=(T x)3111     inline void operator -=(T x)
3112     {   std::intptr_t r = op_dispatch2<Difference,std::intptr_t>(val,
3113                           Bignum(x).val);
3114         abandon(val);
3115         val = r;
3116     }
3117 
operator *=(const Bignum & x)3118     inline void operator *=(const Bignum &x)
3119     {   std::intptr_t r = op_dispatch2<Times,std::intptr_t>(val, x.val);
3120         abandon(val);
3121         val = r;
3122     }
3123     template <typename T,
3124               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator *=(T x)3125     inline void operator *=(T x)
3126     {   std::intptr_t r = op_dispatch2<Times,std::intptr_t>(val,
3127                           Bignum(x).val);
3128         abandon(val);
3129         val = r;
3130     }
3131 
3132 
operator /=(const Bignum & x)3133     inline void operator /=(const Bignum &x)
3134     {   std::intptr_t r = op_dispatch2<Quotient,std::intptr_t>(val,
3135                           x.val);
3136         abandon(val);
3137         val = r;
3138     }
3139     template <typename T,
3140               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator /=(T x)3141     inline void operator /=(T x)
3142     {   std::intptr_t r = op_dispatch2<Quotient,std::intptr_t>(val,
3143                           Bignum(x).val);
3144         abandon(val);
3145         val = r;
3146     }
3147 
3148 
operator %=(const Bignum & x)3149     inline void operator %=(const Bignum &x)
3150     {   std::intptr_t r = op_dispatch2<Remainder,std::intptr_t>(val,
3151                           x.val);
3152         abandon(val);
3153         val = r;
3154     }
3155 
3156     template <typename T,
3157               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator %=(T x)3158     inline void operator %=(T x)
3159     {   std::intptr_t r = op_dispatch2<Remainder,std::intptr_t>(val,
3160                           Bignum(x).val);
3161         abandon(val);
3162         val = r;
3163     }
3164 
operator &=(const Bignum & x)3165     inline void operator &=(const Bignum &x)
3166     {   std::intptr_t r = op_dispatch2<Logand,std::intptr_t>(val, x.val);
3167         abandon(val);
3168         val = r;
3169     }
3170     template <typename T,
3171               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator &=(T x)3172     inline void operator &=(T x)
3173     {   std::intptr_t r = op_dispatch2<Logand,std::intptr_t>(val,
3174                           Bignum(x).val);
3175         abandon(val);
3176         val = r;
3177     }
3178 
operator |=(const Bignum & x)3179     inline void operator |=(const Bignum &x)
3180     {   std::intptr_t r = op_dispatch2<Logor,std::intptr_t>(val, x.val);
3181         abandon(val);
3182         val = r;
3183     }
3184 
3185     template <typename T,
3186               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator |=(T x)3187     inline void operator |=(T x)
3188     {   std::intptr_t r = op_dispatch2<Logor,std::intptr_t>(val,
3189                           Bignum(x).val);
3190         abandon(val);
3191         val = r;
3192     }
3193 
operator ^=(const Bignum & x)3194     inline void operator ^=(const Bignum &x)
3195     {   std::intptr_t r = op_dispatch2<Logxor,std::intptr_t>(val, x.val);
3196         abandon(val);
3197         val = r;
3198     }
3199 
3200     template <typename T,
3201               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator ^=(T x)3202     inline void operator ^=(T x)
3203     {   std::intptr_t r = op_dispatch2<Logxor,std::intptr_t>(val,
3204                           Bignum(x).val);
3205         abandon(val);
3206         val = r;
3207     }
3208 
3209     template <typename T,
3210               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator <<=(T n)3211     inline void operator <<=(T n)
3212     {   std::intptr_t r = op_dispatch1<LeftShift,std::intptr_t>(val,
3213                           static_cast<std::int64_t>(n));
3214         abandon(val);
3215         val = r;
3216     }
3217 
3218     template <typename T,
3219               typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
operator >>=(T n)3220     inline void operator >>=(T n)
3221     {   std::intptr_t r = op_dispatch1<RightShift,std::intptr_t>(val,
3222                           static_cast<std::int64_t>(n));
3223         abandon(val);
3224         val = r;
3225     }
3226 
operator ++()3227     inline Bignum operator ++()
3228     {   std::intptr_t r = bigplus_small(val, 1);
3229         abandon(val);
3230         val = r;
3231         return *this;
3232     }
3233 
operator ++(int)3234     inline Bignum operator ++(int)
3235     {   std::intptr_t r = bigplus_small(val, 1);
3236 // I assign explicitly to oldval.val because trying to use a constructor
3237 // of Bignum or assigning to one would so things more complicated than I want!
3238         Bignum oldval;
3239         oldval.val = val;
3240         val = r;
3241         return oldval;
3242     }
3243 
operator --()3244     inline Bignum operator --()
3245     {   std::intptr_t r = bigplus_small(val, -1);
3246         abandon(val);
3247         val = r;
3248         return *this;
3249     }
3250 
operator --(int)3251     inline Bignum operator --(int)
3252     {   std::intptr_t r = bigplus_small(val, -1);
3253         Bignum oldval;
3254         oldval.val = val;
3255         val = r;
3256         return oldval;
3257     }
3258 
operator <<(std::ostream & out,const Bignum & a)3259     friend std::ostream & operator << (std::ostream &out, const Bignum &a)
3260     {   std::ios_base::fmtflags fg = out.flags();
3261 #if defined LISP && !defined ZAPPA
3262         LispObject s;
3263 #else
3264         char *s;
3265 #endif
3266         if ((static_cast<unsigned int>(fg) & std::ios_base::hex) != 0U)
3267             s = bignum_to_string_hex(a.val);
3268         else if ((static_cast<unsigned int>(fg) & std::ios_base::oct) != 0U)
3269             s = bignum_to_string_octal(a.val);
3270         else if ((static_cast<unsigned int>(fg) & std::ios_base::dec) != 0U)
3271             s = bignum_to_string(a.val);
3272         else if (radix::is_binary_output(out))
3273             s = bignum_to_string_binary(a.val);
3274         else s = bignum_to_string(a.val);
3275 #if defined LISP && !defined ZAPPA
3276         std::string ss(s, length_of_byteheader(qheader(s)));
3277         out << ss;
3278 #else
3279         out << s;
3280 #endif
3281         abandon_string(s);
3282         return out;
3283     }
operator >>(std::istream & in,Bignum & a)3284     friend std::istream & operator >> (std::istream &in, Bignum &a)
3285     {   std::int64_t n;
3286 // What I really want to do is to read in a string of digits and then
3287 // use string_to_bignum().
3288         in >> n;
3289         abandon(a.val);
3290         a.val = int_to_bignum(n);
3291         return in;
3292     }
3293 };
3294 
3295 // I use a suffix "_Z" for bignums, with Z chosen to reminding me that this
3296 // gives me an Integer, the "Z" (typically written in a blackboard font)
3297 // standing for the ring of integers.
operator ""_Z(const char * s)3298 inline Bignum operator "" _Z(const char *s)
3299 {   return Bignum(s);
3300 }
3301 
to_string(Bignum x)3302 inline const string_handle to_string(Bignum x)
3303 {   return bignum_to_string(x.val);
3304 }
3305 
uniform_positive_bignum(std::size_t n)3306 inline Bignum uniform_positive_bignum(std::size_t n)
3307 {   return Bignum(true, uniform_positive(n));
3308 }
3309 
uniform_signed_bignum(std::size_t n)3310 inline Bignum uniform_signed_bignum(std::size_t n)
3311 {   return Bignum(true, uniform_signed(n));
3312 }
3313 
uniform_upto_bignum(Bignum a)3314 inline Bignum uniform_upto_bignum(Bignum a)
3315 {   return Bignum(true, uniform_upto(a.val));
3316 }
3317 
fudge_distribution_bignum(Bignum a,int n)3318 inline Bignum fudge_distribution_bignum(Bignum a, int n)
3319 {   return Bignum(true, fudge_distribution(a.val, n));
3320 }
3321 
random_upto_bits_bignum(std::size_t n)3322 inline Bignum random_upto_bits_bignum(std::size_t n)
3323 {   return Bignum(true, random_upto_bits(n));
3324 }
3325 
// x*x, via the dedicated squaring dispatcher.
inline Bignum square(const Bignum &value)
{   std::intptr_t handle = op_dispatch1<Square,std::intptr_t>(value.val);
    return Bignum(true, handle);
}
3329 
// Integer square root.
inline Bignum isqrt(const Bignum &value)
{   std::intptr_t handle = op_dispatch1<Isqrt,std::intptr_t>(value.val);
    return Bignum(true, handle);
}
3333 
// Absolute value.
inline Bignum abs(const Bignum &value)
{   std::intptr_t handle = op_dispatch1<Abs,std::intptr_t>(value.val);
    return Bignum(true, handle);
}
3337 
zerop(const Bignum & x)3338 inline bool zerop(const Bignum &x)
3339 {   return op_dispatch1<Zerop,bool>(x.val);
3340 }
3341 
onep(const Bignum & x)3342 inline bool onep(const Bignum &x)
3343 {   return op_dispatch1<Onep,bool>(x.val);
3344 }
3345 
minusp(const Bignum & x)3346 inline bool minusp(const Bignum &x)
3347 {   return op_dispatch1<Minusp,bool>(x.val);
3348 }
3349 
evenp(const Bignum & x)3350 inline bool evenp(const Bignum &x)
3351 {   return op_dispatch1<Evenp,bool>(x.val);
3352 }
3353 
oddp(const Bignum & x)3354 inline bool oddp(const Bignum &x)
3355 {   return op_dispatch1<Oddp,bool>(x.val);
3356 }
3357 template <typename T,
3358           typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
pow(const Bignum & x,T n)3359 inline Bignum pow(const Bignum &x, T n)
3360 {   if (n == 0) return Bignum(true, int_to_bignum(1));
3361     else if (n == 1) return Bignum(true,
3362                                        copy_if_no_garbage_collector(x.val));
3363     else if (n == 2) return square(x);
3364     else return Bignum(true,
3365                            op_dispatch1<Pow,std::intptr_t>(x.val, static_cast<std::int64_t>(n)));
3366 }
3367 
3368 inline double double_bignum(const Bignum &x);
3369 
pow(const Bignum & x,double n)3370 inline double pow(const Bignum &x, double n)
3371 {   return std::pow(double_bignum(x), n);
3372 }
3373 
// Greatest common divisor.
inline Bignum gcd(const Bignum &a, const Bignum &b)
{   std::intptr_t handle = op_dispatch2<Gcd,std::intptr_t>(a.val, b.val);
    return Bignum(true, handle);
}
3378 
// Least common multiple.
inline Bignum lcm(const Bignum &a, const Bignum &b)
{   std::intptr_t handle = op_dispatch2<Lcm,std::intptr_t>(a.val, b.val);
    return Bignum(true, handle);
}
3383 
fix_bignum(double d)3384 inline Bignum fix_bignum(double d)
3385 {   return Bignum(true, trunc_double_to_int(d));
3386 }
3387 
round_bignum(double d)3388 inline Bignum round_bignum(double d)
3389 {   return Bignum(true, round_double_to_int(d));
3390 }
3391 
trunc_bignum(double d)3392 inline Bignum trunc_bignum(double d)
3393 {   return Bignum(true, trunc_double_to_int(d));
3394 }
3395 
floor_bignum(double d)3396 inline Bignum floor_bignum(double d)
3397 {   return Bignum(true, floor_double_to_int(d));
3398 }
3399 
ceil_bignum(double d)3400 inline Bignum ceil_bignum(double d)
3401 {   return Bignum(true, ceiling_double_to_int(d));
3402 }
3403 
fix_bignum(float d)3404 inline Bignum fix_bignum(float d)
3405 {   return fix_bignum(static_cast<double>(d));
3406 }
3407 
round_bignum(float d)3408 inline Bignum round_bignum(float d)
3409 {   return round_bignum(static_cast<double>(d));
3410 }
3411 
trunc_bignum(float d)3412 inline Bignum trunc_bignum(float d)
3413 {   return trunc_bignum(static_cast<double>(d));
3414 }
3415 
floor_bignum(float d)3416 inline Bignum floor_bignum(float d)
3417 {   return floor_bignum(static_cast<double>(d));
3418 }
3419 
ceil_bignum(float d)3420 inline Bignum ceil_bignum(float d)
3421 {   return ceil_bignum(static_cast<double>(d));
3422 }
3423 
// Convert a Bignum to floating point via the Float dispatcher.
// NOTE(review): the dispatch computes a float but the function returns
// double, so the result is a float widened to double - confirm whether
// the return type was meant to be float.
inline double float_bignum(const Bignum &x)
{   return op_dispatch1<Float,float>(x.val);
}
3427 
double_bignum(const Bignum & x)3428 inline double double_bignum(const Bignum &x)
3429 {   return op_dispatch1<Double,double>(x.val);
3430 }
3431 
3432 // This will return a normalized double and an integer exponent.
3433 // It can be better than using frexp(double_bignum(x), ..) because it
3434 // avoids overflow.
3435 
frexp_bignum(const Bignum & x,std::int64_t & xx)3436 inline double frexp_bignum(const Bignum &x, std::int64_t &xx)
3437 {   return op_dispatch1<Frexp,double>(x.val, xx);
3438 }
3439 
3440 #ifdef softfloat_h
3441 
frexp128_bignum(const Bignum & x,std::int64_t & xx)3442 inline float128_t frexp128_bignum(const Bignum &x, std::int64_t &xx)
3443 {   return op_dispatch1<Frexp128,float128_t>(x.val, xx);
3444 }
3445 
float128_bignum(const Bignum & x)3446 inline float128_t float128_bignum(const Bignum &x)
3447 {   return op_dispatch1<Float128,float128_t>(x.val);
3448 }
3449 
3450 #endif // softfloat_h
3451 
3452 //=========================================================================
3453 // display() will show the internal representation of a bignum as a
3454 // sequence of hex values. This is obviously useful while debugging!
3455 //=========================================================================
3456 
// Debug display: print the digits of a (most significant first) in hex,
// three digits per line.
inline void display(const char *label, const std::uint64_t *a,
                    std::size_t lena)
{   std::cout << label << " [" << static_cast<int>(lena) << "]";
    std::size_t i = 0;
    while (i < lena)
    {   if (i != 0 && i%3 == 0) std::cout << std::endl << "     ";
        std::uint64_t digit = a[lena-i-1];
        std::cout << " "
                  << std::hex << std::setfill('0')
                  << "0x" << std::setw(16) << digit
                  << std::dec << std::setfill(' ');
        ++i;
    }
    std::cout << std::endl;
}
3469 
3470 // "rdisplay" is for generating trace output for use with Reduce.
3471 // The format is    name := 0xDDDDDDD$
3472 // which will be easy to copy and paste into Reduce.
3473 
// Trace output in Reduce syntax:   name := 0xDDDD...$
inline void rdisplay(const char *label, const std::uint64_t *a,
                     std::size_t lena)
{   std::cout << label << " := 0x";
    std::size_t i = 0;
    while (i < lena)
    {   std::uint64_t digit = a[lena-i-1];
        std::cout << std::hex << std::setfill('0')
                  << std::setw(16) << digit
                  << std::dec << std::setfill(' ');
        ++i;
    }
    std::cout << "$" << std::endl;
}
3484 
3485 // I provide a function that accesses (b<<shift)[n]. Note that the
// valid index values n will run from 0 up to and including lenb.
3487 
// Return digit n of (b << shift), where b has lenb digits and
// 0 <= shift < 64. Valid n runs from 0 up to and including lenb.
// The shift==0 case is handled separately: the general formula would
// compute b[..] >> (64-0), and a shift by 64 bits is undefined behaviour
// in C++.
inline std::uint64_t shifted_digit(std::uint64_t *b, std::size_t lenb,
                                   int shift, std::size_t n)
{   if (shift == 0) return n == lenb ? 0 : b[n];
    if (n == 0) return b[0]<<shift;
    else if (n == lenb) return b[lenb-1]>>(64-shift);
    else return (b[n]<<shift) | (b[n-1]>>(64-shift));
}
3494 
display(const char * label,std::uint64_t * a,std::size_t lena,int shift)3495 inline void display(const char *label, std::uint64_t *a,
3496                     std::size_t lena,
3497                     int shift)
3498 {   std::cout << label << " [" << static_cast<int>(lena) << "]";
3499     for (std::size_t i=0; i<=lena; i++)
3500     {   if (i!=0 && i%3==0) std::cout << std::endl << "     ";
3501         std::cout << " "
3502                   << std::hex << std::setfill('0')
3503                   << "0x" << std::setw(16)
3504                   << shifted_digit(a, lena, shift, lena-i)
3505                   << std::dec << std::setfill(' ');
3506     }
3507     std::cout << std::endl;
3508 }
3509 
display(const char * label,std::intptr_t a)3510 inline void display(const char *label, std::intptr_t a)
3511 {   if (stored_as_fixnum(a))
3512     {   std::cout << label << " [fixnum] " << std::hex
3513                   << "0x" << a << std::dec << " = "
3514                   << int_of_handle(a) << std::endl;
3515         return;
3516     }
3517     std::uint64_t *d = vector_of_handle(a);
3518     std::size_t len = number_size(d);
3519     std::cout << label << " [" << static_cast<int>(len) << "]";
3520     for (std::size_t i=0; i<len; i++)
3521         std::cout << " "
3522                   << std::hex << std::setfill('0')
3523                   << "0x" << std::setw(16) << d[len-i-1]
3524                   << std::dec << std::setfill(' ');
3525     std::cout << std::endl;
3526 }
3527 
// Debug display of a Bignum: delegates to the handle-based overload.
inline void display(const char *label, const Bignum &a)
{   display(label, a.val);
}
3531 
3532 
3533 //=========================================================================
3534 //=========================================================================
3535 // I will have a collection of low level functions that support the
3536 // fundamental operations needed for implementing big-number arithmetic:
3537 // add-with-carry, multiplication and division.
3538 //=========================================================================
3539 //=========================================================================
3540 
3541 #ifdef __GNUC__
3542 
3543 // Note that __GNUC__ also gets defined by clang on the Macintosh, so
3544 // this code is probably optimized there too. This must NEVER be called
3545 // with a zero argument.
3546 
3547 // Count the leading zeros in a 64-bit word.
3548 
// Count the leading zero bits of x, which must be nonzero.
inline int nlz(std::uint64_t x)
{   int leading = __builtin_clzll(x);  // the 64-bit builtin, not clz/clzl
    return leading;
}
3552 
// Count the set bits of x.
inline int popcount(std::uint64_t x)
{   int bits = __builtin_popcountll(x);  // 64-bit builtin
    return bits;
}
3556 
3557 #else // __GNUC__
3558 
// Portable leading-zero count: binary search over half-word, byte,
// nibble, pair and single-bit strides. x must be nonzero.
inline int nlz(std::uint64_t x)
{   int n = 0;
    for (int stride = 32; stride > 0; stride >>= 1)
    {   // limit has exactly `stride` leading zero bits set to 0.
        std::uint64_t limit = ~static_cast<std::uint64_t>(0) >> stride;
        if (x <= limit)
        {   n += stride;
            x <<= stride;
        }
    }
    return n;
}
3586 
// Portable count of the number of 1 bits in a 64-bit word, using the
// classic parallel bit-summation: each step adds adjacent groups of
// bits in place, doubling the group width, until the final line leaves
// the whole population count (0..64, so 7 bits suffice) in x.
inline int popcount(std::uint64_t x)
{   x = (x & 0x5555555555555555U) + (x >> 1 & 0x5555555555555555U);
    x = (x & 0x3333333333333333U) + (x >> 2 & 0x3333333333333333U);
    x = x + (x >> 4) & 0x0f0f0f0f0f0f0f0fU;
    x = x + (x >> 8);
    x = x + (x >> 16);
    x = x + (x >> 32) & 0x7f;
// Bug fix: the computed count was previously not returned at all, so
// the function fell off its end (undefined behaviour in C++).
    return static_cast<int>(x);
}
3595 
3596 #endif // __GNUC__
3597 
3598 // Round a size_t integer up to the next higher power of 2.
3599 // I do this based on counting the number of leading zeros in the
3600 // binary representation of n-1.
3601 
// Return the smallest power of 2 that is >= n, computed from the
// number of leading zeros in the binary representation of n-1 (so an
// exact power of 2 is returned unchanged).
// NOTE(review): for n <= 1 this evaluates nlz(0), which is undefined
// when the __builtin_clzll path is in use - callers appear to be
// expected to pass n >= 2; confirm.
inline std::size_t next_power_of_2(std::size_t n)
{   return (static_cast<std::size_t>(1)) << (64-nlz(
                static_cast<std::uint64_t>(n-1)));
}
3606 
// Return the base-2 logarithm of next_power_of_2(n), ie the smallest k
// such that (1<<k) >= n.
// NOTE(review): n <= 1 would evaluate nlz(0), which is undefined with
// the intrinsic version of nlz - confirm callers pass n >= 2.
inline unsigned int log_next_power_of_2(std::size_t n)
{   return (64-nlz(static_cast<std::uint64_t>(n-1)));
}
3610 
3611 // I am going to represent bignums as arrays of 64-bit digits.
3612 // Overall the representation will use 2s complement, and so all but the
3613 // top digit will be treated as unsigned, while the top one is signed
3614 // and the whole number must act as if it had its sign bit propagated
3615 // indefinitely to the left. When I pass numbers to the low level
3616 // code I will pass references to the input arrays and lengths. I will
// pass an array that will certainly be large enough to hold the result
3618 // and the arithmetic functions will return the length in it that is used.
3619 // This length will be such that the overall number does not have any
3620 // extraneous leading zeros or leading 0xffffffffffffffff words, save that
3621 // the value zero will be returned as a single word value not a no-word
3622 // one. Note the word "extraneous", because the positive value 2^64-1
3623 // will be represented as a 2-word item with 0 in the higher digit and
3624 // 0xffffffffffffffff in the lower one - the leading zero is needed so
3625 // that it is clear that the value is positive. A consequence of all this
3626 // is that any bignum with length 1 can be extracted as an int64_t without
3627 // loss.
3628 
3629 // I want "add-with-carry" operations, and so I provide a function here to
3630 // implement it. If the C++ compiler had a nice intrinsic I would like
3631 // to use that! Well Intel compilers have an _addcarry_u64 that passes and
3632 // returns the carry in an unsigned char and uses a pointer not a reference
3633 // argument for passing back the result.
3634 
3635 // a1 and a2 are 64-bit unsigned integers. While c_in is also that type it
3636 // must only have one of the values 0 or 1. The effect will be to set r to
3637 // the low 64-bits of a1+a2+c_in and return any carry that is generated.
3638 
3639 // I have an overload of add_with_carry for use where it is known that
// the input carry is zero. That case saves a small amount of work.
3641 // The code as written here seems to lead to a good compiled version using
3642 // g++ on x86_64 and -O3.
3643 
// Set r to the low 64 bits of a1+a2 and return 1 if the addition
// wrapped round (ie there is a carry out), otherwise 0. This is the
// variant for use when the incoming carry is known to be zero.
inline std::uint64_t add_with_carry(std::uint64_t a1,
                                    std::uint64_t a2,
                                    std::uint64_t &r)
{   r = a1 + a2;
    return r < a1 ? 1 : 0;
}
3649 
3650 // Now the general version with a carry-in. Note that in fact this version
3651 // will support any value in c_in, not merely a single bit. Thus the
3652 // carry_out can end up as 0, 1 or 2.
3653 
// Set r to the low 64 bits of a1+a2+a3 and return the carry out. a3
// is typically a carry-in, but any value is supported, so the result
// can be 0, 1 or 2.
inline std::uint64_t add_with_carry(std::uint64_t a1,
                                    std::uint64_t a2,
                                    std::uint64_t a3, std::uint64_t &r)
{   std::uint64_t partial = a1 + a3;
    std::uint64_t carry = partial < a3 ? 1 : 0;
    r = partial + a2;
    if (r < a2) carry++;
    return carry;
}
3661 
3662 // In some places my code may be made nicer by having a version that
3663 // adds 4 values.
3664 
// Set r to the low 64 bits of a1+a2+a3+a4 and return the carry out,
// which for four addends can be as large as 3. The two pairwise sums
// are formed first, each with its own overflow check, and then merged.
inline std::uint64_t add_with_carry(std::uint64_t a1,
                                    std::uint64_t a2,
                                    std::uint64_t a3, std::uint64_t a4,
                                    std::uint64_t &r)
{   std::uint64_t s12 = a1 + a2;
    std::uint64_t carry = s12 < a2 ? 1 : 0;
    std::uint64_t s34 = a3 + a4;
    if (s34 < a4) carry++;
    r = s12 + s34;
    if (r < s34) carry++;
    return carry;
}
3674 
3675 // subtract_with_borrow does
3676 //     r = a1 - a2 - b_in;
3677 // and returns 1 is there is a borrow out.
3678 
// Set r = a1 - a2 (mod 2^64) and return 1 when a borrow is needed,
// ie when a2 > a1. This is the variant with no borrow-in.
inline std::uint64_t subtract_with_borrow(std::uint64_t a1,
        std::uint64_t a2,
        std::uint64_t &r)
{   r = a1 - a2;
    return a1 < a2 ? 1 : 0;
}
3685 
// Set r = a1 - a2 - b_in (mod 2^64) and return the borrow out, which
// (as with the three-input add) can be 0, 1 or 2 since b_in is not
// restricted to being a single bit.
inline std::uint64_t subtract_with_borrow(std::uint64_t a1,
        std::uint64_t a2,
        std::uint64_t b_in, std::uint64_t &r)
{   std::uint64_t partial = a1 - b_in;
    std::uint64_t borrow = partial > a1 ? 1 : 0;
    r = partial - a2;
    if (r > partial) borrow++;
    return borrow;
}
3693 
3694 // I want code that will multiply two 64-bit values and yield a 128-bit
3695 // result. The result must be expressed as a pair of 64-bit integers.
3696 // If I have a type "__int128", as will often be the case when using gcc,
3697 // this is very easy to express. Otherwise I split the two inputs into
3698 // 32-bit halves, do 4 multiplications and some additions to construct
3699 // the result. At least I can keep the code portable, even if I can then
3700 // worry about performance a bit.
3701 
3702 
3703 
3704 
3705 #ifdef __SIZEOF_INT128__
3706 
// Well it seems that g++ and clang have different views about how to
// ask for unsigned 128-bit integers! So I abstract that away via a
// typedef called UINT128.
3710 
// NOTE(review): clang actually defines __clang__ (lower case), not
// __CLANG__, and its unsigned 128-bit type is spelt __uint128_t, so
// the first branch below appears to be dead code (and would not
// compile if it were ever selected). The #else branch is accepted by
// both gcc and clang. Confirm before tidying up.
#ifdef __CLANG__
typedef __int128  INT128;
typedef __uint128 UINT128;
#else // __CLANG__
typedef __int128  INT128;          // signed 128-bit integer type
typedef unsigned __int128 UINT128; // unsigned 128-bit integer type
#endif // __CLANG__
3718 
3719 // At least for debugging I may wish to display 128-bit integers. Here I
3720 // only do hex printing. I could do decimal and octal if I really wanted
3721 // but just for debugging that does not seem vital. If some C++ compiler
3722 // already supported printing of 128-bit ints this definition might clash
3723 // and would need commenting out.
3724 
// Print a 128-bit value in hex for debugging, as two 16-digit
// zero-filled halves separated by a space. The stream's base and fill
// character are restored to the usual defaults before returning.
inline std::ostream & operator << (std::ostream &out, UINT128 a)
{   out << std::hex << std::setfill('0') <<
        std::setw(16) << static_cast<std::uint64_t>(a>>64)
        << " "
// std::setw() only applies to the single next insertion, so it has to
// be repeated here: previously the low half was printed without it and
// so lost its leading zeros.
        << std::setw(16) << static_cast<std::uint64_t>(a)
        << std::dec << std::setfill(' ');
    return out;
}
3732 
pack128(std::uint64_t hi,std::uint64_t lo)3733 inline UINT128 pack128(std::uint64_t hi, std::uint64_t lo)
3734 {   return (static_cast<UINT128>(hi)<<64) | lo;
3735 }
3736 
// Multiply two unsigned 64-bit values giving the full 128-bit product,
// delivered as a high word and a low word.
inline void multiply64(std::uint64_t a, std::uint64_t b,
                       std::uint64_t &hi, std::uint64_t &lo)
{   unsigned __int128 product =
        static_cast<unsigned __int128>(a) *
        static_cast<unsigned __int128>(b);
    lo = static_cast<std::uint64_t>(product);
    hi = static_cast<std::uint64_t>(product >> 64);
}
3743 
3744 // Now much the same but forming a*b+c. Note that this can not overflow
3745 // the 128-bit result. Both hi and lo are only updated at the end
3746 // of this, and so they are allowed to be the same as other arguments.
3747 
// As above but forming a*b+c. This can not overflow the 128-bit
// result since (2^64-1)^2 + (2^64-1) < 2^128. hi and lo are written
// only at the end, so they are allowed to alias the arguments.
inline void multiply64(std::uint64_t a, std::uint64_t b,
                       std::uint64_t c,
                       std::uint64_t &hi, std::uint64_t &lo)
{   unsigned __int128 product =
        static_cast<unsigned __int128>(a) *
        static_cast<unsigned __int128>(b) + c;
    lo = static_cast<std::uint64_t>(product);
    hi = static_cast<std::uint64_t>(product >> 64);
}
3756 
// Signed 64x64 -> 128 bit multiply. The high word is signed and the
// low word unsigned, so together they make up a 2s complement 128-bit
// value.
inline void signed_multiply64(std::int64_t a, std::int64_t b,
                              std::int64_t &hi, std::uint64_t &lo)
{   __int128 product = static_cast<__int128>(a)*static_cast<__int128>(b);
    lo = static_cast<std::uint64_t>(product);
    hi = static_cast<std::int64_t>(
        static_cast<std::uint64_t>(
            static_cast<unsigned __int128>(product) >> 64));
}
3763 
// As above but forming a*b+c with c treated as unsigned. The addition
// is carried out on the unsigned 128-bit representation.
inline void signed_multiply64(std::int64_t a, std::int64_t b,
                              std::uint64_t c,
                              std::int64_t &hi, std::uint64_t &lo)
{   unsigned __int128 t = static_cast<unsigned __int128>(
        static_cast<__int128>(a)*static_cast<__int128>(b));
    t += c;
    lo = static_cast<std::uint64_t>(t);
    hi = static_cast<std::int64_t>(static_cast<std::uint64_t>(t >> 64));
}
3773 
3774 // divide {hi,lo} by divisor and generate a quotient and a remainder. The
3775 // version of the code that is able to use __int128 can serve as clean
3776 // documentation of the intent.
3777 
divide64(std::uint64_t hi,std::uint64_t lo,std::uint64_t divisor,std::uint64_t & q,std::uint64_t & r)3778 inline void divide64(std::uint64_t hi, std::uint64_t lo,
3779                      std::uint64_t divisor,
3780                      std::uint64_t &q, std::uint64_t &r)
3781 {   arithlib_assert(divisor != 0 && hi < divisor);
3782     UINT128 dividend = pack128(hi, lo);
3783     q = dividend / divisor;
3784     r = dividend % divisor;
3785 }
3786 
3787 #else // __SIZEOF_INT128__
3788 
// If the C++ system I am using does not support a 128-bit integer
3790 // type or if I have not detected it everything can still be done using
3791 // lots of 64-bit operations, with each 64-bit value often treated as
3792 // two 32-bit halves.
3793 
// Portable 64x64 -> 128 bit unsigned multiply, done by splitting each
// operand into two 32-bit halves and combining the four partial
// products. The result is delivered as the pair {hi,lo}.
inline void multiply64(std::uint64_t a, std::uint64_t b,
                       std::uint64_t &hi, std::uint64_t &lo)
{   std::uint64_t aHigh = a >> 32, aLow = a & 0xFFFFFFFFU;
    std::uint64_t bHigh = b >> 32, bLow = b & 0xFFFFFFFFU;
    std::uint64_t high = aHigh*bHigh;
    std::uint64_t low  = aLow*bLow;
// Sum the two cross products first. That sum can itself overflow
// 64 bits, and any such overflow represents 2^96 in the final result,
// ie bit 32 of the high word.
    std::uint64_t cross  = aLow*bHigh;
    std::uint64_t cross2 = aHigh*bLow;
    cross += cross2;
    if (cross < cross2) high += static_cast<std::uint64_t>(1) << 32;
// The cross-product sum sits 32 bits up, so split it across the two
// result words, propagating any carry out of the low word.
    high += cross >> 32;
    std::uint64_t shifted = cross << 32;
    low += shifted;
    if (low < shifted) high++;
    hi = high;
    lo = low;
}
3817 
3818 // Now much the same but forming a*b+c. Note that this can not overflow
3819 // the 128-bit result. Both hi and lo are only updated at the end
3820 // of this, and so they are allowed to be the same as other arguments.
3821 
// As above but forming a*b+c. This can not overflow the 128-bit
// result. hi and lo are written only at the end, so they are allowed
// to alias the arguments.
inline void multiply64(std::uint64_t a, std::uint64_t b,
                       std::uint64_t c,
                       std::uint64_t &hi, std::uint64_t &lo)
{   std::uint64_t aHigh = a >> 32, aLow = a & 0xFFFFFFFFU;
    std::uint64_t bHigh = b >> 32, bLow = b & 0xFFFFFFFFU;
    std::uint64_t high = aHigh*bHigh;
    std::uint64_t low  = aLow*bLow;
// Sum the two cross products first; overflow of that sum stands for
// 2^96, ie bit 32 of the high word.
    std::uint64_t cross  = aLow*bHigh;
    std::uint64_t cross2 = aHigh*bLow;
    cross += cross2;
    if (cross < cross2) high += static_cast<std::uint64_t>(1) << 32;
// Fold the (32-bit offset) cross-product sum into the two result words.
    high += cross >> 32;
    std::uint64_t shifted = cross << 32;
    low += shifted;
    if (low < shifted) high++;
// Finally add in c, again with carry propagation.
    low += c;
    if (low < c) high++;
    hi = high;
    lo = low;
}
3848 
signed_multiply64(std::int64_t a,std::int64_t b,std::int64_t & hi,std::uint64_t & lo)3849 inline void signed_multiply64(std::int64_t a, std::int64_t b,
3850                               std::int64_t &hi, std::uint64_t &lo)
3851 {   std::uint64_t h, l;
3852     multiply64(static_cast<std::uint64_t>(a),
3853                static_cast<std::uint64_t>(b), h,
3854                l);
3855     if (a < 0) h -= static_cast<std::uint64_t>(b);
3856     if (b < 0) h -= static_cast<std::uint64_t>(a);
3857     hi = static_cast<std::int64_t>(h);
3858     lo = l;
3859 }
3860 
signed_multiply64(std::int64_t a,std::int64_t b,std::uint64_t c,std::int64_t & hi,std::uint64_t & lo)3861 inline void signed_multiply64(std::int64_t a, std::int64_t b,
3862                               std::uint64_t c,
3863                               std::int64_t &hi, std::uint64_t &lo)
3864 {   std::uint64_t h, l;
3865     multiply64(static_cast<std::uint64_t>(a),
3866                static_cast<std::uint64_t>(b), c, h,
3867                l);
3868     if (a < 0) h -= static_cast<std::uint64_t>(b);
3869     if (b < 0) h -= static_cast<std::uint64_t>(a);
3870     hi = static_cast<std::int64_t>(h);
3871     lo = l;
3872 }
3873 
// Divide the 128-bit value {hi,lo} by divisor, giving quotient q and
// remainder r, using only 64-bit arithmetic. The precondition
// hi < divisor guarantees the divisor is nonzero and the quotient
// fits in 64 bits. The algorithm is classical long division with
// 32-bit "digits", following Hacker's Delight.
inline void divide64(std::uint64_t hi, std::uint64_t lo,
                     std::uint64_t divisor,
                     std::uint64_t &q, std::uint64_t &r)
{   arithlib_assert(divisor != 0 && hi < divisor);
    std::uint64_t u1 = hi;
    std::uint64_t u0 = lo;
    std::uint64_t c = divisor;
// See the Hacker's Delight for commentary about what follows. The associated
// web-site explains usage rights:
// "You are free to use, copy, and distribute any of the code on this web
// site (www.hackersdelight.org) , whether modified by you or not. You need
// not give attribution. This includes the algorithms (some of which appear
// in Hacker's Delight), the Hacker's Assistant, and any code submitted by
// readers. Submitters implicitly agree to this." and then "The author has
// taken care in the preparation of this material, but makes no expressed
// or implied warranty of any kind and assumes no responsibility for errors
// or omissions. No liability is assumed for incidental or consequential
// damages in connection with or arising out of the use of the information
// or programs contained herein."
// I may not be obliged to give attribution, but I view it as polite to!
// Any error that have crept in in my adaptation of the original code
// will be my fault, but you see in the BSD license at the top of this
// file that I disclaim any possible liability for consequent loss or damage.
    const std::uint64_t base = 0x100000000U; // Number base (32 bits).
    std::uint64_t un1, un0,        // Norm. dividend LSD's.
        vn1, vn0,        // Norm. divisor digits.
        q1, q0,          // Quotient digits.
        un32, un21, un10,// Dividend digit pairs.
        rhat;            // A remainder.
// I am going to shift both operands left until the divisor has its
// most significant bit set.
    int s = nlz(c);           // Shift amount for norm. 0 <= s <= 63.
    c = c << s;               // Normalize divisor.
// Now I split the divisor from a single 64-bit number into a pair
// of 32-bit values.
    vn1 = c >> 32;            // Break divisor up into
    vn0 = c & 0xFFFFFFFFU;    // two 32-bit digits.
// Shift the dividend... and split it into parts. Note the s == 0 case
// is handled specially because a shift right by 64 would be undefined.
    if (s == 0) un32 = u1;
    else un32 = (u1 << s) | (u0 >> (64 - s));
    un10 = u0 << s;           // Shift dividend left.
    un1 = un10 >> 32;         // Break right half of
    un0 = un10 & 0xFFFFFFFFU; // dividend into two digits.
// Predict a 32-bit quotient digit...
    q1 = un32/vn1;            // Compute the first
    rhat = un32 - q1*vn1;     // quotient digit, q1.
// The estimate can be slightly too large: decrement it until the
// trial remainder shows it is consistent.
again1:
    if (q1 >= base || q1*vn0 > base*rhat + un1)
    {   q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < base) goto again1;
    }
    un21 = un32*base + un1 - q1*c;  // Multiply and subtract.
    q0 = un21/vn1;            // Compute the second
    rhat = un21 - q0*vn1;     // quotient digit, q0.
// ... and the same estimate-and-correct step for the low digit.
again2:
    if (q0 >= base || q0*vn0 > base*rhat + un0)
    {   q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < base) goto again2;
    }
    q = (q1 << 32) | q0;      // assemble and return quotient & remainder
// Undo the normalizing shift to recover the true remainder.
    r = (un21*base + un0 - q0*c) >> s;
}
3938 
3939 #endif // __SIZEOF_INT128__
3940 
3941 // While my arithmetic is all done in uint64_t (and that is important so
3942 // that in C++ the consequences of overflow are defined) I need to treat
3943 // some top-digits as signed: here are values and tests relating to that.
3944 
// Constants used when the top digit of a bignum has to be viewed as
// signed: all 64 bits set, just the sign bit, and all bits below it.
INLINE_VAR const std::uint64_t allbits   =
    ~static_cast<std::uint64_t>(0);
INLINE_VAR const std::uint64_t topbit    = static_cast<std::uint64_t>
        (1)<<63;
INLINE_VAR const std::uint64_t allbuttop = topbit - 1;
3950 
// True when the digit, viewed as a signed 2s complement value, is
// non-negative, ie when its top (sign) bit is clear.
inline bool positive(std::uint64_t a)
{   return (a >> 63) == 0;
}
3954 
// True when the digit, viewed as a signed 2s complement value, is
// negative, ie when its top (sign) bit is set.
inline bool negative(std::uint64_t a)
{   return (a >> 63) != 0;
}
3958 
3959 
3960 // This next function might be naivly written as
3961 //    return ((a1==0 && positive(a2)) ||
3962 //            (a1==-1 && negative(a2)));
3963 // and it is to test if a bignum can have its top digit removed.
3964 
// Test whether a bignum whose top two digits are a1 (most significant)
// and a2 can have that top digit removed without changing its value:
// a redundant top digit is a zero above a "positive" a2 or an all-ones
// word above a "negative" a2.
inline bool shrinkable(std::uint64_t a1, std::uint64_t a2)
{   if ((a2 >> 63) == 0) return a1 == 0;
    else return a1 == ~static_cast<std::uint64_t>(0);
}
3968 
// Copy the lena digits of a into b. std::memcpy semantics apply, so
// the two regions must not overlap.
inline void internal_copy(const std::uint64_t *a, std::size_t lena,
                          std::uint64_t *b)
{   std::memcpy(b, a, lena*sizeof(std::uint64_t));
}
3973 
3974 // This internal functions sets b to be -a without altering its length.
3975 // Because length is not changed it does not need a length for the
3976 // destination passed to it.
3977 
// Set b to the 2s complement negation of the lena-digit number a,
// without altering its length (so no length needs to be returned).
// The identity used is -x = ~x + 1, applied digit by digit with the
// "+1" entering as an initial carry that is then propagated.
inline void internal_negate(const std::uint64_t *a, std::size_t lena,
                            std::uint64_t *b)
{   std::uint64_t carry = 1;
    for (std::size_t j=0; j<lena; j++)
    {   std::uint64_t digit = ~a[j] + carry;
        b[j] = digit;
        carry = digit < carry ? 1 : 0;
    }
}
3986 
3987 // When printing numbers in octal it will be handy to be able treat the
3988 // data as an array of 3-bit digits, so here is an access function that
3989 // does that. There is a messy issue about the top of a number, where it
3990 // may not be a whole number of 3-bit octal digits. I pass in v, a vector
3991 // of 64-bit values, n which is the length of that vector and i which
3992 // is the index of the octal digit that I wish to extract. To help with
3993 // that I have a function virtual_digit64() which lets me read from a
3994 // bignum as if it has been usefully sign-extended.
3995 
// Read digit j of the n-digit bignum v as if the number had been
// sign extended indefinitely: within range the stored digit is
// returned, beyond the top a negative number yields all-ones words
// and a non-negative one yields zeros.
inline std::uint64_t virtual_digit64(const std::uint64_t *v,
                                     std::size_t n,
                                     std::size_t j)
{   if (j < n) return v[j];
    return static_cast<std::int64_t>(v[n-1]) < 0 ?
           ~static_cast<std::uint64_t>(0) : 0;
}
4003 
4004 // This function reads a 3-bit digit from a bignum, and is for use when
4005 // printing in octal.
4006 
read_u3(const std::uint64_t * v,std::size_t n,std::size_t i)4007 inline int read_u3(const std::uint64_t *v, std::size_t n,
4008                    std::size_t i)
4009 {   std::size_t bits = 3*i;
4010     std::size_t n0 = bits/64;   // word with lowest bit of the 3
4011     std::size_t s0 =
4012         bits%64;   // amount to shift right to align it properly
4013     std::uint64_t w = virtual_digit64(v, n, n0) >> s0;
4014 // If I needed to shift by 62 or 63 bits then the octal digit I am interested
4015 // in needs some bits from the next word up.
4016     if (s0 >= 62) w |= (virtual_digit64(v, n, n0+1) << (64-s0));
4017     return static_cast<int>(w & 0x7);
4018 }
4019 
4020 //=========================================================================
4021 //=========================================================================
4022 // Some support for two models of memory layout. If VSL is set a number
4023 // will be represented as an intptr_t value with its low 3 bits used as
4024 // a tag. When the tag is removed and the intptr_t is cast to (uint64_t *)
4025 // it points at a block of words. The first word holds a header value
4026 // that includes (in packed form) the length of the block. Beyond that
4027 // is the row of uint64_t values making up the bignum itself.
4028 //
4029 // For more direct C++ use the type is just (uint64_t *) and it refers
4030 // directly to the row of digits of the bignum. However at the address
4031 // just ahead of that (ie at v[-1]) there is a header word giving the
4032 // length of the bignum.
4033 // Sometime soon this header word will be structured as two 32-bit
4034 // parts. One will give the number of 64-bit elements of the vector that
// are actually in use. The other will be a small integer indicating
4036 // a power of two that is the size of memory block that was allocated.
4037 // Such a scheme always rounds allocated sizes up using next_power_of_2()
4038 // and then when the actual number of digits a number occupies turns out
4039 // to be less than it might have there is no need to recycle memory - the
// "actual length" field is just updated. Furthermore a modest sized
4041 // table can keep freelists of discarded blocks on each size, so allocation
4042 // is potentially speeded up.
4043 //=========================================================================
4044 //=========================================================================
4045 
4046 
4047 // For a free-standing bignum application (including my test code for the
4048 // stuff here, bignums are represented as blocks of memory (allocated using
4049 // malloc) where the pointer that is used points to the start of the
4050 // array of bignum digits, and the word in memory before that contains
4051 // the length (in words) of the block.
4052 // Strings are returned to the user as freshly malloced memory holding a
4053 // native-style C++ string with a terminating NUL character at the end.
4054 
4055 
4056 //=========================================================================
4057 //=========================================================================
4058 // Random number support
4059 //=========================================================================
4060 //=========================================================================
4061 
4062 // Well this is the first place in this file where an issue arises that will
4063 // apply in several other places. I want some data that is static and
4064 // some that is thread_local. But I am making this a header-only library, so
4065 // it is liable to be included from several compilation units, and so if
4066 // I simply make variables
4067 //     static int something;
4068 //     thread_local int something_else;
4069 // I will suffer from multiple-definition problems. For the next few years
4070 // (the current date is May 2019) I can not use "inline int something;"
4071 // which would do the trick but which is not supported by all the slightly
4072 // older C++ implementations I want to be able to use. My resolution is
4073 // along the lines of
4074 //     inline int &something() { static int x = 17; return x; }
4075 // And then I can go for instance:
4076 //     int a = something();
4077 //     something()++;
4078 //     int &aref = something();
4079 //     ... aref .. aref ... aref;
4080 // The effect is that about the only misery is that when I refer to the value
4081 // I need to write and extra pair of parentheses. And the last fragment of
4082 // example shows how if I am going to make multiple uses of something I can
4083 // do the function call just once. That minor optimization may become
4084 // more important if I use thread_local instead of static, since it can
4085 // make the access overhead of of thread local variable arise just once
4086 // rather than potentially many times.
// There is (I am afraid) more to say, especially if the static value is
4088 // a more complex type than merely "int". The initialization of x will
4089 // happen just once. But about all you can guarantee is that it will happen
4090 // at latest when something() is first called. This could result in the code
4091 // for something() needing to incorporate a "has it been initialized" test.
4092 // That though may be extra encouragement to use the "aref" idiom.
4093 // Now if x is a class object we can also worry about when its destructor
// will be called! If x is static it gets destroyed at program termination
// with all the usual joys regarding sequencing. If it is thread_local its
// destructor is invoked as the thread terminates. Again it will be prudent
// to avoid too many assumptions about the ordering of destruction, and when
// such orderings are relied upon to include copious comments in the code
4099 // as to why everything will be OK. In particular it will be hard to have
4100 // guarantees about the order in which constructors have been called.
4101 //
4102 // Note to self: one might believe that
4103 //    class Something { static int x; ... }
4104 // was going to be helpful, because that guarantees that there will only be
4105 // a single instance of x (and if it was thread_local similarly). However
4106 // what you have there is a declation of x not a definition, and without a
4107 //    ?thread_local? int Something::x = 3;
4108 // sort of definition somewhere. But without it being "inline int.." This
4109 // would cause trouble with multiple-file scenarios. So I am not going to
4110 // be using static members of classes to sort myself out.
4111 //
4112 // It is useful to be able to generate random values. C++11 is simultaneously
4113 // very helpful and rather unhelpful. The class std::random_device is
4114 // expected to yield genuine unpredictable values, but it is not guaranteed
4115 // to and it fails to on some platforms, so despite the fact that when it
4116 // works it is a really good thing I can not rely solely on it. Each time I
4117 // use a random_device it gives me just 32 bits. For my real generator that
4118 // is not really enough.
4119 // So here I create 3 notionally unpredictable units and then merge in the
4120 // identity of the current thread and two measurements related to time.
4121 // To avoid thread safety issues with random_device I make calls to it
4122 // global, and then the thread identifier and time of day information stands
4123 // a prospect of arranging that each thread gets its own mersenne-twister
4124 // with its own seeding.
4125 // Note that Wikipedia explains "Multiple instances that differ only in
4126 // seed value (but not other parameters) are not generally appropriate
4127 // for Monte-Carlo simulations that require independent random number
4128 // generators" and here even the independence of my thread-specific
4129 // seed values is questionable.
4130 
4131 // I perform all this setup at initialization time, but by wrapping the
4132 // same sequence of steps as a critical region I could use it to re-seed
4133 // generators whenever I felt the need to.
4134 //
4135 
4136 // The code here is explicitly aware of the prospect of threading, and
4137 // should lead to different pseudo-random sequences in each thread.
4138 // If you do not have C++17 inline variables then there will be a separate
4139 // generator for each compilation unit and especially if you have an
4140 // unreliable std::random_device about the only thing liable to keep
4141 // things distinct in each will be the high resolution clock. Well to
4142 // try to improve things there I will use the address of one of these
4143 // variables as part of the seeding process, so that if they all end
// up static rather than inline that will give per-compilation-unit
4145 // variation.
4146 
4147 // Note that the thread local status information for a random number
4148 // generator will be initialized in EVERY thread that is created. This
// includes the worker threads for Karatsuba multiplication and in a
// broader context where I use this library it will include threads that
// are used for GUI or other I/O purposes. So there is a benefit if C++
4152 // delays initialization of any of the variables within the following
4153 // function until the function is first used!
4154 
// Return a reference to a std::mt19937_64 generator private to the
// calling thread, seeding it (once per thread) from a mixture of
// std::random_device output, the thread identity, two time sources
// and an address, as discussed in the long comments above.
inline std::mt19937_64 &ref_mersenne_twister()
{
// Yes the static procedure-local variables here may carry some
// overhead as the system considers whether it wants to initialize them, but
// the overall cost here is already probably high as I accumulate entropy
// and so I am not going to worry.
    static thread_local std::uint64_t threadid =
        static_cast<std::uint64_t>(std::hash<std::thread::id>()(
                                       std::this_thread::get_id()));
// Each seed component constructs its own short-lived std::random_device
// inside a once-run initializer. Previously a single random_device was
// a plain local here and so was constructed and destroyed on EVERY
// call, even though its output is only consumed during this one-time
// initialization - and on common platforms constructing a
// random_device opens an OS entropy source, which is not cheap.
    static std::uint64_t seed_component_1 =
        []() { std::random_device basic_randomness;
               return static_cast<std::uint64_t>(basic_randomness());
             }();
    static std::uint64_t seed_component_2 =
        []() { std::random_device basic_randomness;
               return static_cast<std::uint64_t>(basic_randomness());
             }();
    static std::uint64_t seed_component_3 =
        []() { std::random_device basic_randomness;
               return static_cast<std::uint64_t>(basic_randomness());
             }();
    static thread_local std::uint64_t time_now =
        static_cast<std::uint64_t>
        (std::time(NULL));
    static thread_local std::uint64_t chrono_now =
        static_cast<std::uint64_t>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
// In my first draft of this library I had made the random seed directly
// from uint64_t values. However when testing on a Raspberry Pi that
// triggered a messages about bugs in gcc before gcc7 (relating to the
// alignment of some values passed as arguments in obscure cases). Building
// the seed sequence using 32-bit values avoids that issue, and since this
// is only done during initialization it is not time-critical.
//
    static thread_local std::seed_seq random_seed
    {   static_cast<std::uint32_t>(threadid),
        static_cast<std::uint32_t>(seed_component_1),
        static_cast<std::uint32_t>(seed_component_2),
        static_cast<std::uint32_t>(seed_component_3),
        static_cast<std::uint32_t>(time_now),
        static_cast<std::uint32_t>(chrono_now),
        static_cast<std::uint32_t>(threadid>>32),
        static_cast<std::uint32_t>(seed_component_1>>32),
        static_cast<std::uint32_t>(seed_component_2>>32),
        static_cast<std::uint32_t>(seed_component_3>>32),
        static_cast<std::uint32_t>(time_now>>32),
        static_cast<std::uint32_t>(chrono_now>>32),
        static_cast<std::uint32_t>(
            reinterpret_cast<std::uintptr_t>(&seed_component_1)),
        static_cast<std::uint32_t>(
            static_cast<std::uint64_t>(
                reinterpret_cast<std::uintptr_t>(&seed_component_1))>>32)
    };

    static thread_local std::mt19937_64 inner_mersenne_twister(
        random_seed);
// mersenne_twister() now generates 64-bit unsigned integers.
    return inner_mersenne_twister;
}
4208 
// If you are going to use very many random numbers it may be a good
// idea to call ref_mersenne_twister() once and keep hold of the thread
// local instance relevant to you [note that it is a class object that
// provides operator(), not really a function, despite appearances!].
// That way you only do the thread_local activity once (I hope).
4215 
mersenne_twister()4216 MAYBE_UNUSED static std::uint64_t mersenne_twister()
4217 {   return ref_mersenne_twister()();
4218 }
4219 
4220 // To re-seed I can just call this. I think that when I re-seed it will be
4221 // to gain more repeatable behaviour, and so I am fairly happy about
4222 // limiting the amount of input entropy here to 64-bits. If I was keen I
4223 // could provide a reseed-method taking a bignum argument that could have
4224 // lots of data in it. Note that this will reseed the random number
4225 // generator associated with the thread it is called from. Note that there
4226 // is one generator per thread, so if you have multiple threads and you reseed
4227 // you are liable to want to reseed each of them.
4228 //
4229 
reseed(std::uint64_t n)4230 MAYBE_UNUSED static void reseed(std::uint64_t n)
4231 {   ref_mersenne_twister().seed(n);
4232 }
4233 
4234 // Now a number of functions for setting up random bignums. These may be
4235 // useful for users, but they will also be very useful while testing this
4236 // code.
4237 
4238 // Return a random integer in the range 0 ... n-1.
4239 // Given that the largest n that can be passed is UINT64_MAX the biggest
// range that can be generated here is 1 less than the full range of 64-bit
4241 // values. To get a full 64-bit range merely call mersenne_twister()
4242 // directly.
4243 
uniform_uint64(std::uint64_t n)4244 inline std::uint64_t uniform_uint64(std::uint64_t n)
4245 {   if (n <= 1) return 0;
4246 // I I want the remainder operation on the last line of this function to
4247 // return a uniformly distributed result. To ensure that I want r to be
4248 // drawn uniformly from a range that is a multiple of n.
4249     std::uint64_t q = UINT64_MAX/n;
4250     std::uint64_t w = n*q;
4251     std::uint64_t r;
4252 // In the worst case here n was just over UINT64_MAX/2 and q came out
4253 // as 1. In that case on average I will need to call mersenne_twister
4254 // twice. Either larger or smaller inputs will behave better, and rather
4255 // small inputs will mean I hardly ever need to re-try.
4256     std::mt19937_64 &mt = ref_mersenne_twister();
4257     do
4258     {   r = mt();
4259     }
4260     while (r >= w);
4261     return r%n;
4262 }
4263 
4264 // A uniform distribution across the range [0 .. (2^bits)-1], ie
4265 // a bignum using (up to) the given number of bits. So eg uniform_positive(3)
4266 // should return 0,1,2,3,4,5,6 or 7 each with equal probability.
4267 
uniform_positive(std::uint64_t * r,std::size_t & lenr,std::size_t bits)4268 inline void uniform_positive(std::uint64_t *r, std::size_t &lenr,
4269                              std::size_t bits)
4270 {   if (bits == 0)
4271     {   r[0] = 0;
4272         lenr = 1;
4273     }
4274     lenr = (bits+63)/64;
4275 // ref_mersenne_twister returns a reference to a thread_local entity and
4276 // I hope that my cacheing its value here I reduce thread local access
4277 // overheads.
4278     std::mt19937_64 &mt = ref_mersenne_twister();
4279     for (std::size_t i=0; i<lenr; i++)
4280         r[i] = mt();
4281     if (bits%64 == 0) r[lenr-1] = 0;
4282     else r[lenr-1] &= UINT64_C(0xffffffffffffffff) >> (64-bits%64);
4283     while (lenr!=1 && shrinkable(r[lenr-1], r[lenr-2])) lenr--;
4284 }
4285 
uniform_positive(std::size_t n)4286 inline std::intptr_t uniform_positive(std::size_t n)
4287 {   std::size_t lenr = (n + 63)/64;
4288     if (lenr == 0) lenr = 1; // special case!
4289     std::size_t save = lenr;
4290     std::uint64_t *r = reserve(lenr);
4291     uniform_positive(r, lenr, n);
4292     return confirm_size(r, save, lenr);
4293 }
4294 
4295 // As above but returning a value that may be negative. uniform_signed(3)
4296 // could return -8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6 or 7.
4297 // Note that while uniform_unsigned(0) can only return the value 0,
4298 // uniform_signed(0) can return -1 or 0.
4299 
uniform_signed(std::uint64_t * r,std::size_t & lenr,std::size_t bits)4300 inline void uniform_signed(std::uint64_t *r, std::size_t &lenr,
4301                            std::size_t bits)
4302 {   lenr = 1 + bits/64;
4303     std::mt19937_64 &mt = ref_mersenne_twister();
4304     for (std::size_t i=0; i<lenr; i++)
4305         r[i] = mt();
4306 // Now if the "extra" bit is zero my number will end up positive.
4307     if ((r[lenr-1] & (UINT64_C(1) << (bits%64))) == 0)
4308     {   r[lenr-1] &= UINT64_C(0xffffffffffffffff) >> (63-bits%64);
4309         while (lenr!=1 && r[lenr-1] == 0) lenr--;
4310     }
4311 // Otherwise the result will end up negative.
4312     else
4313     {   r[lenr-1] |= UINT64_C(0xffffffffffffffff) << (bits%64);
4314         while (lenr!=1 && r[lenr-1] == UINT64_C(0xffffffffffffffff)) lenr--;
4315     }
4316 }
4317 
uniform_signed(std::size_t n)4318 inline std::intptr_t uniform_signed(std::size_t n)
4319 {   std::size_t lenr = n/64+1;
4320     std::size_t save = lenr;
4321     std::uint64_t *r = reserve(lenr);
4322     uniform_signed(r, lenr, n);
4323     return confirm_size(r, save, lenr);
4324 }
4325 
4326 inline std::size_t bignum_bits(const std::uint64_t *a,
4327                                std::size_t lena);
4328 
// Generate a value in the range 0 .. a-1 using a uniform distribution
4330 
// Set r/lenr to a value uniform on [0, a-1] where a is a positive bignum
// of lena digits. r must have space for as many digits as a uses.
inline void uniform_upto(std::uint64_t *a, std::size_t lena,
                         std::uint64_t *r,
                         std::size_t &lenr)
{   std::size_t n = bignum_bits(a, lena);
// I will repeatedly generate numbers that have as many bits as a until
// I get one that has a value less than a has. On average that should only
// take two tries.
    for (;;)
    {   push(a);
        uniform_positive(r, lenr, n);
        pop(a);
// Fewer digits certainly means a smaller (positive) value, so accept.
// (lenr cannot exceed lena here since r uses at most n = bits(a) bits.)
        if (lena > lenr) return;
// Same digit count: compare digit by digit from the most significant end.
        for (std::size_t len=lena;;)
        {   len--;
            if (a[len] > r[len]) return;   // r < a: accept it.
            if (a[len] < r[len] || len == 0) break; // r >= a: retry.
        }
    }
}
4350 
uniform_upto(std::intptr_t aa)4351 inline std::intptr_t uniform_upto(std::intptr_t aa)
4352 {   if (stored_as_fixnum(aa))
4353     {   std::uint64_t r = uniform_uint64(static_cast<std::uint64_t>
4354                                          (int_of_handle(
4355                                               aa)));
4356         return int_to_handle(r);
4357     }
4358     std::uint64_t *a = vector_of_handle(aa);
4359     std::size_t lena = number_size(a);
4360     push(a);
4361     std::uint64_t *r = reserve(lena);
4362     pop(a);
4363     std::size_t lenr;
4364     uniform_upto(a, lena, r, lenr);
4365     return confirm_size(r, lena, lenr);
4366 }
4367 
truncate_positive(const std::uint64_t * r,std::size_t & n)4368 inline void truncate_positive(const std::uint64_t *r, std::size_t &n)
4369 {   while (r[n-1]==0 && n>1 && positive(r[n-2])) n--;
4370 }
4371 
truncate_negative(const std::uint64_t * r,std::size_t & n)4372 inline void truncate_negative(const std::uint64_t *r, std::size_t &n)
4373 {   while (r[n-1]==allbits && n>1 && negative(r[n-2])) n--;
4374 }
4375 
// Discard leading zero digits of an unsigned digit vector, always
// keeping at least one digit.
inline void truncate_unsigned(const std::uint64_t *r, std::size_t &n)
{   for (; n>1 && r[n-1]==0; n--)
    {}
}
4379 
4380 // The following is a rather strange function. It looks at the 4 bit number n.
4381 // It then processes its input a in accordance with the following table, where
4382 // A is the (positive) input value and X is A rounded down to the nearest
4383 // power of 2 less than it (ie keeping just the top bit of A):
4384 //
4385 //    0   X-1                     8   -(X-1)
4386 //    1   X                       9   -X
4387 //    2   X+1                    10   -(X+1)
4388 //    3   A                      11   -A
4389 //    4   A                      12   -A
4390 //    5   A                      13   -A
4391 //    6   A                      14   -A
4392 //    7   A                      15   -A
4393 
4394 // The idea behind this is that the input A will be a random value from a
4395 // reasonably smooth distribution, and n will be a random 4 bit value. The
4396 // output will still be random, but now half the time it will be negative.
4397 // And a significant proportion of the time it will be a power of 2 (or one
4398 // either side of being a power of 2). This last is something I want because
4399 // with an internal representation that is based on 2s complement values
4400 // close to powers of 2 can easily be "edge cases" that deserve extra attention
4401 // during testing.
4402 
// Apply the transformation tabulated above: the low 3 bits of n select
// X-1, X, X+1 (where X keeps just the top bit of the positive input a)
// or a itself; bit 3 of n then negates the result. r must provide
// space for lena+1 digits.
inline void fudge_distribution(const std::uint64_t *a,
                               std::size_t lena,
                               std::uint64_t *r, std::size_t &lenr, int n)
{   lenr = lena;
    switch (n&7)
    {   case 0:
        case 1:
        case 2:
// Build X = the top bit of a, clearing all lena+1 digits of r first.
            for (std::size_t i=0; i<lena+1; i++) r[i] = 0;
            if (a[lena-1] == 0)
            {   // Top digit zero means it was just a sign word: the top
                // set bit must then be bit 63 of the digit below.
                if (lena>1) r[lena-2] = static_cast<std::uint64_t>(1)<<63;
            }
            else r[lena-1] = static_cast<std::uint64_t>(1) << (63-nlz(a[lena-1]));
            if ((n&7) == 0) // decrement it
            {   if (lena!=1 || a[0]!=0) // avoid decrementing zero.
                {   // Propagate the borrow up through any zero digits.
                    std::uint64_t *p = r;
                    while (*p == 0) *p++ = static_cast<std::uint64_t>(-1);
                    (*p)--;
                }
            }
            else if ((n&7) == 2) // increment it
            {   // Propagate the carry up through any all-ones digits.
                std::uint64_t *p = r;
                while (*p == static_cast<std::uint64_t>(-1)) *p++ = 0;
                (*p)++;
            }
            break;
        default:
// Cases 3..7: pass a through unchanged.
            for (std::size_t i=0; i<lena; i++) r[i] = a[i];
            break;
    }
    if ((n&8) != 0)
    {   // Negate: 2s complement is bitwise-not plus one, done digit by
        // digit over all lena+1 digits.
        std::uint64_t carry = 1;
        for (std::size_t i=0; i<lena+1; i++)
        {   carry = add_with_carry(~r[i], carry, r[i]);
        }
        truncate_negative(r, lenr);
    }
    else truncate_positive(r, lenr);
}
4442 
fudge_distribution(std::intptr_t aa,int n)4443 inline std::intptr_t fudge_distribution(std::intptr_t aa, int n)
4444 {   std::uint64_t *a;
4445     std::size_t lena;
4446     std::uint64_t w[2];
4447     if (stored_as_fixnum(aa))
4448     {   w[1] = static_cast<std::uint64_t>(int_of_handle(aa));
4449         lena = 1;
4450         a = &w[1];
4451     }
4452     else
4453     {   a = vector_of_handle(aa);
4454         lena = number_size(a);
4455     }
4456     push(a);
4457     std::uint64_t *r = reserve(lena+1);
4458     pop(a);
4459     std::size_t lenr;
4460     fudge_distribution(a, lena, r, lenr, n);
4461     return confirm_size(r, lena+1, lenr);
4462 }
4463 
// Generate a value in the range 0 .. 2^bits-1 using a distribution such
// that numbers of each bit-length are equally probable. This works by
// selecting a bit-length uniformly and then creating a number uniformly
// distributed across all those with that exact bit-width. This is perhaps
// not a very nice distribution from a mathematical perspective, but it is
// nevertheless a useful one to have in some test code.
4470 
random_upto_bits(std::uint64_t * r,std::size_t & lenr,std::size_t n)4471 inline void random_upto_bits(std::uint64_t *r, std::size_t &lenr,
4472                              std::size_t n)
4473 {   std::size_t bits = static_cast<std::size_t>(uniform_uint64(n));
4474     if (bits == 0)
4475     {   r[0] = 0;
4476         lenr = 1;
4477         return;
4478     }
4479 // The number will have from 1 to 64 bits in its top digit.
4480     lenr = (bits+63)/64;
4481     std::mt19937_64 &mt = ref_mersenne_twister();
4482     for (std::size_t i=0; i<lenr; i++)
4483         r[i] = mt();
4484     if (n%64 != 0)
4485         r[lenr-1] &= UINT64_C(0xffffffffffffffff) >> (64-bits%64);
4486     r[lenr-1] |= UINT64_C(1) << ((bits-1)%64);
4487     if (bits%64 == 0) r[lenr++] = 0;
4488     arithlib_assert(!negative(r[lenr-1]));
4489 }
4490 
random_upto_bits(std::size_t bits)4491 inline std::intptr_t random_upto_bits(std::size_t bits)
4492 {   std::size_t m = 1+bits/64;
4493     if (m == 0) m = 1;
4494     std::uint64_t *r = reserve(m);
4495     std::size_t lenr;
4496     random_upto_bits(r, lenr, bits);
4497     return confirm_size(r, m, lenr);
4498 }
4499 
4500 //=========================================================================
4501 //=========================================================================
4502 // Here I have a few tiny conversion functions followed by code for
4503 // conversion between big numbers and strings. All of these are rather
4504 // important for getting data in and out of the big number format and so
4505 // deserve to be shown early.
4506 //=========================================================================
4507 //=========================================================================
4508 
4509 
4510 // Convert a 64-bit integer to a bignum.
4511 // This can be useful when there is no special-purpose code to
4512 // perform arithmetic between a bignum and a native int64_t integer
4513 // directly.
4514 
// Store the 2s complement image of n as a one-digit bignum.
inline void int_to_bignum(std::int64_t n, std::uint64_t *r)
{   *r = static_cast<std::uint64_t>(n);
}
4518 
int_to_bignum(std::int64_t n)4519 inline std::intptr_t int_to_bignum(std::int64_t n)
4520 {   if (fits_into_fixnum(n)) return int_to_handle(n);
4521     std::uint64_t *r = reserve(1);
4522     int_to_bignum(n, r);
4523     return confirm_size(r, 1, 1);
4524 }
4525 
unsigned_int_to_bignum(std::uint64_t n,std::uint64_t * r,std::size_t & lenr)4526 inline void unsigned_int_to_bignum(std::uint64_t n, std::uint64_t *r,
4527                                    std::size_t &lenr)
4528 {   r[0] = n;
4529     if (negative(n))
4530     {   r[1] = 0;
4531         lenr = 2;
4532     }
4533     else lenr = 1;
4534 }
4535 
unsigned_int_to_bignum(std::uint64_t n)4536 inline std::intptr_t unsigned_int_to_bignum(std::uint64_t n)
4537 {   std::size_t w = (negative(n) ? 2 : 1);
4538     std::uint64_t *r = reserve(w);
4539     std::size_t lenr;
4540     unsigned_int_to_bignum(n, r, lenr);
4541     return confirm_size(r, w, lenr);
4542 }
4543 
4544 #ifdef softfloat_h
4545 // Some constants that are useful when I am dealing with float128_t.
4546 
// IEEE binary128 bit patterns for 0.0, 0.5, -0.5, 1.0, -1.0 and 2^4096
// (the last used to scale values into or out of the subnormal range).
// The two branches differ only in which half of the two-word v[] array
// holds the sign/exponent word.
#ifdef LITTLEENDIAN
INLINE_VAR float128_t
f128_0      = {{0, INT64_C(0x0000000000000000)}},
f128_half   = {{0, INT64_C(0x3ffe000000000000)}},
f128_mhalf  = {{0, INT64_C(0xbffe000000000000)}},
f128_1      = {{0, INT64_C(0x3fff000000000000)}},
f128_m1     = {{0, INT64_C(0xbfff000000000000)}},
f128_N1     = {{0, INT64_C(0x4fff000000000000)}}; // 2^4096
#else // !LITTLEENDIAN
INLINE_VAR float128_t
f128_0      = {{INT64_C(0x0000000000000000), 0}},
f128_half   = {{INT64_C(0x3ffe000000000000), 0}},
f128_mhalf  = {{INT64_C(0xbffe000000000000), 0}},
f128_1      = {{INT64_C(0x3fff000000000000), 0}},
f128_m1     = {{INT64_C(0xbfff000000000000), 0}},
f128_N1     = {{INT64_C(0x4fff000000000000), 0}};
#endif // !LITTLEENDIAN
4564 
4565 // The following tests are not supported by the version of softfloat that
4566 // I am using, so I implement them myself.
4567 
f128_zero(float128_t p)4568 inline bool f128_zero(float128_t p)
4569 {   return (p.v[HIPART] & 0x7fffffffffffffff) == 0 &&
4570            p.v[LOPART] == 0;
4571 }
4572 
f128_infinite(float128_t p)4573 inline bool f128_infinite(float128_t p)
4574 {   return (p.v[HIPART] & 0x7fffffffffffffff) == 0x7fff000000000000 &&
4575            p.v[LOPART] == 0;
4576 }
4577 
f128_nan(float128_t p)4578 inline bool f128_nan(float128_t p)
4579 {   return (p.v[HIPART] & 0x7fff000000000000) == 0x7fff000000000000 &&
4580            ((p.v[HIPART] & 0x0000ffffffffffff) != 0 ||
4581             p.v[LOPART] != 0);
4582 }
4583 
// Scale p by 2^x, working directly on the binary128 bit pattern.
// Overflow yields a correctly signed infinity, severe underflow a
// correctly signed zero, and mild underflow goes through a division so
// that gradual underflow to subnormals is handled by the softfloat
// library itself.
inline float128_t ldexp(float128_t p, int x)
{   if (f128_zero(p) ||
        f128_infinite(p) ||
        f128_nan(p)) return p;  // special cases!
// Calculate the value I expect to want to leave in the exponent field.
    x = ((p.v[HIPART] >> 48) & 0x7fff) + x;
// In case of overflow leave an infinity of the right sign. This involves
// forcing all bits of the exponent to be 1, all bits of the mantissa to be
// zero and leaving the sign bit unaltered.
    if (x >= 0x7fff)
    {   p.v[HIPART] |= INT64_C(0x7fff000000000000);
        p.v[HIPART] &= INT64_C(0xffff000000000000);
        p.v[LOPART] = 0;
        return p;
    }
// Using ldexp() to decrease an exponent can lead to underflow. The value
// 0 in x here would be the exponent one below that of the smallest
// normal number, so a value < -114 corresponds to a number so much smaller
// that it would not even qualify as a sub-norm. But even in that case
// I need to preserve the sign bit.
    else if (x < -114)
    {   p.v[HIPART] &= INT64_C(
                           0x8000000000000000); // preserve sign of input
        p.v[LOPART] = 0;
        return p;
    }
// In the case that ldexp underflows I have to be especially careful
// because of the joys of sub-normal numbers and gradual underflow.
// I deal with this by first forcing the exponent to be one that will
// not lead to a sub-norm and then using a multiply to scale it down.
    if (x <= 0)
    {   p.v[HIPART] = (p.v[HIPART] & INT64_C(0x8000ffffffffffff)) |
                      (static_cast<std::uint64_t>(x+4096) << 48);
        p = f128_div(p, f128_N1);
    }
    else p.v[HIPART] = (p.v[HIPART] & INT64_C(0x8000ffffffffffff)) |
                           (static_cast<std::uint64_t>(x) << 48);
    return p;
}
4623 
// Decompose p into a fraction with magnitude in [0.5, 1.0) and an
// integer exponent x such that fraction*2^x == p. Zeros, infinities and
// NaNs are returned unchanged with x set to 0.
inline float128_t frexp(float128_t p, int &x)
{   if (f128_zero(p) ||
        f128_infinite(p) ||
        f128_nan(p))
    {   x = 0;
        return p;
    }
    int px = ((p.v[HIPART] >> 48) & 0x7fff);
// If I had a sub-normal number I will multiply if by 2^4096 before
// extracting its exponent. Doing that will have turned any non-zero
// sub-norm into a legitimate normalized number while not getting large
// enough to risk overflow...
    if (px == 0)
    {   p = f128_mul(p, f128_N1);
        px = ((p.v[HIPART] >> 48) & 0x7fff) - 4096;
    }
// Now I can set the exponent field such that the resulting number is in
// the range 0.5 <= p < 1.0.
    p.v[HIPART] = (p.v[HIPART] & INT64_C(0x8000ffffffffffff)) |
                  (static_cast<std::uint64_t>(0x3ffe) << 48);
// .. and adjust the exponent value that I will return so it is if the
// scaled mantissa is now exactly the same as the input.
    x = px - 0x3ffe;
    return p;
}
4649 
4650 // return fractional part and set i to integer part. Since this is in C++
4651 // I can use a reference argument for i now a pointer and I can overload the
4652 // vanilla name "modf" along the style of the way C++11 does.
4653 
// Split d into integer part (stored in i) and fractional part (the
// returned value), truncating towards zero, by masking away mantissa
// bits below the binary point.
inline float128_t modf(float128_t d, float128_t &i)
{   i = d;
// Extract the exponent
    int x = ((d.v[HIPART] >> 48) & 0x7fff) - 0x3ffe;
// If |d| < 1.0 then the integer part is zero.
    if (x <= 0) i = f128_0;
// Next look at cases where the integer part will lie entirely within
// the high word.
    else if (x <= 49)   // 49 not 48 because of hidden bit.
    {   i.v[HIPART] &=
            ASR(static_cast<std::int64_t>(0xffff000000000000), x-1);
        i.v[LOPART] = 0;
    }
// Here the whole of the high word is integer part and only the low word
// needs its fractional bits (the low 113-x of them) cleared.
    else if (x <= 112)
    {   i.v[LOPART] &= (-static_cast<std::uint64_t>(1)) << (113-x);
    }
// If the number is large enough then then it is its own integer part, and
// the fractional part will be zero.
    else return f128_0;
// The fractional part is recovered by subtraction, which is exact here.
    return f128_sub(d, i);
}
4675 
4676 #endif // softfloat_h
4677 
4678 // When doubles (and float128_t values where available) are to be
4679 // compared against a bignum to get proper results the double should
4680 // (at least in effect) be converted to a bignum. If one does the comparison
4681 // by converting both inputs to floating point (which may feel easier) there
4682 // are multiple problems. First the bignum might have a value outside
4683 // the range of floats, so you get overflow. Then it might differ from
// a float in a bit position several hundred below its most significant
4685 // one, and converting to a float would lose that information.
4686 
4687 // double_to_bits() turns a floating point value into an integer plus
4688 // an exponent. It sets mantissa*2^exponent = d. This function will not
4689 // give sensible output if passed an infinity or a NaN and so they should be
4690 // filtered out before it is called.
4691 
// Decompose d so that mantissa*2^exponent == d exactly, with mantissa an
// integer of at most 53 significant bits. Infinities and NaNs must be
// filtered out before calling this.
inline void double_to_bits(double d, std::int64_t &mantissa,
                           int &exponent)
{   if (d == 0.0)
    {   mantissa = 0;
        exponent = 0;
        return;
    }
    int e;
// frexp leaves |frac| in [0.5, 1.0) with d == frac*2^e.
    double frac = std::frexp(d, &e);
// Scaling by 2^53 makes frac an integer; the conversion is exact.
    mantissa = static_cast<std::int64_t>(frac*9007199254740992.0); // 2^53
    exponent = e - 53;
}
4709 
4710 // There are places where I need to shift a 128 or 192-bit number that is
4711 // represented using several int64 values...
4712 
shiftleft(std::int64_t & hi,std::uint64_t & lo,int n)4713 inline void shiftleft(std::int64_t &hi, std::uint64_t &lo, int n)
4714 {   if (n == 0) return;
4715     else if (n < 64)
4716     {   hi = ASL(hi, n) | (lo >> (64-n));
4717         lo = lo << n;
4718     }
4719     else if (n == 64)
4720     {   hi = lo;
4721         lo = 0;
4722     }
4723     else
4724     {   hi = lo << (n-64);
4725         lo = 0;
4726     }
4727 }
4728 
// Shift the 192-bit value (hi:mid:lo) left by n places, 0 <= n < 192,
// with hi treated as the signed top word. Each case moves whole words
// and then the residual n%64 bits across the word boundaries.
inline void shiftleft(std::int64_t &hi, std::uint64_t &mid,
                      std::uint64_t &lo,
                      int n)
{   if (n == 0) return;
    else if (n < 64)
    {   hi = ASL(hi, n) | (mid >> (64-n));
        mid = (mid << n) | (lo >> (64-n));
        lo = lo << n;
    }
    else if (n == 64)
    {   hi = mid;
        mid = lo;
        lo = 0;
    }
    else if (n < 128)
    {   hi = (mid << (n-64)) | (lo >> (128-n));
        mid = lo << (n-64);
        lo = 0;
    }
    else if (n == 128)
    {   hi = lo;
        mid = lo = 0;
    }
    else
    {   hi = lo << (n-128);
        mid = lo = 0;
    }
}
4757 
// Arithmetic (sign-propagating) right shift of the 128-bit signed value
// (hi:lo) by n places, 0 <= n < 128.
inline void shiftright(std::int64_t &hi, std::uint64_t &lo, int n)
{   if (n == 0) return;
    else if (n < 64)
    {   lo = (lo >> n) | ASL(hi, 64-n);
        hi = ASR(hi, n);
    }
    else if (n == 64)
    {   lo = hi;
        hi = hi<0 ? -1 : 0;   // the top word becomes pure sign extension
    }
    else
    {   lo = ASR(hi, n-64);
        hi = hi<0 ? -1 : 0;
    }
}
4773 
4774 // This next sets top and next to the two top 64-bit digits for a bignum,
4775 // and len to the length (measured in words) of that bignum. For values
4776 // |d| < 2^63 next will in fact be a signed value, len==1 and top will
4777 // in fact be irrelevant. This should be seen as a special degenerate case.
4778 // Whenever len>1 on output the number should be such that to make a bignum
4779 // with value to match the float you append len-2 zero words. Note that
4780 // for inputs in 2^63 <= d < 2^64 the result will have top==0 and next
4781 // the integer value of d and len==2, with something similar for the
4782 // equivalent negative range. The leading 0 or -1 is required in those
4783 // cases. The result will be be any fractional part left over when d is
4784 // converted to an integer, and this can only be nonzero is cases where
4785 // |d| < 2^53.
4786 //
4787 // In the case that the floating point input is small its value may lie
4788 // between two integers, and in that case I might want to adjust it in the
4789 // sense of ROUND, TRUNC, FLOOR or CEILING. I will pass an extra argument
4790 // to explain which I require.
4791 
// How a fractional value maps onto an integer: ROUND = to nearest,
// TRUNC = towards zero, FLOOR = towards -infinity, CEILING = towards
// +infinity.
enum RoundingMode {ROUND, TRUNC, FLOOR, CEILING};
4793 
// Convert d (rounded per mode) into the top two digits (top, next) and
// total digit count len of the bignum it would become - see the long
// comment above for the exact conventions, including the degenerate
// len==1 case and the len==0 (NaN) and len==SIZE_MAX (infinity) markers.
inline void double_to_virtual_bignum(double d,
                                     std::int64_t &top, std::uint64_t &next, std::size_t &len,
                                     RoundingMode mode)
{   if (d == 0.0)
    {   top = next = 0;
        len = 1;
        return;
    }
// NaN leads to a returned value with zero length. Having a zero length
// for a bignum is invalid, so this marks the situation well.
    else if (std::isnan(d))
    {   top = next = 0;
        len = 0;
        return;
    }
// Infinities turn into values with maximum length and a top digit that
// captures the sign of the input.
    if (std::isinf(d))
    {   top = d < 0.0 ? -1 : 1;
        next = 0;
        len = SIZE_MAX;
        return;
    }
// From here down I do not need to worry about zero, infinity or NaNs. But
// I may need to think about rounding!
    double intpart;
    double fracpart = std::modf(d, &intpart);
    std::int64_t mantissa;
    int exponent;
    double_to_bits(intpart, mantissa, exponent);
// Now I know intpart(d) = mantissa*2^exponent and mantissa is an integer.
// Strip trailing zero bits from the mantissa (the & with the negation
// isolates the lowest set bit; this works on the 2s complement image
// even for negative mantissas).
    std::uint64_t lowbit = mantissa & -static_cast<std::uint64_t>
                           (mantissa);
    int lz = 63 - nlz(lowbit); // low zero bits
    mantissa = ASR(mantissa, lz);
    exponent += lz;
// Now mantissa has its least significant bit a "1". At this stage the
// input 1.0 (eg) should have turned into mantissa=1, exponent==0. And 1.5
// should have become mantissa=1, exponent=0 and fracpart = 0.5. fracpart has
// the same sign as the original input.
// So now I can apply my rounding mode...
    switch (mode)
    {   case ROUND:
            if (fracpart >= 0.5) mantissa++;
            else if (fracpart <= -0.5) mantissa--;
            break;
        case TRUNC:  // the effect of modf is this already.
            break;
        case FLOOR:
            if (fracpart != 0.0 && d < 0.0) mantissa--;
            break;
        case CEILING:
            if (fracpart != 0.0 && d > 0.0) mantissa++;
            break;
    }
    next = static_cast<std::uint64_t>(mantissa);
// A nonzero negative value needs a -1 sign digit above it.
    top = d<0.0 && mantissa!=0 ? -1 : 0;
    if (exponent < 0)
    {   top = 0;
        len = 1;
    }
    else
    {   len = 2 + exponent/64;
        exponent = exponent%64;
// Now shift left by exponent, which is less than 64 here.
        shiftleft(top, next, exponent);
// In some cases this has still left all the bits I care about in next,
// with top not needed, so in such cases I will shrink by a word.
        if (shrinkable(top, next))
        {   top = next;
            next = 0;
            len--;
        }
    }
}
4869 
4870 #ifdef softfloat_h
4871 
4872 // For int128_t the mantissa needs to be returned as a 128-bit integer, and
4873 // I do that as a pair of 64-bit integers here. Infinities and NaNs would
4874 // lead to nonsense output. Subnormal numbers are got wrong at present!
4875 
// Decompose d so that (mhi:mlo)*2^exponent == d with the 113-bit
// mantissa (hidden bit restored) held as a signed 128-bit pair.
// Zeros and NaNs return a zero mantissa; infinities return
// exponent==INT_MAX with a sign-marking mantissa.
inline void float128_to_bits(float128_t d,
                             std::int64_t &mhi, std::uint64_t &mlo, int &exponent)
{   if (f128_nan(d) || f128_zero(d))
    {   mhi = mlo = 0;
        exponent = 0;
        return;
    }
    else if (f128_infinite(d))
    {   if (f128_lt(d, f128_0)) mhi = mlo = -1;
        else mhi = mlo = 0;
        exponent = INT_MAX;
        return;
    }
// With float128_t the easier way to go is to access the bit-patterns.
    exponent = ((d.v[HIPART] >> 48) & 0x7fff);
    if (exponent == 0) // subnormal number
    {   // NOTE(review): this subtracts 4096 from the original zero
        // exponent rather than re-reading the exponent of the scaled d,
        // consistent with the "subnormals got wrong" caveat above.
        d = f128_mul(d, f128_N1);
        exponent -= 4096;
    }
    exponent -= 0x3ffe;
// Keep the 48 explicit high mantissa bits and restore the hidden bit.
    mhi = (d.v[HIPART] & 0xffffffffffff) | 0x0001000000000000;;
    mlo = d.v[LOPART];
// For negative inputs, negate the 128-bit mantissa (2s complement).
    if (static_cast<std::int64_t>(d.v[HIPART]) < 0)
    {   mlo = -mlo;
        if (mlo == 0) mhi = -mhi;
        else mhi = ~mhi;
    }
// Rebias so the mantissa is an integer rather than a fraction.
    exponent -= 113;
}
4905 
// 128-bit increment: bump the low word, carrying into hi on wrap to 0.
inline void inc128(std::int64_t &hi, std::uint64_t &lo)
{   lo++;
    if (lo == 0) hi++;
}
4909 
// 128-bit decrement: borrow from hi when the low word wraps past zero.
inline void dec128(std::int64_t &hi, std::uint64_t &lo)
{   if (lo == 0) hi--;
    lo--;
}
4913 
4914 // For a float128_t value I need to generate (up to) 3 64-bit digits for
4915 // the way it would end up as a bignum.
4916 
// Decompose the float128_t d into up to three 64-bit bignum digits
// (top, mid, next) plus a word count len, exactly as the value would
// appear as a 2s-complement bignum after applying the requested
// rounding mode. Special cases are signalled through len: zero gives
// len=1 (all digits 0), NaN gives len=0, and +/- infinity gives
// len=SIZE_MAX with digits all 0 / all -1 respectively.
inline void float128_to_virtual_bignum(float128_t d,
                                       std::int64_t &top, std::uint64_t &mid, std::uint64_t &next,
                                       std::size_t &len,
                                       RoundingMode mode)
{   if (f128_zero(d))
    {   top = mid = next = 0;
        len = 1;
        return;
    }
    else if (f128_nan(d))
    {   top = mid = next = 0;
        len = 0;
        return;
    }
    else if (f128_infinite(d))
    {   if (f128_lt(d, f128_0)) top = mid = next = -1;
        else top = mid = next = 0;
        len = SIZE_MAX;
        return;
    }
// Split into integer and fractional parts: the integer part supplies the
// digits and the fraction drives the rounding decision below.
    float128_t intpart;
    float128_t fracpart = modf(d, intpart);
    std::int64_t mhi;
    std::uint64_t mlo;
    int exponent;
    float128_to_bits(intpart, mhi, mlo, exponent);
// Now I know intpart(d) = mantissa*2^exponent and mantissa is an integer.
// Normalise so the mantissa is odd, folding its trailing zero bits into
// the exponent. (x & -x) isolates the lowest set bit.
    int lz;
    if (mlo != 0)
    {   std::uint64_t lowbit = mlo & (-mlo);
        lz = 63 - nlz(lowbit); // low zero bits
    }
    else
    {   std::uint64_t lowbit = mhi & (-static_cast<std::uint64_t>(mhi));
        lz = 64 + 63 - nlz(lowbit); // low zero bits
    }
    shiftright(mhi, mlo, lz);
    exponent += lz;
// Now mantissa has its least significant bit a "1". At this stage the
// input 1.0 (eg) should have turned into mantissa=1, exponent==0. And 1.5
// should have become mantissa=1, exponent=0 and fracpart = 0.5. fracpart has
// the same sign as the original input.
// So now I can apply my rounding mode...
    switch (mode)
    {   case ROUND:
            // halves round away from zero: fracpart >= 0.5 bumps the
            // mantissa up, fracpart <= -0.5 bumps it down.
            if (!f128_lt(fracpart, f128_half)) inc128(mhi, mlo);
            else if (f128_le(fracpart, f128_mhalf)) dec128(mhi, mlo);
            break;
        case TRUNC:  // the effect of modf is this already.
            break;
        case FLOOR:
            // any fraction on a negative value moves it towards -infinity
            if (!f128_zero(fracpart) && f128_lt(d, f128_0)) dec128(mhi, mlo);
            break;
        case CEILING:
            // any fraction on a non-negative value moves it towards +infinity
            if (!f128_zero(fracpart) && !f128_lt(d, f128_0)) inc128(mhi, mlo);
            break;
    }
// Now I need to shift things left so that the number of trailing zeros
// to the right of my value is a multiple of 64. That may cause the
// mantissa to spread into parts of 3 words: (top, mid, next).

    next = mlo;
    mid = mhi;
    top = mhi<0 ? -1 : 0;   // sign-extension word
    if (exponent < 0)
    {   // presumably a defensive case: for an integer intpart the
        // normalised exponent ought not to be negative - TODO confirm
        top = 0;
        len = 1;
    }
    else
    {   len = 3 + exponent/64;
        exponent = exponent%64;
        shiftleft(top, mid, next, exponent);
// In some cases this has still left all the bits I care about in next.
// shrinkable() reports that the top word is redundant sign extension;
// at most two words can be dropped here.
        if (shrinkable(top, mid))
        {   top = mid;
            mid = next;
            next = 0;
            len--;
        }
        if (shrinkable(top, mid))
        {   top = mid;
            mid = next;
            next = 0;
            len--;
        }
    }
}
5004 
5005 #endif // softfloat_h
5006 
double_to_int(double d,RoundingMode mode)5007 inline std::intptr_t double_to_int(double d, RoundingMode mode)
5008 {
5009 // I return 0 if the input is a NaN or either +infinity or -infinity.
5010 // This is somewhat arbitrary, but right now I am not minded to raise an
5011 // exception.
5012     if (!std::isfinite(d) || d==0.0) return int_to_handle(0);
5013     std::int64_t top;
5014     std::uint64_t next;
5015     std::size_t len;
5016     double_to_virtual_bignum(d, top, next, len, mode);
5017     std::uint64_t *r = reserve(len);
5018     if (len == 1) r[0] = top;
5019     else
5020     {   for (std::size_t i=0; i<len-2; i++) r[i] = 0;
5021         r[len-1] = top;
5022         r[len-2] = next;
5023     }
5024     return confirm_size(r, len, len);
5025 }
5026 
// Convert d to a bignum handle, rounding to the nearest integer.
inline std::intptr_t round_double_to_int(double d)
{   return double_to_int(d, RoundingMode::ROUND);
}
5030 
// Convert d to a bignum handle, truncating towards zero.
inline std::intptr_t trunc_double_to_int(double d)
{   return double_to_int(d, RoundingMode::TRUNC);
}
5034 
// Convert d to a bignum handle, rounding towards -infinity.
inline std::intptr_t floor_double_to_int(double d)
{   return double_to_int(d, RoundingMode::FLOOR);
}
5038 
// Convert d to a bignum handle, rounding towards +infinity.
inline std::intptr_t ceiling_double_to_int(double d)
{   return double_to_int(d, RoundingMode::CEILING);
}
5042 
5043 #ifdef softfloat_h
5044 
float128_to_int(float128_t d,RoundingMode mode)5045 inline std::intptr_t float128_to_int(float128_t d, RoundingMode mode)
5046 {   if (f128_zero(d) ||
5047         f128_infinite(d) ||
5048         f128_nan(d)) return int_to_handle(0);
5049     std::int64_t top;
5050     std::uint64_t mid, next;
5051     std::size_t len;
5052     float128_to_virtual_bignum(d, top, mid, next, len, mode);
5053     std::uint64_t *r = reserve(len);
5054     if (len == 1) r[0] = top;
5055     else if (len == 2)
5056     {   r[1] = top;
5057         r[0] = mid;
5058     }
5059     else
5060     {   for (std::size_t i=0; i<len-3; i++) r[i] = 0;
5061         r[len-1] = top;
5062         r[len-2] = mid;
5063         r[len-3] = next;
5064     }
5065     return confirm_size(r, len, len);
5066 }
5067 
// Convert d to a bignum handle, rounding to the nearest integer.
inline std::intptr_t round_float128_to_int(float128_t d)
{   return float128_to_int(d, RoundingMode::ROUND);
}
5071 
// Convert d to a bignum handle, truncating towards zero.
inline std::intptr_t trunc_float128_to_int(float128_t d)
{   return float128_to_int(d, RoundingMode::TRUNC);
}
5075 
// Convert d to a bignum handle, rounding towards -infinity.
inline std::intptr_t floor_float128_to_int(float128_t d)
{   return float128_to_int(d, RoundingMode::FLOOR);
}
5079 
// Convert d to a bignum handle, rounding towards +infinity.
inline std::intptr_t ceiling_float128_to_int(float128_t d)
{   return float128_to_int(d, RoundingMode::CEILING);
}
5083 
5084 #endif // softfloat_h
5085 
// A fixnum is already a native int64 value, so return it unchanged.
inline std::int64_t Int64_t::op(std::int64_t a)
{   return a;
}
5089 
// For a bignum, return its least significant 64-bit digit viewed as signed.
inline std::int64_t Int64_t::op(std::uint64_t *a)
{   return static_cast<std::int64_t>(a[0]);
}
5093 
// A fixnum reinterpreted as an unsigned 64-bit value.
inline std::uint64_t Uint64_t::op(std::int64_t a)
{   return static_cast<std::uint64_t>(a);
}
5097 
// For a bignum, return its least significant 64-bit digit unchanged.
inline std::uint64_t Uint64_t::op(std::uint64_t *a)
{   return a[0];
}
5101 
5102 // A cast from a double to a float is entitled, by the C++ standard to
5103 // make a system-defined choice as to whether to round up or down.
5104 // I want to guarantee to follow IEEE round-to-nearest-with-tie-break-
5105 // -to-even, and so I will write a messy function here to achieve that and
5106 // hence end up with better portability.
5107 //
5108 // The two things I think are illustrated here are
5109 // (1) How horrible this is!
5110 // (2) That C++11 manages to provide enough facilities for me to implement it
5111 //     in a manner that I believe is standards-conforming at least on IEEE
5112 //     platforms with the rounding-style set normally.
5113 
// Narrow a double to a float with guaranteed IEEE round-to-nearest,
// ties-to-even behaviour, independently of what the implementation-
// defined behaviour of a plain narrowing cast would be. Out-of-range
// inputs saturate to +/-HUGE_VALF and NaN maps to a float NaN.
inline float cast_to_float(double d)
{
// If the argument is a NaN then return a NaN of type float.
    if (std::isnan(d)) return std::nanf("");
// In C++ a narrowing cast here where the result would be out of range gives
// undefined behaviour, so I need to filter that case first. I am going
// to allow double values that are up to FLT_MAX*(1+2^(-24)) to round
// down to FLT_MAX - beyond that lies overflow reported as HUGE_VALF which
// on all modern systems will be an IEEE infinity.
    double limit = static_cast<double>(FLT_MAX) +
                   static_cast<double>(FLT_MAX) /
                   static_cast<double>(0x1000000);
    if (d >= limit) return HUGE_VALF;
    else if (d <= -limit) return -HUGE_VALF;
    else if (d >= static_cast<double>(FLT_MAX)) return FLT_MAX;
// BUG FIX: the negative clamp must be against -FLT_MAX. It previously
// tested d <= FLT_MIN (the smallest positive NORMAL float), which made
// every negative or tiny input collapse to FLT_MIN.
    else if (d <= static_cast<double>(-FLT_MAX)) return -FLT_MAX;
// Now I am not going to get any overflow - whew.
    float r1 = static_cast<float>(d);
// If the conversion was exact I do not have anything more to do!
    if (static_cast<double>(r1) == d) return r1;
    double err1 = static_cast<double>(r1) - d;
    float r2;
    double err2;
// Now I am going to find the next consecutive floating point value (in
// the correct direction) so that r1 and r2 are two values with d between
// them. BUG FIX: step towards -HUGE_VALF / +HUGE_VALF rather than
// FLT_MIN / FLT_MAX so the direction is correct for negative and
// subnormal r1 as well.
    if (err1 > 0.0)      // r1 > d, so r2 is the next float below r1
    {   r2 = std::nextafterf(r1, -HUGE_VALF);
        err2 = d - static_cast<double>(r2);
    }
    else                 // r1 < d, so r2 is the next float above r1
    {   r2 = std::nextafterf(r1, HUGE_VALF);
        err2 = static_cast<double>(r2) - d;
        err1 = -err1;
    }
    if (err1 < err2) return r1;
    else if (err2 < err1) return r2;
// Here I am at a half-way point. Hah - can add my error to a candidate
// result and the rounding there will then follow the "rounding style" that
// is in force - which I jolly well expect to be IEEE!
    return (r1 < r2 ? r1 : r2) + static_cast<float>(err1);
}
5156 
5157 // On Cygwin (at least) the std::ldexpf function that is part of C++11
5158 // is hidden in the header file perhaps because of issues about thread
5159 // safety in its implementation. I reason here that converting from a
5160 // float to a double will never lose anything, then ldexp() can be used.
5161 // The cast back to a float can not introduce rounding, but might notice
5162 // overflow leading to a result that is an IEEE infinity.
5163 
ldexpf(float a,int n)5164 inline float ldexpf(float a, int n)
5165 {   return cast_to_float(std::ldexp(static_cast<double>(a), n));
5166 }
5167 
op(std::int64_t a)5168 inline float Float::op(std::int64_t a)
5169 {
5170 // if |a| < 2^52 I can convert to a double exactly
5171     if (a > -0x10000000000000 && a < 0x10000000000000)
5172         return cast_to_float(static_cast<double>(a));
5173     std::int64_t hi  = a & 0xfffffc0000000000;   // 22 bits
5174     std::int64_t mid = a & 0x000003fffff00000;   // 22 bits
5175     std::int64_t lo  = a & 0x00000000000fffff;   // 20 bits
5176     if (hi == 0 || hi == 0xfffffc000000000)
5177         return cast_to_float(static_cast<double>(hi) +
5178                              static_cast<double>(mid) + static_cast<double>(lo));
5179 // This next line will move a ">0.5ulp" case so that it is visible
5180 // within just the high 44 bits.  This is because the whole number can
5181 // only be a 0.5ulp case if all the bits below the top 24 are zero, and
5182 // for that to happen certainly the low 20 bits must all be zero...
5183     if (lo != 0) mid |= 1;
5184     return cast_to_float(static_cast<double>(hi) +
5185                          static_cast<double>(mid));
5186 }
5187 
// Convert a bignum to a float with a single correct IEEE rounding,
// by extracting its top 24 bits, a rounding decision from the bits
// below them, and then scaling with ldexpf().
inline float Float::op(std::uint64_t *a)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Float::op(static_cast<std::int64_t>(a[0]));
// Now I need to do something similar to that done for the int64_t case
// but written larger. Specifically I want to split my input number into
// its top 24 bits and then all the rest. I will take separate paths
// for the positive and negative cases.
    std::uint64_t top24;
    int lz;
    bool sign = false;
    std::uint64_t top, next;
// carried records whether every digit below the top two is zero; it
// controls both carry propagation during negation and the sticky bit.
    bool carried = true;
    for (std::size_t i=0; i<lena-2; i++)
    {   if (a[i] != 0)
        {   carried = false;
            break;
        }
    }
// Grab the top 128 bits of the number as {top,next}.
    top = a[lena-1];
    next = a[lena-2];    // lena >= 2 here
// Take its absolute value.
    if (negative(top))
    {   sign = true;
        top = ~top;
        next = ~next;
        if (carried)   // the +1 of the 2s complement reaches these words
        {   next++;
            if (next == 0) top++;
        }
    }
// Discarded non-zero low digits are folded into a sticky bit so the
// rounding below can still see them.
    if (!carried) next |= 1;
// Now I need to do something very much like the code for the int64_t case.
    if (top == 0) lz = nlz(next) + 64;
    else lz = nlz(top);
//
//  uint64_t top24 = {top,next} >> (128-24-lz);
    int sh = 128-24-lz;
// Note that sh can never be zero here.
    if (sh < 64) top24 = (next >> sh) | (top << (64-sh));
    else top24 = top >> (sh-64);
//
//  {top,next} = {top,next} << lz+24; // keep only the fraction bits
    sh = lz+24;
    if (sh < 64)
    {   top = (top << sh) | (next >> (64-sh));
        next = next << sh;
    }
    else
    {   top = next << (sh - 64);
        next = 0;
    }
//
// Round to nearest, breaking exact ties to even:
//  if ({top,next} > 0x80000000000000000000000000000000U) top24++;
//  else if ({top,next} == 0x80000000000000000000000000000000U)
//      top24 += (top24 & 1);
    if (top > 0x8000000000000000U) top24++;
    else if (top == 0x8000000000000000U)
    {   if (next != 0) top24++;
        else top24 += (top24&1);
    }
    arithlib_assert(top24 >= (static_cast<std::int64_t>(1))<<23 &&
                    top24 <= (static_cast<std::int64_t>(1))<<24);
// NOTE(review): d is declared double but initialised with a cast to
// float; top24 <= 2^24 is exactly representable either way so no
// precision is lost, but "float d" would state the intent - confirm.
    double d = static_cast<float>(top24);
    arithlib_assert(top24 == static_cast<std::uint64_t>(d));
    if (sign) d = -d;
    return ldexpf(d, static_cast<int>(128-24-lz+64*(lena-2)));
}
5256 
op(std::int64_t a,std::int64_t & x)5257 inline double Frexp::op(std::int64_t a, std::int64_t &x)
5258 {
5259 // The bad news here is that I am not confident that C++ will guarantee
5260 // to round large integer values in any particular way when it converts
5261 // them to floating point. So I will take careful action so that the
5262 // conversions that I do are ones that will be exact, and I will perform
5263 // rounding in IEEE style myself.
5264 // First I will see if the value is small enough that I can work directly.
5265     const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
5266     if (a >= -range && a <= range) return static_cast<double>(a);
5267 // I will now drop down to a sign and magnitude representation
5268     bool sign = a < 0;
5269     std::uint64_t top53 = sign ? -static_cast<std::uint64_t>(a) : a;
5270 // Because top53 >= 2^53 the number of leading zeros in its representation is
5271 // at most 10. Ha ha. That guaranteed that the shift below will not overflow
5272 // and is why I chose my range as I did.
5273     int lz = nlz(top53);
5274     std::uint64_t low = top53 << (lz+53);
5275     top53 = top53 >> (64-53-lz);
5276     if (low > 0x8000000000000000U) top53++;
5277     else if (low == 0x8000000000000000U) top53 += (top53 &
5278                 1); // round to even
5279     arithlib_assert(top53 >= (static_cast<std::int64_t>(1))<<52 &&
5280                     top53 <= (static_cast<std::int64_t>(1))<<53);
5281 // The next line should never introduce any rounding at all.
5282     double d = static_cast<double>(top53);
5283     arithlib_assert(top53 == static_cast<std::uint64_t>(d));
5284     if (sign) d = -d;
5285     x =64-53-lz;
5286     return d;
5287 }
5288 
op(std::int64_t a)5289 inline double Double::op(std::int64_t a)
5290 {
5291 // One would obviously like to go "return (double)a;" however C++ says
5292 //  "If the value being converted is in the range of values that can
5293 //   be represented but the value cannot be represented exactly, it is
5294 //   an implementation-defined choice of either the next lower or higher
5295 //   representable value."
5296 // and I feel I should guarantee to round in IEEE style. I can do that
5297 // by splitting the integer into two parts. Each of the two casts can deliver
5298 // a double precision result without need for rounding
5299     std::int64_t hi = a & 0xffffffff00000000;
5300     std::int64_t lo = a & 0x00000000ffffffff;
5301     double d = static_cast<double>(lo);
5302     return d + static_cast<double>(hi);
5303 }
5304 
// As Frexp::op(int64_t) above but for a multi-word bignum: return a
// double holding the top 53 bits of a (rounded to nearest, ties to
// even) and set x so that the value of a is that double times 2^x.
inline double Frexp::op(std::uint64_t *a, std::int64_t &x)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Frexp::op(static_cast<std::int64_t>(a[0]), x);
// Now I need to do something similar to that done for the int64_t case
// but written larger. Specifically I want to split my input number into
// its top 53 bits and then all the rest. I will take separate paths
// for the positive and negative cases.
    std::uint64_t top53;
    int lz;
    bool sign = false;
    std::uint64_t top, next;
// carried records whether every digit below the top two is zero; it
// controls both carry propagation during negation and the sticky bit.
    bool carried = true;
    for (std::size_t i=0; i<lena-2; i++)
    {   if (a[i] != 0)
        {   carried = false;
            break;
        }
    }
// Grab the top 128 bits of the number as {top,next}.
    top = a[lena-1];
    next = a[lena-2];    // lena >= 2 here
// Take its absolute value.
    if (negative(top))
    {   sign = true;
        top = ~top;
        next = ~next;
        if (carried)   // the +1 of the 2s complement reaches these words
        {   next++;
            if (next == 0) top++;
        }
    }
// Discarded non-zero low digits become a sticky bit for the rounding.
    if (!carried) next |= 1;
// Now I need to do something very much like the code for the int64_t case.
    if (top == 0) lz = nlz(next) + 64;
    else lz = nlz(top);
//
//  uint64_t top53 = {top,next} >> (128-53-lz);
    int sh = 128-53-lz;
// Note that sh can never be zero here.
    if (sh < 64) top53 = (next >> sh) | (top << (64-sh));
    else top53 = top >> (sh-64);
//
//  {top,next} = {top,next} << lz+53; // keep only the fraction bits
    sh = lz+53;
    if (sh < 64)
    {   top = (top << sh) | (next >> (64-sh));
        next = next << sh;
    }
    else
    {   top = next << (sh - 64);
        next = 0;
    }
//
// Round to nearest, breaking exact ties to even:
//  if ({top,next} > 0x80000000000000000000000000000000U) top53++;
//  else if ({top,next} == 0x80000000000000000000000000000000U)
//      top53 += (top53 & 1);
    if (top > 0x8000000000000000U) top53++;
    else if (top == 0x8000000000000000U)
    {   if (next != 0) top53++;
        else top53 += (top53&1);
    }
    arithlib_assert(top53 >= (static_cast<std::int64_t>(1))<<52 &&
                    top53 <= (static_cast<std::int64_t>(1))<<53);
// This conversion is exact because top53 <= 2^53.
    double d = static_cast<double>(top53);
    arithlib_assert(top53 == static_cast<std::uint64_t>(d));
    if (sign) d = -d;
    x = 128-53-lz+64*(lena-2);
    return d;
}
5374 
op(std::uint64_t * a)5375 inline double Double::op(std::uint64_t *a)
5376 {   std::int64_t x = 0;
5377     double d = Frexp::op(a, x);
5378     if (x > 10000) x = 10000;
5379     return std::ldexp(d, static_cast<int>(x));
5380 }
5381 
5382 #ifdef softfloat_h
5383 
// Convert an int64 fixnum to a 128-bit float. The conversion is exact
// because the 113-bit mantissa can hold any 64-bit integer.
inline float128_t Float128::op(std::int64_t a)
{   return i64_to_f128(a);
}
5387 
// Split an int64 fixnum into a 128-bit float mantissa and an exponent
// x, in the style of std::frexp, using the f128M_frexp helper.
inline float128_t Frexp128::op(std::int64_t a, std::int64_t &x)
{   float128_t d = i64_to_f128(a), d1;
    int xi = 0;
    f128M_frexp(&d, &d1, &xi); // in the CSL sources.
    x = xi;
    return d1;
}
5395 
// As above but for a multi-word bignum: return a float128_t carrying
// the top 113 bits of a (rounded) and set x so that a ~= result * 2^x.
inline float128_t Frexp128::op(std::uint64_t *a, std::int64_t &x)
{   std::size_t lena = number_size(a);
// NOTE(review): x is left untouched on this path; callers must
// pre-initialise it to 0 (Float128::op below does) - confirm.
    if (lena == 1) return Float128::op(static_cast<std::int64_t>(a[0]));
    std::uint64_t top113, top113a;
    int lz;
    bool sign = false;
    std::uint64_t top, next1, next2;
// carried records whether every digit below the top three is zero.
// NOTE(review): for lena==2 the bound lena-3 wraps to a huge size_t;
// the scan terminates at the first non-zero digit, which a canonical
// bignum provides by i==lena-1, but an explicit guard would be safer.
    bool carried = true;
    for (std::size_t i=0; i<lena-3; i++)
    {   if (a[i] != 0)
        {   carried = false;
            break;
        }
    }
// Grab the top 192 bits of the number as {top,next1,next2}.
    top = a[lena-1];
    next1 = a[lena-2];
    next2 = lena==2 ? 0 : a[lena-3];
// Take its absolute value.
    if (negative(top))
    {   sign = true;
        top = ~top;
        next1 = ~next1;
        next2 = ~next2;
        if (carried)   // the +1 of the 2s complement reaches these words
        {   next2++;
            if (next2 == 0)
            {   next1++;
                if (next1 == 0) top++;
            }
        }
    }
// Discarded non-zero low digits become a sticky bit for the rounding.
    if (!carried) next2 |= 1;
// I now have {top,next1,next2} the top 192 bits of my integer. top may be
// zero, but if it is then next1 will have its top bit set, and so within
// these bits I certainly have the 113 that I need to obtain an accurate
// floating point value.
    if (top == 0) lz = nlz(next1) + 64;
    else lz = nlz(top);
//
//  uint64_t {top113,top112a} = {top,next1,next2} >> (128-113-lz);
    int sh = 192-113-lz;
// Note that sh can never be zero here.
    if (sh < 64)
    {   top113 = (next1 >> sh) | (top << (64-sh));
        top113a = (next2 >> sh) | (next1 << (64-sh));
    }
    else
    {   top113 = top >> (sh-64);
        top113a = (next1 >> (sh-64)) | (top << (128-sh));
    }
//
//  {top,next} = {top,next} << lz+113; // keep only the fraction bits
    sh = lz+113;
    if (sh < 64)
    {   top = (top << sh) | (next1 >> (64-sh));
        next1 = (next1 << sh) | (next2 >> (64-sh));
        next2 = next2 << sh;
    }
    else
// NOTE(review): these shift counts look suspect: (129-sh) is an
// undefined 64-bit shift when sh==65 (128-sh would be expected), top
// does not pick up high bits of next2, and sh may exceed 128 when
// top==0 making (sh-64) itself >= 64 - compare against the CSL sources.
    {   top = next1 << (sh - 64);
        next1 = (next1 << (sh-64)) | (next2 >> (129-sh));
        next2 = 0;
    }
//
// Round to nearest, breaking exact ties to even:
//  if ({top,next1,next2} > 0x80000000000000000000000000000000U) top113++;
//  else if ({top,next1, next2} == 0x80000000000000000000000000000000U)
//      top113 += (top113 & 1);
    if (top > 0x8000000000000000U)
    {   top113a++;
        if (top113a == 0) top113++;
    }
    else if (top == 0x8000000000000000U)
    {   if (next1 != 0 || (next1==0 && next2!=0))
        {   top113a++;
            if (top113a == 0) top113++;
        }
        else top113 += add_with_carry(top113a, top113a&1, top113a);
    }
// Assemble the 113-bit mantissa {top113,top113a} into a float128_t:
//  float128_t d = i64_to_f128({top113, top113a});
    float128_t d = i64_to_f128(top113);
    float128_t two32 = i64_to_f128(0x100000000);
    d = f128_add(f128_mul(f128_mul(two32, two32), d),
                 ui64_to_f128(top113a));
    if (sign) d = f128_sub(i64_to_f128(0), d);
// NOTE(review): the 64-bit sibling uses 64*(lena-2) with a TWO-word
// window; here the window is three words, so 64*(lena-3) may be what
// is intended - confirm against the CSL sources before relying on x.
    x = 192-113-lz+64*(lena-2);
    return d;
}
5484 
op(std::uint64_t * a)5485 inline float128_t Float128::op(std::uint64_t *a)
5486 {   std::int64_t x = 0;
5487     float128_t d = Frexp128::op(a, x);
5488     if (x > 100000) x = 100000;
5489 // There is an implementation of ldexp() for 128-bit floats in
5490 // the CSL source file arith14.cpp.
5491     f128M_ldexp(&d, static_cast<int>(x));
5492     return d;
5493 }
5494 
5495 #endif // softfloat_t
5496 
// 10^19 is the largest power of 10 that fits in a 64-bit word; it is used
// as the radix when converting bignums to and from decimal text.
INLINE_VAR const std::uint64_t ten19 = UINT64_C(10000000000000000000);
5498 
// Parse a decimal string, with an optional leading "-", into a freshly
// allocated bignum and return its handle. The text must consist only of
// decimal digits after any sign (checked with arithlib_assert); it is
// consumed in chunks of up to 19 digits because 10^19 is the largest
// power of ten that fits in one 64-bit word.
inline std::intptr_t string_to_bignum(const char *s)
{   bool sign = false;
    if (*s == '-')
    {   sign = true;
        s++;
    }
    std::size_t chars = std::strlen(s);
    std::size_t words = 1 + (108853*static_cast<std::uint64_t>
                             (chars))/0x200000;
// I have predicted the number of 64-bit digits that will be needed to
// represent an s-digit (decimal) number based an approximation
// 108853/2^21 for log(10)/log(2^64). In 64-bit arithmetic the numerator
// here will not overflow until you have an improbable string of length
// 2^47 as input! The division by a power of 2 should be done very
// rapidly as a shift. I rather expect this calculation to give a rather
// good measure of how many 64-bit words will be needed! It must never be an
// overestimate so that the vector that I allocate never overflows. Somewhat
// rarely it will be and overestimate and it will be necessary to trim the
// vector at the end.
    std::uint64_t *r = reserve(words);
    for (std::size_t i=0; i<words; i++) r[i] = 0;
// Now for each chunk of digits NNNN in the input I want to go in effect
//     r = 10^19*r + NNNN;
// where the number 19 is used because 10^19 is the largest power of 10
// that fits in a 64-bit word.
// next counts the characters left once the current chunk is consumed,
// so the first chunk has length 1..19 and all later ones exactly 19.
    std::size_t next = 19*((chars-1)/19);
    while (chars != 0)
    {   std::uint64_t d = 0;
// assemble 19 digit blocks from the input into a value (d).
        while (chars != next)
        {   arithlib_assert(std::isdigit(*s));
            d = 10*d + (*s++ - '0');
            chars--;
        }
        next -= 19;
// now perform r = 10^19*r + d to consolidate into the eventual result.
        for (std::size_t i=0; i<words; i++)
            multiply64(r[i], ten19, d, d, r[i]);
    }
    std::size_t n1 = words;
// Here I may be negating a positive number, and in 2s complement that
// can never lead to a number growing in length.
    if (sign)
    {   internal_negate(r, words, r);
        truncate_negative(r, n1);
    }
// However I could not have been precisely certain how many 64-bit words were
// needed and I arranged that any error was conservative - ie allocating
// more that would eventually be used.
    else truncate_positive(r, n1);
    return confirm_size(r, words, n1);
}
5551 
5552 // The next functions are a key one for printing values. They convert a
5553 // bignum so that it is still stored as a sequence of digits each within
5554 // a 64-bit word, but now each digit will be in the range 0 - (10^19-1)
5555 // so that the value is in effect represented base 10^19. From that state
5556 // printing it in decimal becomes easy!
5557 
5558 
5559 // This first one takes a number represented base 2^64 with digits
5560 // 0 to n-1 and divides it by 10^19, returning the remainder and
5561 // setting both the digits and its length suitably to be the quotient.
5562 // The number is POSITIVE here. Note that the function overwrites its input
5563 // with the quotient.
5564 
5565 
short_divide_ten_19(std::uint64_t * r,std::size_t & n)5566 inline std::uint64_t short_divide_ten_19(std::uint64_t *r,
5567         std::size_t &n)
5568 {   std::uint64_t hi = 0;
5569     std::size_t i=n-1;
5570     for (;;)
5571     {   divide64(hi, r[i], ten19, r[i], hi);
5572         if (i == 0) break;
5573         i--;
5574     }
5575     if (r[n-1] == 0) n--;
5576     return hi;
5577 }
5578 
5579 // How many bits are there in a bignum?
5580 
5581 // Note that if a bignum occupies over 1/8 of your total memory that
5582 // the number of bits it uses might overflow size_t. On a 32-bit system
5583 // this might happen if the number occupies over 512 Mbytes and I view
5584 // that as a situation I will accept as a limit for 32-bit platforms.
5585 
// Return the number of bits needed to represent the bignum a of length
// lena. For negative values the sign bit is not counted, following the
// Common Lisp integer-length convention tabulated below.
inline std::size_t bignum_bits(const std::uint64_t *a,
                               std::size_t lena)
{   if (lena == 0 && a[0] == 0) return 1; // say that 0 has 1 bit.
// NOTE(review): a zero bignum is normally held as lena==1 with a[0]==0,
// which falls through and yields 0 (matching the table); it is unclear
// that lena==0 can ever arise here - confirm the guard's intent.
    std::uint64_t top = a[lena-1];  // top digit.
// The exact interpretation of "the length in bits of a negative number"
// is something I need to think through. Well Common Lisp counts the
// number of bits apart from the sign bit, so we have
//      n      bignum_bits(n)   bignum_bits(-n)
//      0           0                0
//      1           1    1           0 ..11111:
//      2           2   10           1 ..1111:0
//      3           2   11           2 ..111:01
//      4           3  100           2 ..111:00
//      7           3  111           3 ..11:001
//      8           4 1000           3 ..11:000
    if (negative(top))
    {   std::uint64_t carry = 1;
// Form the top word of the 2s complement (ie of abs(n)) by scanning all
// the digits so that the carry propagates correctly.
        for (std::size_t i=0; i<lena; i++)
        {   top = ~a[i] + carry;
            carry = (top < carry ? 1 : 0);
        }
// NOTE(review): this forms the top word of abs(n)-1, which is only valid
// when the borrow does not propagate in from the lower digits (ie when
// the top word of abs(n) is non-zero) - confirm the edge cases.
        top--;
    }
    return 64*(lena-1) + (top==0 ? 0 : 64-nlz(top));
}
5611 
5612 // I want an estimate of the number of bytes that it will take to
5613 // represent a number when I convert it to a string.
5614 //
5615 // I will work through an example. Consider the input 12024932 = 0xb77c64.
5616 // [I use this value because at one time it revealed a mistake I had made!]
5617 // This value uses 24 bits, ie its value is at least 2^23 (8388608) and
5618 // it is less than 2^26 (16777216). log10(2^24) is 7.2247... so in decimal
5619 // the number will use 7.2 digits, well that must be rounded up to 8.
5620 // log10(2^24) = 24*log10(2) = 24*0.301030.. < 24*(617/2048) [because that
5621 // fraction = 0.30127.. > log10(2)]. So an upper bound for the number of
5622 // decimal digits that can be generated is ceil(24*617/2048). I will compute that by
5623 // forming a quotient that is truncated towards zero and then adding 1, and
5624 // in this case this yields 8 as required. For negative numbers I will add 1
5625 // to allow for a "-" sign.
5626 
predict_size_in_bytes(const std::uint64_t * a,std::size_t lena)5627 inline std::size_t predict_size_in_bytes(const std::uint64_t *a,
5628         std::size_t lena)
5629 {
5630 // I am first going to estimate the size in BITS and then I will
5631 // see how that maps onto bytes.
5632     std::size_t r = bignum_bits(a, lena);
5633     r = 1 + static_cast<std::size_t>(
5634             (617*static_cast<std::uint64_t>(r))/2048);
5635     if (negative(a[lena-1])) r += 2; // allow space for a "-" sign.
5636     return r;
5637 }
5638 
bignum_to_string_length(std::uint64_t * a,std::size_t lena)5639 inline std::size_t bignum_to_string_length(std::uint64_t *a,
5640         std::size_t lena)
5641 {   if (lena == 1)
5642     {   std::int64_t v = a[0];
5643 // Note that the negative numbers checked against are 1 digit shorter so as
5644 // to allow space for the "-" sign.
5645         if (v <= 9999999 && v >= -999999) return 7;
5646         else if (v <= 999999999999999 && v >= -99999999999999) return 15;
5647         else return 23;
5648     }
5649     else return predict_size_in_bytes(a, lena);
5650 }
5651 
5652 // The "as_unsigned" option here is not for general use - it is JUST for
5653 // internal debugging because at times I work with values that are known
5654 // to be positive and so where the top digit must be treated as unsigned...
5655 
// Convert the bignum {a, lena} to decimal characters written into result,
// whose capacity m comes from predict_size_in_bytes() and is guaranteed
// big enough. Returns the number of characters actually written. The
// as_unsigned flag (internal debugging only) makes the top digit be
// treated as unsigned rather than as a sign-carrying 2s-complement word.
inline std::size_t bignum_to_string(char *result, std::size_t m,
                                    std::uint64_t *a, std::size_t lena,
                                    bool as_unsigned=false)
{
// Making one-word numbers a special case simplifies things later on! It may
// also make this case go just slightly faster.
    if (lena == 1)
    {   std::uint64_t v = a[0];
        bool sign;
        if (negative(v) && !as_unsigned)
        {   sign = true;
            v = -v;     // unsigned negate: safe even for 2^63
        }
        else sign = false;
// 24 bytes is ample: a 64-bit value needs at most 20 digits plus a sign.
        char buffer[24];
        std::size_t len = 0;
        while (v != 0)
        {   buffer[len++] = '0' + v%10;
            v = v/10;
        }
// Now I have the decimal digits of the number in my buffer, with the
// least significant first and the most significant last. Insert the sign
// character if needed (and deal with the special case of zero).
        if (sign) buffer[len++] = '-';
        else if (len == 0) buffer[len++] = '0';
// Copy out in reverse so the most significant digit comes first.
        for (std::size_t i=0; i<len; i++) result[i] = buffer[len-i-1];
        return len;
    }
// The size (m) for the block of memory that I put my result in is
// such that it could hold the string representation of my input, and
// I estimate that via predict_size_in_bytes(). Well the smallest bignum
// that will need 2 words will be {0,0x8000000000000000}, ie 2^63. That
// will need 19 decimal digits plus space for a sign, so there will be
// at least 20 bytes allocated for the printed representation of any 2-word
// bignum, and at least 40 for a 3-word value, at least 59 for a 4-word one
// etc. This means that the space I will allocate here for the result
// leaves me with plenty of workspace to use while constructing the
// output string. The case liable to be tightest will be that of the
// smallest 2-word bignum, so if I ensure that is OK all the rest will
// certainly be safe.
//
// I am going to build up (decimal) digits of the converted number by
// repeatedly dividing by 10^19. Each time I do that the remainder I
// am left with is the next low 19 decimal digits of my number. Doing the
// divisions needs a vector to store the number I am dividing by 10^19 and
// to put the quotient, and I do not want to corrupt my original input, so
// I will copy my input into a fresh vector. And I will force it to be
// positive. The made-positive version might have a leading digit with
// its top bit set - that will not worry me because I view it as unsigned.
//
// I have allocated the space that will be needed for the eventual string of
// characters. I will use that space to save numeric values along the way, so
// here I cast so I can use that same memory as a vector of 64-bit integers.
// I will only ever access data in the format that it was placed into memory!
// Note that this will assume that the string data was allocated so as to
// be aligned suitably for uint64_t values.
    std::uint64_t *r = reinterpret_cast<std::uint64_t *>(result);
    std::size_t i;
// For the edge case lena==2 and m==20. I copy 2 words across. That will leave
// 4 bytes unused.
    for (i=0; i<lena; i++) r[i] = a[i];
    for (; i<m/sizeof(std::uint64_t); i++) r[i] = 0;
// Make the number positive
    bool sign = false;
    if (negative(r[lena-1]) && !as_unsigned)
    {   sign = true;
        internal_negate(r, lena, r);
    }
// Now my number is positive and is of length lena, but the vector it is
// stored in is length m with m usefully larger than lena. I will repeatedly
// divide by 10^19 and each time I do that I can store the remainder working
// down from the top of the vector. That should JUST keep up so that I
// never overwrite digits of the reducing part! I will stop when the
// number I have been working with ends up < 10^19.
    std::size_t p = m/sizeof(std::uint64_t)
                    -1; // where to put next output digit
// Each value written into the vector here will stand for 19 decimal
// digits, and will use 8 bytes. So here the nastiest case will be when the
// number of decimal digits to end up with is 7 mod 8 (so that I lose as
// much space as possible) and the number is as large as possible. My
// belief is that numbers from 10^16 upwards will lead to there being enough
// space.
    while (lena > 1 || r[0] > ten19)
    {   std::uint64_t d = short_divide_ten_19(r, lena);
        r[p--] = d;
    }
// The residue (< 10^19, possibly equal to it when it is printed directly)
// becomes the most significant chunk.
    r[p] = r[0];
// Now I have the data that has to go into my result as a sequence of
// digits base 10^19, with the most significant one first. Convert
// to character data. I write in the string data just over what has been
// digits data, and I have arranged to position everything to (just)
// avoid overwriting myself.
    std::uint64_t top = r[p++];
    if (top == 0) top = r[p++]; // discard potential leading zero!
// Get a pointer into the buffer as character data...
    char *p1 = reinterpret_cast<char *>(result);
    std::size_t len = 0;
    if (sign)
    {   *p1++ = '-';
        len = 1;
    }
// I am going to convert my numbers to decimal using explicit code here.
// In an earlier draft I used sprintf(), however that adds unnecessary
// overhead.
    char buffer[24];
    int bp = 0;
// The first part of the number is printed naturally so that it only
// uses as many bytes of output as it needs.
    do
    {   buffer[bp++] = '0' + top%10;
        top = top/10;
    }
    while (top != 0);
// Emit the leading chunk most-significant-digit first.
    do
    {   *p1++ = buffer[--bp];
        len++;
    }
    while (bp != 0);
// Sanity check: the characters still to be written must fit within m.
    arithlib_assert(len + 19*(m/sizeof(std::uint64_t)-p)<= m);
    while (p < m/sizeof(std::uint64_t))
    {
// I will always pick up the number I am going to expand before writing any
// digits into the buffer.
        top = r[p++];
// For subsequent chunks I want to print exactly 19 decimal digits,
// zero-padded, filling the slot from its least significant end.
        for (std::size_t i=0; i<18; i++)
        {   p1[18-i] = '0' + top%10;
            top = top/10;
        }
        *p1 = '0' + static_cast<int>(top);
        p1 += 19;
        len += 19;
    }
// To convince myself that this is safe consider when I pick up the final
// chunk. It will turn into 19 bytes of output, so where it comes from must
// be no more than 19 bytes before the length (m) of the final string, because
// otherwise it would have got clobbered when I unpacked the previous chunk.
// But this final chunk is itself 8 bytes wide and there can be up to 7 bytes
// beyond it that are there to support proper alignment - so that last chunk
// lives within the final 15 bytes of the buffer and that is a fortiori within
// the last 19 as required.
    return len;
}
5799 
bignum_to_string(std::uint64_t * a,std::size_t lena,bool as_unsigned=false)5800 inline string_handle bignum_to_string(std::uint64_t *a,
5801                                       std::size_t lena,
5802                                       bool as_unsigned=false)
5803 {   std::size_t len = bignum_to_string_length(a, lena);
5804     push(a);
5805     char *s = reserve_string(len);
5806     pop(a);
5807     std::size_t final_len = bignum_to_string(s, len, a, lena,
5808                             as_unsigned);
5809     return confirm_size_string(s, len, final_len);
5810 }
5811 
bignum_to_string(std::intptr_t aa)5812 inline string_handle bignum_to_string(std::intptr_t aa)
5813 {   std::uint64_t *a, v[1];
5814     std::size_t lena;
5815     if (stored_as_fixnum(aa))
5816     {   v[0] = int_of_handle(aa);
5817         a = v;
5818         lena = 1;
5819     }
5820     else
5821     {   a = vector_of_handle(aa);
5822         lena = number_size(a);
5823     }
5824     return bignum_to_string(a, lena);
5825 }
5826 
5827 // As well as converting to decimal I can do hex, octal or binary!
5828 
// A fixed, generous size hint for hex output. The printing code computes
// the exact size it needs before allocating, so the argument is ignored.
inline std::size_t bignum_to_string_hex_length(std::intptr_t aa)
{   static_cast<void>(aa);   // value not consulted
    return 24;
}
5832 
bignum_to_string_hex(std::intptr_t aa)5833 inline string_handle bignum_to_string_hex(std::intptr_t aa)
5834 {   std::uint64_t *a, v[1];
5835     std::size_t n;
5836     if (stored_as_fixnum(aa))
5837     {   v[0] = int_of_handle(aa);
5838         a = v;
5839         n = 1;
5840     }
5841     else
5842     {   a = vector_of_handle(aa);
5843         n = number_size(a);
5844     }
5845 // Making the value zero a special case simplifies things later on!
5846     if (n == 1 && a[0] == 0)
5847     {   char *r = reserve_string(1);
5848         std::strcpy(r, "0");
5849         return confirm_size_string(r, 1, 1);
5850     }
5851 // printing in hexadecimal should be way easier!
5852     std::size_t m = 16*n;
5853     std::uint64_t top = a[n-1];
5854     bool sign = negative(top);
5855     if (sign)
5856     {   m += 2; // for "~f"
5857         while ((top>>60) == 0xf)
5858         {   top = top << 4;
5859             m--;
5860         }
5861     }
5862     else
5863     {   while (top == 0)
5864         {   n--;
5865             top = a[n-1];
5866         }
5867         while ((top>>60) == 0)
5868         {   top = top << 4;
5869             m--;
5870         }
5871     }
5872     push(a);
5873     char *r = reserve_string(m);
5874     pop(a);
5875     char *p = reinterpret_cast<char *>(r);
5876     top = a[n-1];
5877     if (sign)
5878     {   *p++ = '~';
5879         *p++ = 'f';
5880     }
5881     bool started = false;
5882     for (std::size_t i=0; i<n; i++)
5883     {   std::uint64_t v = a[n-i-1];
5884         for (int j=0; j<16; j++)
5885         {   int d = static_cast<int>(v >> (60-4*j)) & 0xf;
5886             if (!started)
5887             {   if ((sign && d==0xf) ||
5888                     (!sign && d==0)) continue;
5889                 started = true;
5890             }
5891             *p++ = "0123456789abcdef"[d];
5892         }
5893     }
5894     return confirm_size_string(r, m, m);
5895 }
5896 
// A fixed, generous size hint for octal output. The printing code computes
// the exact size it needs before allocating, so the argument is ignored.
inline std::size_t bignum_to_string_octal_length(std::intptr_t aa)
{   static_cast<void>(aa);   // value not consulted
    return 24;
}
5900 
bignum_to_string_octal(std::intptr_t aa)5901 inline string_handle bignum_to_string_octal(std::intptr_t aa)
5902 {   std::uint64_t *a, v[1];
5903     std::size_t n;
5904     if (stored_as_fixnum(aa))
5905     {   v[0] = int_of_handle(aa);
5906         a = v;
5907         n = 1;
5908     }
5909     else
5910     {   a = vector_of_handle(aa);
5911         n = number_size(a);
5912     }
5913     std::size_t width = (64*n +
5914                          2)/3; // raw number of octal digits needed.
5915     std::uint64_t top = a[n-1];
5916     bool sign = negative(top);
5917 // There is a slight misery in that 64 is not a multiple of 3 (!) and so
5918 // the octal representation of a value has some digits that depend on a pair
5919 // of adjacent words from the bignum.
5920     std::size_t
5921     nn;  // will be the number of characters used in the output
5922     if (sign)
5923     {   while (read_u3(a, n, width-1) == 7 && width > 1) width--;
5924         nn = width+2;
5925     }
5926     else
5927     {   while (read_u3(a, n, width-1) == 0 && width > 1) width--;
5928         nn = width;
5929     }
5930     push(a);
5931     char *r = reserve_string(nn);
5932     pop(a);
5933     char *p = reinterpret_cast<char *>(r);
5934     if (sign)
5935     {   *p++ = '~';
5936         *p++ = '7';
5937     }
5938     for (std::size_t i=0; i<width; i++)
5939         *p++ = '0' + read_u3(a, n, width-i-1);
5940     return confirm_size_string(r, nn, width);
5941 }
5942 
// A fixed, generous size hint for binary output. The printing code computes
// the exact size it needs before allocating, so the argument is ignored.
inline std::size_t bignum_to_string_binary_length(std::intptr_t aa)
{   static_cast<void>(aa);   // value not consulted
    return 24;
}
5946 
// Convert the number denoted by handle aa to a binary string. Negative
// values use the "~1..." complemented notation: the conceptually infinite
// run of leading '1' bits is abbreviated to "~1".
inline string_handle bignum_to_string_binary(std::intptr_t aa)
{   std::uint64_t *a, v[1];
    std::size_t n;
    if (stored_as_fixnum(aa))
    {   v[0] = int_of_handle(aa);
        a = v;
        n = 1;
    }
    else
    {   a = vector_of_handle(aa);
        n = number_size(a);
    }
// Making the value zero a special case simplifies things later on!
    if (n == 1 && a[0] == 0)
    {   char *r = reserve_string(1);
        std::strcpy(r, "0");
        return confirm_size_string(r, 1, 1);
    }
// m counts the characters that will be emitted.
    std::size_t m = 64*n;
    std::uint64_t top = a[n-1];
    bool sign = negative(top);
    if (sign)
    {   m += 2; // for "~1"
// Redundant leading '1' bits are absorbed into the "~1" prefix.
        while ((top>>63) == 1)
        {   top = top << 1;
            m--;
        }
    }
    else
    {
// NOTE(review): a positive value such as 2^63 is stored with a zero top
// word, which would trip this assert - confirm callers always normalize
// so that the top word of a positive multi-word bignum here is nonzero.
        arithlib_assert(top != 0);
        while ((top>>63) == 0)
        {   top = top << 1;
            m--;
        }
    }
    push(a);
    char *r = reserve_string(m);
    pop(a);
    char *p = reinterpret_cast<char *>(r);
    top = a[n-1];
    if (sign)
    {   *p++ = '~';
        *p++ = '1';
    }
// Suppress leading zero bits (or leading '1' bits of a negative value)
// until the first significant bit has been seen.
    bool started = false;
    for (std::size_t i=0; i<n; i++)
    {   std::uint64_t v = a[n-i-1];
        for (int j=0; j<64; j++)
        {   int d = static_cast<int>(v >> (63-j)) & 0x1;
            if (!started)
            {   if ((sign && d==1) ||
                    (!sign && d==0)) continue;
                started = true;
            }
            *p++ = '0' + d;
        }
    }
    return confirm_size_string(r, m, m);
}
6006 
6007 //=========================================================================
6008 //=========================================================================
6009 // Big number comparisons.
6010 //=========================================================================
6011 //=========================================================================
6012 
6013 
op(std::uint64_t * a)6014 inline bool Zerop::op(std::uint64_t *a)
6015 {   return number_size(a) == 1 && a[0] == 0;
6016 }
6017 
op(std::int64_t a)6018 inline bool Zerop::op(std::int64_t a)
6019 {   return a == 0;
6020 }
6021 
op(std::uint64_t * a)6022 inline bool Onep::op(std::uint64_t *a)
6023 {   return number_size(a) == 1 && a[0] == 1;
6024 }
6025 
op(std::int64_t a)6026 inline bool Onep::op(std::int64_t a)
6027 {   return a == 1;
6028 }
6029 
op(std::uint64_t * a)6030 inline bool Minusp::op(std::uint64_t *a)
6031 {   return negative(a[number_size(a)-1]);
6032 }
6033 
op(std::int64_t a)6034 inline bool Minusp::op(std::int64_t a)
6035 {   return a < 0;
6036 }
6037 
op(std::uint64_t * a)6038 inline bool Evenp::op(std::uint64_t *a)
6039 {   return (a[0] & 1) == 0;
6040 }
6041 
op(std::int64_t a)6042 inline bool Evenp::op(std::int64_t a)
6043 {   return (a & 1) == 0;
6044 }
6045 
op(std::uint64_t * a)6046 inline bool Oddp::op(std::uint64_t *a)
6047 {   return (a[0] & 1) != 0;
6048 }
6049 
op(std::int64_t a)6050 inline bool Oddp::op(std::int64_t a)
6051 {   return (a & 1) != 0;
6052 }
6053 
6054 // eqn
6055 
// Two digit vectors represent equal numbers exactly when they have the
// same length and identical digits (representations are canonical).
inline bool bigeqn(const std::uint64_t *a, std::size_t lena,
                   const std::uint64_t *b, std::size_t lenb)
{   if (lena != lenb) return false;
    for (std::size_t i=0; i<lena; i++)
        if (a[i] != b[i]) return false;
    return true;
}
6061 
6062 
op(std::uint64_t * a,std::uint64_t * b)6063 inline bool Eqn::op(std::uint64_t *a, std::uint64_t *b)
6064 {   std::size_t lena = number_size(a);
6065     std::size_t lenb = number_size(b);
6066     return bigeqn(a, lena, b, lenb);
6067 }
6068 
op(std::uint64_t * a,std::int64_t b)6069 inline bool Eqn::op(std::uint64_t *a, std::int64_t b)
6070 {   std::size_t lena = number_size(a);
6071     return lena==1 && static_cast<std::int64_t>(a[0])==b;
6072 }
6073 
op(std::int64_t a,std::uint64_t * b)6074 inline bool Eqn::op(std::int64_t a, std::uint64_t *b)
6075 {   std::size_t lenb = number_size(b);
6076     return lenb==1 && a==static_cast<std::int64_t>(b[0]);
6077 }
6078 
op(std::int64_t a,std::int64_t b)6079 inline bool Eqn::op(std::int64_t a, std::int64_t b)
6080 {   return (a == b);
6081 }
6082 
op(std::int64_t a,float b)6083 inline bool Eqn::op(std::int64_t a, float b)
6084 {   return Eqn::op(a, static_cast<double>(b));
6085 }
6086 
op(std::uint64_t * a,float b)6087 inline bool Eqn::op(std::uint64_t *a, float b)
6088 {   return Eqn::op(a, static_cast<double>(b));
6089 }
6090 
op(float a,std::int64_t b)6091 inline bool Eqn::op(float a, std::int64_t b)
6092 {   return Eqn::op(static_cast<double>(a), b);
6093 }
6094 
op(float a,std::uint64_t * b)6095 inline bool Eqn::op(float a, std::uint64_t *b)
6096 {   return Eqn::op(static_cast<double>(a), b);
6097 }
6098 
// Compare an int64 with a double for equality, giving the mathematically
// exact answer (no silent rounding of a during the comparison).
inline bool Eqn::op(std::int64_t a, double b)
{
// Integers up to 2^53 in magnitude convert to double exactly, so the
// direct floating comparison is then correct.
    const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
    if (a >= -range && a <= range) return static_cast<double>(a) == b;
// The value on the next line is a floating point representation of 2^63,
// so any floating value at least that large is bigger than any int64_t value.
    if (b >= 9223372036854775808.0) return false;
// Note the asymmetric test: -2^63 itself IS representable in int64_t,
// so only values strictly below it are out of range.
    else if (b < -9223372036854775808.0) return false;
    if (std::isnan(b)) return false;
// Here 2^53 < |b| < 2^63, so b is necessarily an exact integer and the
// conversion below loses nothing.
    return a == static_cast<std::int64_t>(b);
}
6109 
// Test whether the (multi-word) bignum {a, lena} is exactly equal to the
// double b. Expected to be called with lena >= 2.
inline bool eqnfloat(std::uint64_t *a, std::size_t lena, double b)
{   if (std::isnan(b)||
        std::isinf(b)) return false;
    std::int64_t top = static_cast<std::int64_t>(a[lena-1]);
// If the signs differ then the values are certainly not equal.
    if (top >= 0 && b <= 0.0) return false;
    if (top < 0 && b >= 0.0) return false;
// A value with a fractional part cannot equal an integer.
    double ipart;
    double fpart = std::modf(b, &ipart);
    if (fpart != 0.0) return false; // not an integer so not equal.
// Express b as a "virtual bignum": its top two 64-bit digits (hi, next)
// and its total length in words. All lower digits of such a value are
// zero because a double carries at most 53 significant bits.
    std::int64_t hi;
    std::uint64_t next;
    std::size_t len;
    double_to_virtual_bignum(ipart, hi, next, len, RoundingMode::TRUNC);
    if (len != lena) return false;
    if (len == 1) return a[0] == static_cast<std::uint64_t>(top);
// Equal lengths: the top two digits must match and every digit below
// them in a must be zero.
    if (a[len-1] != static_cast<std::uint64_t>(top) ||
        a[len-2] != next) return false;
    for (std::size_t i=0; i<len-2; i++)
        if (a[i] != 0) return false;
    return true;
}
6132 
op(std::uint64_t * a,double b)6133 inline bool Eqn::op(std::uint64_t *a, double b)
6134 {   std::size_t lena = number_size(a);
6135     if (lena == 1) return Eqn::op(static_cast<std::int64_t>(a[0]), b);
6136     return eqnfloat(a, lena, b);
6137 }
6138 
op(double a,std::int64_t b)6139 inline bool Eqn::op(double a, std::int64_t b)
6140 {   return Eqn::op(b, a);
6141 }
6142 
op(double a,std::uint64_t * b)6143 inline bool Eqn::op(double a, std::uint64_t *b)
6144 {   return Eqn::op(b, a);
6145 }
6146 
6147 #ifdef softfloat_h
6148 
// The following constants are IEEE binary128 encodings of 2^112 and
// -2^112 (exponent field 0x406f = 16383+112 with a zero mantissa). Any
// 128-bit floating point value whose magnitude is at least 2^112 is
// necessarily an exact integer, a fact exploited by eqnbigfloat() below.

#ifdef LITTLEENDIAN

INLINE_VAR float128_t FP128_INT_LIMIT = {{0, INT64_C(0x406f000000000000)}};
INLINE_VAR float128_t FP128_MINUS_INT_LIMIT = {{0, INT64_C(0xc06f000000000000)}};

#else // !LITTLEENDIAN

INLINE_VAR float128_t FP128_INT_LIMIT = {{INT64_C(0x406f000000000000), 0}};
INLINE_VAR float128_t FP128_MINUS_INT_LIMIT = {{INT64_C(0xc06f000000000000), 0}};

#endif // !LITTLEENDIAN
6164 
6165 
// Test whether the bignum {a, lena} is exactly equal to the 128-bit
// float b (SoftFloat representation).
inline bool eqnbigfloat(std::uint64_t *a, std::size_t lena,
                        float128_t b)
{   if (!f128_eq(b, b)) return false;  // a NaN if b!=b
    std::int64_t top = static_cast<std::int64_t>(a[lena-1]);
// Mismatched signs mean the values cannot be equal.
    if (top >= 0 && f128_lt(b, f128_0)) return false;
    if (top < 0 && !f128_lt(b, f128_0)) return false;
// Now the two inputs have the same sign.
// The test below asks whether |a| <= 2^112, i.e. whether a fits exactly
// in the 113-bit significand of a float128.
    if (lena == 1 ||
        (lena == 2 &&
         !((a[1] > 0x0001000000000000 ||
            (a[1] == 0x0001000000000000 && a[0] != 0)) ||
           static_cast<std::int64_t>(a[1]) < -static_cast<std::int64_t>
           (0x0001000000000000))))
    {
// Here the integer is of modest size - if the float is huge we can
// resolve matters cheaply.
        if (f128_lt(FP128_INT_LIMIT, b) ||
            f128_lt(b, FP128_MINUS_INT_LIMIT)) return false;
// Convert a to a float128 and compare. The conversion will not lose any
// information because |a| <= 2^112 so it will fit within the mantissa
// bits that are available.
        float128_t aa = Float128::op(a);
        return f128_eq(aa, b);
    }
    else
    {
// Now the integer is rather big. If I was KEEN I would estimate the size of
// the float from its exponent and compare with the number of bits in the
// integer to filter out cases where their sizes were very different. However
// I am not feeling very keen! I can afford to convert the float to an integer,
// and because it is large when I fix it there will not be any discarded
// fractional part...
        std::intptr_t bb = round_float128_to_int(b);
        return op_dispatch2<Eqn,bool>(vector_to_handle(a), bb);
    }
}
6202 
op(std::int64_t a,float128_t b)6203 inline bool Eqn::op(std::int64_t a, float128_t b)
6204 {   return f128_eq(i64_to_f128(a), b);
6205 }
6206 
op(std::uint64_t * a,float128_t b)6207 inline bool Eqn::op(std::uint64_t *a, float128_t b)
6208 {   std::size_t lena = number_size(a);
6209     if (lena == 1) return Eqn::op(static_cast<std::int64_t>(a[0]), b);
6210     return eqnbigfloat(a, lena, b);
6211 }
6212 
op(float128_t a,std::int64_t b)6213 inline bool Eqn::op(float128_t a, std::int64_t b)
6214 {   return Eqn::op(b, a);
6215 }
6216 
op(float128_t a,std::uint64_t * b)6217 inline bool Eqn::op(float128_t a, std::uint64_t *b)
6218 {   return Eqn::op(b, a);
6219 }
6220 
6221 #endif // softfloat_h
6222 
op(std::uint64_t * a,std::uint64_t * b)6223 inline bool Neqn::op(std::uint64_t *a, std::uint64_t *b)
6224 {   std::size_t lena = number_size(a);
6225     std::size_t lenb = number_size(b);
6226     return !bigeqn(a, lena, b, lenb);
6227 }
6228 
op(std::uint64_t * a,std::int64_t b)6229 inline bool Neqn::op(std::uint64_t *a, std::int64_t b)
6230 {   std::size_t lena = number_size(a);
6231     return lena!=1 || static_cast<std::int64_t>(a[0])!=b;
6232 }
6233 
op(std::int64_t a,std::uint64_t * b)6234 inline bool Neqn::op(std::int64_t a, std::uint64_t *b)
6235 {   std::size_t lenb = number_size(b);
6236     return lenb!=1 || a!=static_cast<std::int64_t>(b[0]);
6237 }
6238 
op(std::int64_t a,std::int64_t b)6239 inline bool Neqn::op(std::int64_t a, std::int64_t b)
6240 {   return (a != b);
6241 }
6242 
op(std::int64_t a,float b)6243 inline bool Neqn::op(std::int64_t a, float b)
6244 {   return Neqn::op(a, static_cast<double>(b));
6245 }
6246 
op(std::uint64_t * a,float b)6247 inline bool Neqn::op(std::uint64_t *a, float b)
6248 {   return Neqn::op(a, static_cast<double>(b));
6249 }
6250 
op(float a,std::int64_t b)6251 inline bool Neqn::op(float a, std::int64_t b)
6252 {   return Neqn::op(static_cast<double>(a), b);
6253 }
6254 
op(float a,std::uint64_t * b)6255 inline bool Neqn::op(float a, std::uint64_t *b)
6256 {   return Neqn::op(static_cast<double>(a), b);
6257 }
6258 
// Inequality of an int64 and a double, with exact semantics for the
// numeric cases. NOTE(review): when b is a NaN this returns false, so
// Neqn is deliberately NOT the logical negation of Eqn for NaN inputs
// (both report false) - the "Ha Ha Ha!" marks that conscious choice.
inline bool Neqn::op(std::int64_t a, double b)
{
// Integers up to 2^53 in magnitude convert to double exactly.
    const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
    if (a >= -range && a <= range) return static_cast<double>(a) != b;
// The value on the next line is a floating point representation of 2^63,
// so any floating value at least that large is bigger than any int64_t value.
    if (b >= 9223372036854775808.0) return true;
// Asymmetric bound: -2^63 itself is a valid int64_t, hence "<" not "<=".
    else if (b < -9223372036854775808.0) return true;
    if (std::isnan(b)) return false;   // Ha Ha Ha!
// Here 2^53 < |b| < 2^63, so b is an exact integer and converts losslessly.
    return a != static_cast<std::int64_t>(b);
}
6269 
op(std::uint64_t * a,double b)6270 inline bool Neqn::op(std::uint64_t *a, double b)
6271 {   std::size_t lena = number_size(a);
6272     if (lena == 1) return Neqn::op(static_cast<std::int64_t>(a[0]), b);
6273     return !eqnfloat(a, lena, b);
6274 }
6275 
op(double a,std::int64_t b)6276 inline bool Neqn::op(double a, std::int64_t b)
6277 {   return Neqn::op(b, a);
6278 }
6279 
op(double a,std::uint64_t * b)6280 inline bool Neqn::op(double a, std::uint64_t *b)
6281 {   return Neqn::op(b, a);
6282 }
6283 
6284 #ifdef softfloat_h
6285 
op(std::int64_t a,float128_t b)6286 inline bool Neqn::op(std::int64_t a, float128_t b)
6287 {   return !f128_eq(i64_to_f128(a), b);
6288 }
6289 
op(std::uint64_t * a,float128_t b)6290 inline bool Neqn::op(std::uint64_t *a, float128_t b)
6291 {   std::size_t lena = number_size(a);
6292     if (lena == 1) return Neqn::op(static_cast<std::int64_t>(a[0]), b);
6293     return !eqnbigfloat(a, lena, b);
6294 }
6295 
op(float128_t a,std::int64_t b)6296 inline bool Neqn::op(float128_t a, std::int64_t b)
6297 {   return Neqn::op(b, a);
6298 }
6299 
op(float128_t a,std::uint64_t * b)6300 inline bool Neqn::op(float128_t a, std::uint64_t *b)
6301 {   return Neqn::op(b, a);
6302 }
6303 
6304 #endif // softfloat_h
6305 
6306 // greaterp
6307 
biggreaterp(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb)6308 inline bool biggreaterp(const std::uint64_t *a, std::size_t lena,
6309                         const std::uint64_t *b, std::size_t lenb)
6310 {   std::uint64_t a0 = a[lena-1], b0 = b[lenb-1];
6311 // If one of the numbers has more digits than the other then the sign of
6312 // the longer one gives my the answer.
6313     if (lena > lenb) return positive(a0);
6314     else if (lenb > lena) return negative(b0);
6315 // When the two numbers are the same length but the top digits differ
6316 // then comparing those digits tells me all I need to know.
6317     if (static_cast<std::int64_t>(a0) >
6318         static_cast<std::int64_t>(b0)) return true;
6319     if (static_cast<std::int64_t>(a0) <
6320         static_cast<std::int64_t>(b0)) return false;
6321 // Otherwise I need to scan down through digits...
6322     lena--;
6323     while (lena != 0)
6324     {   lena--;
6325         a0 = a[lena];
6326         b0 = b[lena];
6327         if (a0 > b0) return true;
6328         if (a0 < b0) return false;
6329     }
6330     return false;
6331 }
6332 
6333 // This version treats the two inputs as unsigned numbers. It is used from
6334 // within the GCD code (at least)
6335 
// Compare two digit vectors as UNSIGNED numbers: true when a > b. Used
// from within the GCD code (at least), where signs have been dealt with.
inline bool big_unsigned_greaterp(const std::uint64_t *a,
                                  std::size_t lena,
                                  const std::uint64_t *b, std::size_t lenb)
{
// With unsigned values a longer (canonical) number is always the larger.
    if (lena != lenb) return lena > lenb;
// Equal lengths: scan from the most significant digit downwards and the
// first difference decides.
    for (std::size_t i=lena; i-- != 0; )
    {   if (a[i] != b[i]) return a[i] > b[i];
    }
    return false;   // exactly equal
}
6353 
op(std::uint64_t * a,std::uint64_t * b)6354 inline bool Greaterp::op(std::uint64_t *a, std::uint64_t *b)
6355 {   std::size_t lena = number_size(a);
6356     std::size_t lenb = number_size(b);
6357     return biggreaterp(a, lena, b, lenb);
6358 }
6359 
op(std::uint64_t * a,std::int64_t bb)6360 inline bool Greaterp::op(std::uint64_t *a, std::int64_t bb)
6361 {   std::uint64_t b[1] = {static_cast<std::uint64_t>(bb)};
6362     std::size_t lena = number_size(a);
6363     return biggreaterp(a, lena, b, 1);
6364 }
6365 
op(std::int64_t aa,std::uint64_t * b)6366 inline bool Greaterp::op(std::int64_t aa, std::uint64_t *b)
6367 {   std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
6368     std::size_t lenb = number_size(b);
6369     return biggreaterp(a, 1, b, lenb);
6370 }
6371 
op(std::int64_t a,std::int64_t b)6372 inline bool Greaterp::op(std::int64_t a, std::int64_t b)
6373 {   return a > b;
6374 }
6375 
6376 // I can always widen a float to a double without loss of any information,
6377 // so all the cases of comparisons with floats (as distinct from with
6378 // double) are easy to delegate.
6379 
op(std::int64_t a,float b)6380 inline bool Greaterp::op(std::int64_t a, float b)
6381 {   return Greaterp::op(a, static_cast<double>(b));
6382 }
6383 
op(std::uint64_t * a,float b)6384 inline bool Greaterp::op(std::uint64_t *a, float b)
6385 {   return Greaterp::op(a, static_cast<double>(b));
6386 }
6387 
op(float a,std::int64_t b)6388 inline bool Greaterp::op(float a, std::int64_t b)
6389 {   return Greaterp::op(static_cast<double>(a), b);
6390 }
6391 
op(float a,std::uint64_t * b)6392 inline bool Greaterp::op(float a, std::uint64_t *b)
6393 {   return Greaterp::op(static_cast<double>(a), b);
6394 }
6395 
// Exact comparison a > b between an int64 and a double: no silent
// rounding of a is allowed to affect the outcome.
inline bool Greaterp::op(std::int64_t a, double b)
{
// If the integer is small enough it can be converted to a double
// without any rounding, so then I can do the comparison easily.
    const std::int64_t range = static_cast<std::int64_t>(1)<<53;
    if (a >= -range && a <= range) return static_cast<double>(a) > b;
// If the floating point value is >= 2^63 or is < -2^63 it is beyond
// the range of int64_t, so the result is easy. This situation includes
// the case of infinities. Note the asymmetry: -2^63 itself IS a valid
// int64_t value, hence ">=" on one side and "<" on the other.
    if (b >= 9223372036854775808.0) return false;
    else if (b < -9223372036854775808.0) return true;
// NaNs must always return false from a comparison, so all the cases so
// far will have yielded correct results. But here I must filter out
// that situation.
    if (std::isnan(b)) return false;
// Because |b| >= 2^53 but < 2^63 it can be converted to an int64_t value
// without rounding.
    return a > static_cast<std::int64_t>(b);
}
6415 
6416 // This compares a bignum against a double. It may in fact only be called
6417 // in the case where it is at least a 2-word bignum, and that would render
6418 // the first segment of code unnecessary!
6419 //
6420 // The code here feels ugly and perhaps repetitive to me. For now I will
6421 // just be content to get something that works in all cases, but thinking
6422 // about how to make it tidier will be desirable. I might perhaps also
6423 // think if generalizing it to have EQN and NEQN options in the CompareMode
6424 // enumeration.
6425 
// The four ordered comparisons, folded into one implementation in
// greaterpfloat() below, selected by this mode value.
enum CompareMode {GREATERP, GEQ, LESSP, LEQ};
6427 
greaterpfloat(std::uint64_t * a,std::size_t lena,double b,CompareMode mode)6428 inline bool greaterpfloat(std::uint64_t *a, std::size_t lena,
6429                           double b,
6430                           CompareMode mode)
6431 {   if (std::isnan(b)) return false;
6432 // If the integer is small enough it can be converted to a double
6433 // without any rounding, so then I can do the comparison easily.
6434     if (lena == 1)
6435     {   std::int64_t aa = a[0];
6436         const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
6437         if (aa >= -range && aa <= range)
6438         {   double ad = static_cast<double>(aa);
6439             switch (mode)
6440             {   case CompareMode::GREATERP:
6441                     return static_cast<double>(ad) > b;
6442                 case CompareMode::GEQ:
6443                     return static_cast<double>(ad) >= b;
6444                 case CompareMode::LESSP:
6445                     return static_cast<double>(ad) < b;
6446                 case CompareMode::LEQ:
6447                     return static_cast<double>(ad) <= b;
6448             }
6449         }
6450     }
6451 // If b==+infinity then a<b and a<=b, while if b=-=infinity then
6452 // a>b and a>=b.
6453     if (std::isinf(b))
6454     {   return (b > 0.0 && (mode==CompareMode::LESSP ||
6455                             mode==CompareMode::LEQ)) ||
6456                (b < 0.0 && (mode==CompareMode::GREATERP ||
6457                             mode==CompareMode::GEQ));
6458     }
6459 // Also if a and b have different signs it is easy to resolve the issue.
6460     if (negative(a[lena-1]) && b >= 0.0)
6461         return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6462     if (positive(a[lena-1]) && b <= 0.0)
6463         return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6464 // Now if I convert b to an integer and compare I can lose a fractional
6465 // part in the case when b is small. But given that |a| is large if I
6466 // truncate b as I map it onto an integer the comparisons I make will still
6467 // be valid.
6468     std::int64_t top;
6469     std::uint64_t next;
6470     std::size_t len;
6471     double_to_virtual_bignum(b, top, next, len, RoundingMode::TRUNC);
6472 // If the numbers now differ in length that can tell me what the result is.
6473     if (lena > len)
6474     {   if (negative(a[lena-1]))
6475             return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6476         if (positive(a[lena-1]))
6477             return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6478     }
6479     if (lena < len)
6480     {   if (positive(a[lena-1]))
6481             return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6482         if (negative(a[lena-1]))
6483             return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6484     }
6485 // Now the arguments have the same length as bignums. First check for
6486 // differences in the top two digits.
6487     if (static_cast<std::int64_t>(a[lena-1]) < top ||
6488         (static_cast<std::int64_t>(a[lena-1]) == top && a[lena-2] < next))
6489         return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6490     if (static_cast<std::int64_t>(a[lena-1]) > top ||
6491         (static_cast<std::int64_t>(a[lena-1]) == top && a[lena-2] > next))
6492         return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6493 // Now the top two digits of the two inputs match. If all lower digits of a
6494 // are zero then the two inputs are equal.
6495     for (std::size_t i=0; i<len; i++)
6496     {   if (a[i] != 0) return (mode==CompareMode::GREATERP ||
6497                                    mode==CompareMode::GEQ);
6498     }
6499 // Here the inputs seem to be exactly equal in value.
6500     return mode==CompareMode::GEQ || mode==CompareMode::LEQ;
6501 }
6502 
// Bignum > double.  A one-word bignum is delegated to the int64/double
// overload (which handles it exactly); longer ones use greaterpfloat().
inline bool Greaterp::op(std::uint64_t *a, double b)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Greaterp::op(static_cast<std::int64_t>(a[0]),
                                           b);
    return greaterpfloat(a, lena, b, CompareMode::GREATERP);
}

// a > b is exactly b < a, so delegate to Lessp with arguments swapped.
inline bool Greaterp::op(double a, std::int64_t b)
{   return Lessp::op(b, a);
}

inline bool Greaterp::op(double a, std::uint64_t *b)
{   return Lessp::op(b, a);
}
6517 
6518 #ifdef softfloat_h
6519 
6520 //@@ This is not sorted out yet!
6521 
6522 // This one function does >, >=, < and <= with "great" indicating if
6523 // the base is > or < and "ifequal" distinguishing > from >= and < from <=.
6524 
// Compare a bignum a (lena digits) against a 128-bit float b.  "great"
// selects > versus <, and "ifequal" turns the strict comparison into
// >= or <=.  Any comparison involving a NaN yields false.
inline bool greaterpbigfloat(std::uint64_t *a, std::size_t lena,
                             float128_t b,
                             bool great, bool ifequal)
{   if (f128_nan(b)) return
            false;  // Comparisons involving a NaN => false.
// If the operands have opposite signs the result is immediate.
    std::int64_t top = static_cast<std::int64_t>(a[lena-1]);
    if (top >= 0 && f128_lt(b, f128_0)) return great;
    if (top < 0 && !f128_lt(b, f128_0)) return !great;
// Now the two inputs have the same sign.
// The condition below tests |a| <= 2^112, i.e. a fits exactly in the
// 113-bit float128 mantissa (0x0001000000000000 == 2^48 bounds the top
// word of a two-word value of that size).
    if (lena == 1 ||
        (lena == 2 &&
         !((a[1] > 0x0001000000000000 ||
            (a[1] == 0x0001000000000000 && a[0] != 0)) ||
           static_cast<std::int64_t>(a[1]) < -static_cast<std::int64_t>
           (0x0001000000000000))))
    {
// Here the integer is of modest size - if the float is huge we can
// resolve matters cheaply.
        if (f128_lt(FP128_INT_LIMIT, b)) return !great;
        if (f128_lt(b, FP128_MINUS_INT_LIMIT)) return great;
// Convert a to a float128 and compare. The conversion will not lose any
// information because the |a| <= 2^112 so it will fit within the mantissa
// bits that are available.
        float128_t aa = Float128::op(a);
        if (great)
        {   if (ifequal) return f128_le(b, aa);
            else return f128_lt(b, aa);
        }
        else
        {   if (ifequal) return f128_le(aa, b);
            else return f128_lt(aa, b);
        }
    }
    else
    {
// Now the integer is rather big. If I was KEEN I would estimate the size of
// the float from its exponent and compare with the number of bits in the
// integer to filter out cases where their sized were very different. However
// I am not feeling very keen! I can afford to convert the float to an integer,
// and because it is large when I fix it there will not be any discarded
// fractional part...
        // ...
        std::intptr_t bb = round_float128_to_int(b);
// At the moment I think there is a space-leak on bb here...
// NOTE(review): bb is a freshly allocated bignum handle that is never
// released before returning - confirm whether the allocator reclaims it.
        if (great)
            if (ifequal)
                return op_dispatch2<Geq,bool>(vector_to_handle(a), bb);
            else return op_dispatch2<Greaterp,bool>(vector_to_handle(a), bb);
        else if (ifequal)
            return op_dispatch2<Leq,bool>(vector_to_handle(a), bb);
        else return op_dispatch2<Lessp,bool>(vector_to_handle(a), bb);
    }
}
6578 
// a > b is tested as b < a using the softfloat primitive; i64_to_f128 is
// exact since the float128 mantissa is wider than 64 bits.
inline bool Greaterp::op(std::int64_t a, float128_t b)
{   return f128_lt(b, i64_to_f128(a));
}

// Bignum > float128: one-word bignums go via the exact int64 case,
// longer ones through greaterpbigfloat (strict, ">").
inline bool Greaterp::op(std::uint64_t *a, float128_t b)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Greaterp::op(static_cast<std::int64_t>(a[0]),
                                           b);
    return greaterpbigfloat(a, lena, b, true, false);

}

// a > b is exactly b < a, so delegate to Lessp with arguments swapped.
inline bool Greaterp::op(float128_t a, std::int64_t b)
{   return Lessp::op(b, a);
}

inline bool Greaterp::op(float128_t a, std::uint64_t *b)
{   return Lessp::op(b, a);
}
6598 
6599 #endif // softfloat_h
6600 
6601 // geq
6602 
// geq: a >= b is implemented as !(b > a) where possible, which keeps
// NaN-free integer cases consistent with Greaterp.
inline bool Geq::op(std::uint64_t *a, std::uint64_t *b)
{   return !Greaterp::op(b, a);
}

inline bool Geq::op(std::uint64_t *a, std::int64_t b)
{   return !Greaterp::op(b, a);
}

inline bool Geq::op(std::int64_t a, std::uint64_t *b)
{   return !Greaterp::op(b, a);
}

inline bool Geq::op(std::int64_t a, std::int64_t b)
{   return a >= b;
}

// float operands are widened to double: exact, since every float value
// is representable as a double.
inline bool Geq::op(std::int64_t a, float b)
{   return Geq::op(a, static_cast<double>(b));
}

inline bool Geq::op(std::uint64_t *a, float b)
{   return Geq::op(a, static_cast<double>(b));
}

inline bool Geq::op(float a, std::int64_t b)
{   return Geq::op(static_cast<double>(a), b);
}

inline bool Geq::op(float a, std::uint64_t *b)
{   return Geq::op(static_cast<double>(a), b);
}

// int64 >= double.  Values within +/-2^53 convert to double exactly, so
// compare as doubles.  Beyond that b is checked against the int64 range
// (comparisons against NaN are false, so the isnan test afterwards still
// catches it), and finally b is truncated to an integer: safe because
// here |a| > 2^53, so a fractional part of b cannot affect the answer.
inline bool Geq::op(std::int64_t a, double b)
{   const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
    if (a >= -range && a <= range) return static_cast<double>(a) >= b;
    if (b >= 9223372036854775808.0) return false;
    else if (b < -9223372036854775808.0) return true;
    if (std::isnan(b)) return false;
    return a >= static_cast<std::int64_t>(b);
}

inline bool Geq::op(std::uint64_t *a, double b)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Geq::op(static_cast<std::int64_t>(a[0]), b);
    return greaterpfloat(a, lena, b, CompareMode::GEQ);
}

// a >= b is exactly b <= a.
inline bool Geq::op(double a, std::int64_t b)
{   return Leq::op(b, a);
}

inline bool Geq::op(double a, std::uint64_t *b)
{   return Leq::op(b, a);
}
6657 
6658 #ifdef softfloat_h
6659 
op(std::int64_t a,float128_t b)6660 inline bool Geq::op(std::int64_t a, float128_t b)
6661 {   return f128_le(b, i64_to_f128(a));
6662     return false;
6663 }
6664 
op(std::uint64_t * a,float128_t b)6665 inline bool Geq::op(std::uint64_t *a, float128_t b)
6666 {   std::size_t lena = number_size(a);
6667     if (lena == 1) return Greaterp::op(static_cast<std::int64_t>(a[0]),
6668                                            b);
6669     return greaterpbigfloat(a, lena, b, true, true);
6670 }
6671 
// a >= b is exactly b <= a, so delegate to Leq with arguments swapped.
inline bool Geq::op(float128_t a, std::int64_t b)
{   return Leq::op(b, a);
}

inline bool Geq::op(float128_t a, std::uint64_t *b)
{   return Leq::op(b, a);
}
6679 
6680 #endif // softfloat_h
6681 
6682 // lessp
6683 
// lessp: a < b is exactly b > a, so most cases swap the arguments and
// delegate to Greaterp.
inline bool Lessp::op(std::uint64_t *a, std::uint64_t *b)
{   return Greaterp::op(b, a);
}

inline bool Lessp::op(std::uint64_t *a, std::int64_t b)
{   return Greaterp::op(b, a);
}

inline bool Lessp::op(std::int64_t a, std::uint64_t *b)
{   return Greaterp::op(b, a);
}

inline bool Lessp::op(std::int64_t a, std::int64_t b)
{   return a < b;
}

// float operands are widened to double: exact, since every float value
// is representable as a double.
inline bool Lessp::op(std::int64_t a, float b)
{   return Lessp::op(a, static_cast<double>(b));
}

inline bool Lessp::op(std::uint64_t *a, float b)
{   return Lessp::op(a, static_cast<double>(b));
}

inline bool Lessp::op(float a, std::int64_t b)
{   return Lessp::op(static_cast<double>(a), b);
}

inline bool Lessp::op(float a, std::uint64_t *b)
{   return Lessp::op(static_cast<double>(a), b);
}

// int64 < double.  Within +/-2^53 compare exactly as doubles; otherwise
// classify b against the int64 range (NaN compares false everywhere and
// is caught explicitly) and then truncate b, which is safe because here
// |a| > 2^53 so b's fractional part cannot change the answer.
inline bool Lessp::op(std::int64_t a, double b)
{   const std::int64_t range = static_cast<std::int64_t>(1)<<53;
    if (a >= -range && a <= range) return static_cast<double>(a) < b;
    if (b >= 9223372036854775808.0) return true;
    else if (b < -9223372036854775808.0) return false;
    if (std::isnan(b)) return false;
    return a < static_cast<std::int64_t>(b);
}

inline bool Lessp::op(std::uint64_t *a, double b)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Lessp::op(static_cast<std::int64_t>(a[0]), b);
    return greaterpfloat(a, lena, b, CompareMode::LESSP);
}

// a < b is exactly b > a.
inline bool Lessp::op(double a, std::int64_t b)
{   return Greaterp::op(b, a);
}

inline bool Lessp::op(double a, std::uint64_t *b)
{   return Greaterp::op(b, a);
}
6738 
6739 #ifdef softfloat_h
6740 
op(std::int64_t a,float128_t b)6741 inline bool Lessp::op(std::int64_t a, float128_t b)
6742 {   return f128_lt(i64_to_f128(a), b);
6743     return false;
6744 }
6745 
op(std::uint64_t * a,float128_t b)6746 inline bool Lessp::op(std::uint64_t *a, float128_t b)
6747 {   std::size_t lena = number_size(a);
6748     if (lena == 1) return Lessp::op(static_cast<std::int64_t>(a[0]), b);
6749     return greaterpbigfloat(a, lena, b, false, false);
6750 
6751     return false;
6752 }
6753 
// a < b is exactly b > a, so delegate to Greaterp with arguments swapped.
inline bool Lessp::op(float128_t a, std::int64_t b)
{   return Greaterp::op(b, a);
}

inline bool Lessp::op(float128_t a, std::uint64_t *b)
{   return Greaterp::op(b, a);
}
6761 
6762 #endif // softfloat_h
6763 
6764 // leq
6765 
// leq: a <= b is implemented as !(a > b) for the integer cases.
inline bool Leq::op(std::uint64_t *a, std::uint64_t *b)
{   return !Greaterp::op(a, b);
}

inline bool Leq::op(std::uint64_t *a, std::int64_t b)
{   return !Greaterp::op(a, b);
}

inline bool Leq::op(std::int64_t a, std::uint64_t *b)
{   return !Greaterp::op(a, b);
}

inline bool Leq::op(std::int64_t a, std::int64_t b)
{   return a <= b;
}

// float operands are widened to double: exact, since every float value
// is representable as a double.
inline bool Leq::op(std::int64_t a, float b)
{   return Leq::op(a, static_cast<double>(b));
}

inline bool Leq::op(std::uint64_t *a, float b)
{   return Leq::op(a, static_cast<double>(b));
}

inline bool Leq::op(float a, std::int64_t b)
{   return Leq::op(static_cast<double>(a), b);
}

inline bool Leq::op(float a, std::uint64_t *b)
{   return Leq::op(static_cast<double>(a), b);
}

// int64 <= double.  Within +/-2^53 compare exactly as doubles; otherwise
// classify b against the int64 range (NaN compares false and is caught
// explicitly below) and then truncate b, which is safe because here
// |a| > 2^53 so b's fractional part cannot change the answer.
inline bool Leq::op(std::int64_t a, double b)
{   const std::int64_t range = static_cast<std::int64_t>(1)<<53;
    if (a >= -range && a <= range) return static_cast<double>(a) <= b;
    if (b >= 9223372036854775808.0) return true;
    else if (b < -9223372036854775808.0) return false;
    if (std::isnan(b)) return false;
    return a <= static_cast<std::int64_t>(b);
}
6806 
op(std::uint64_t * a,double b)6807 inline bool Leq::op(std::uint64_t *a, double b)
6808 {   std::size_t lena = number_size(a);
6809     if (lena == 1) return Lessp::op(static_cast<std::int64_t>(a[0]), b);
6810     return greaterpfloat(a, lena, b, CompareMode::LEQ);
6811 }
6812 
// a <= b is exactly b >= a, so delegate to Geq with arguments swapped.
inline bool Leq::op(double a, std::int64_t b)
{   return Geq::op(b, a);
}

inline bool Leq::op(double a, std::uint64_t *b)
{   return Geq::op(b, a);
}
6820 
6821 #ifdef softfloat_h
6822 
op(std::int64_t a,float128_t b)6823 inline bool Leq::op(std::int64_t a, float128_t b)
6824 {   return f128_le(i64_to_f128(a), b);
6825     return false;
6826 }
6827 
// Bignum <= float128: one-word bignums go via the exact int64 case,
// longer ones through greaterpbigfloat ("<" with equality allowed).
inline bool Leq::op(std::uint64_t *a, float128_t b)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Leq::op(static_cast<std::int64_t>(a[0]), b);
    return greaterpbigfloat(a, lena, b, false, true);
}
6833 
// a <= b is exactly b >= a, so delegate to Geq with arguments swapped.
inline bool Leq::op(float128_t a, std::int64_t b)
{   return Geq::op(b, a);
}

inline bool Leq::op(float128_t a, std::uint64_t *b)
{   return Geq::op(b, a);
}
6841 
6842 #endif // softfloat_h
6843 
6844 
6845 // Negation, addition and subtraction. These are easy apart from a mess
6846 // concerning the representation of positive numbers that risk having the
6847 // most significant bit of their top word a 1, and the equivalent for
6848 // negative numbers.
6849 // Boolean operations all treat negative numbers as if there had been an
6850 // unending string of 1 bits before the stop bit that is stored.
6851 //=========================================================================
6852 //=========================================================================
6853 
6854 
6855 // Negation. Note that because I am using 2s complement the result could be
6856 // one word longer or shorter than the input. For instance if you negate
6857 // [0x8000000000000000] (a negative value) you get [0,0x8000000000000000],
6858 // and vice versa.
6859 
// Negate the lena-digit two's complement bignum a into r; the result can
// grow or shrink by one digit, so r must have room for lena+1 words.
// lenr receives the length actually used.
inline void bignegate(const std::uint64_t *a, std::size_t lena,
                      std::uint64_t *r, std::size_t &lenr)
{   internal_negate(a, lena, r);
// When I negate (-(2^(64n-1))) I will need to place a zero word ahead of the
// value that is now positive, making the bignum one digit longer.
// If I have 2^(64n-1) it will have been represented with that padding zero
// ahead of it, but when negated the bignum can shrink.
    if (r[lena-1]==topbit) r[lena++] = 0;
    else if (r[lena-1]==UINT64_C(0xffffffffffffffff) && lena>1 &&
             negative(r[lena-2])) lena--;
    lenr = lena;
}
6872 
// Negate a bignum.  One extra word is reserved because negation can
// lengthen the value (see bignegate); confirm_size trims to final_n.
inline std::intptr_t Minus::op(std::uint64_t *a)
{   std::size_t n = number_size(a);
    push(a);
    std::uint64_t *p = reserve(n+1);
    pop(a);
    std::size_t final_n;
    bignegate(a, n, p, final_n);
    return confirm_size(p, n+1, final_n);
}
6882 
6883 // The following can only be called via op_dispatch1(), and in that
6884 // case the argument has to have started off as a fixnum. In such cases
6885 // the result will also be a fixnum except when negating MIN_FIXNUM. But
6886 // even in that case (-a) can not overflow 64-bit arithmetic because
6887 // the fixnum will have had at least one tag bit.
6888 
// Negate a fixnum.  -MIN_FIXNUM does not fit back in a fixnum, so that
// one case promotes to a bignum; the negation itself cannot overflow
// int64 because fixnums carry at least one tag bit.
inline std::intptr_t Minus::op(std::int64_t a)
{   if (a == MIN_FIXNUM) return int_to_bignum(-a);
    else return int_to_handle(-a);
}

// add1/sub1 on a bignum delegate to general addition of +/-1.
inline std::intptr_t Add1::op(std::uint64_t *a)
{   return Plus::op(a, 1);
}

// add1 on a fixnum: a+1 cannot overflow int64 (tag bits give headroom),
// and int_to_bignum boxes the result only if it leaves fixnum range.
inline std::intptr_t Add1::op(std::int64_t a)
{   return int_to_bignum(a+1);
}

inline std::intptr_t Sub1::op(std::uint64_t *a)
{   return Plus::op(a, -1);
}

inline std::intptr_t Sub1::op(std::int64_t a)
{   return int_to_bignum(a-1);
}
6909 
op(std::uint64_t * a)6910 inline std::intptr_t Abs::op(std::uint64_t *a)
6911 {   std::size_t n = number_size(a);
6912     if (!negative(a[n-1]))
6913     {   push(a);
6914         std::uint64_t *r = reserve(n);
6915         pop(a);
6916         std::memcpy(r, a, n*sizeof(std::uint64_t));
6917         return confirm_size(r, n, n);
6918     }
6919     push(a);
6920     std::uint64_t *r = reserve(n+1);
6921     pop(a);
6922     std::size_t final_n;
6923     bignegate(a, n, r, final_n);
6924     return confirm_size(r, n+1, final_n);
6925 }
6926 
6927 // The following can only be called via op_dispatch1(), and in that
6928 // case the argument has to have started off as a fixnum. In such cases
6929 // the result will also be a fixnum except when negating MIN_FIXNUM. But
6930 // even in that case (-a) can not overflow 64-bit arithmetic because
6931 // the fixnum will have had at least one tag bit.
6932 
// Absolute value of a fixnum.  |MIN_FIXNUM| may not fit back in a
// fixnum, so that case promotes via unsigned_int_to_bignum.
inline std::intptr_t Abs::op(std::int64_t a)
{   if (a == MIN_FIXNUM) return unsigned_int_to_bignum(-a);
    else return int_to_handle(a<0 ? -a : a);
}
6937 
6938 // The "bitnot" operation is simple and length can not change.
6939 
// Bitwise NOT of a lena-digit two's complement bignum: complement every
// digit of a into r.  The result length always equals the input length.
inline void biglognot(const std::uint64_t *a, std::size_t lena,
                      std::uint64_t *r, std::size_t &lenr)
{   std::size_t i = lena;
    while (i-- != 0) r[i] = ~a[i];
    lenr = lena;
}
6947 
// Bitwise NOT of a bignum.  biglognot never changes the length, but one
// spare word is reserved anyway; confirm_size trims to final_n.
inline std::intptr_t Lognot::op(std::uint64_t *a)
{   std::size_t n = number_size(a);
    push(a);
    std::uint64_t *p = reserve(n+1);
    pop(a);
    std::size_t final_n;
    biglognot(a, n, p, final_n);
    return confirm_size(p, n+1, final_n);
}

// Bitwise NOT of a fixnum: ~a always fits back in a fixnum.
inline std::intptr_t Lognot::op(std::int64_t a)
{   return int_to_handle(~a);
}
6961 
6962 // logand
6963 
// AND of two bignums where lena >= lenb.  The shorter operand b behaves
// as if sign-extended: when b is negative its implicit high words are
// all ones, so a's high words pass through; when b is non-negative the
// implicit high words are zero and the result stops at lenb digits.
inline void ordered_biglogand(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *r, std::size_t &lenr)
{   for (std::size_t i=0; i<lenb; i++)
        r[i] = a[i] & b[i];
    if (negative(b[lenb-1]))
    {   for (std::size_t i=lenb; i<lena; i++) r[i] = a[i];
        lenr = lena;
    }
    else lenr = lenb;
    truncate_positive(r, lenr);
}

// General AND: order the operands by length and delegate.
inline void biglogand(const std::uint64_t *a, std::size_t lena,
                      const std::uint64_t *b, std::size_t lenb,
                      std::uint64_t *r, std::size_t &lenr)
{   if (lena >= lenb) return ordered_biglogand(a, lena, b, lenb, r,
                                 lenr);
    else return ordered_biglogand(b, lenb, a, lena, r, lenr);
}
6985 
// AND of two bignums.  The result can need at most max(lena, lenb)
// digits; confirm_size trims to the length biglogand reports.
inline std::intptr_t Logand::op(std::uint64_t *a, std::uint64_t *b)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::size_t n;
    if (lena >= lenb) n = lena;
    else n = lenb;
    push(a); push(b);
    std::uint64_t *p = reserve(n);
    pop(b); pop(a);
    std::size_t final_n;
    biglogand(a, lena, b, lenb, p, final_n);
    return confirm_size(p, n, final_n);
}

// The next two are not optimised - a case of (logand bignum positive-fixnum)
// is guaranteed to end up a fixnum so could be done more slickly - however
// I am not going to expect that to be on the critical performance path for
// enough programs for me to worry too much!

// Mixed bignum/fixnum AND: wrap the fixnum as a one-digit bignum.
inline std::intptr_t Logand::op(std::uint64_t *a, std::int64_t b)
{   std::size_t lena = number_size(a);
    push(a);
    std::uint64_t *p = reserve(lena);
    pop(a);
    std::size_t final_n;
    std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
    biglogand(a, lena, bb, 1, p, final_n);
    return confirm_size(p, lena, final_n);
}

inline std::intptr_t Logand::op(std::int64_t a, std::uint64_t *b)
{   std::size_t lenb = number_size(b);
    push(b);
    std::uint64_t *p = reserve(lenb);
    pop(b);
    std::size_t final_n;
    std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
    biglogand(aa, 1, b, lenb, p, final_n);
    return confirm_size(p, lenb, final_n);
}

// Fixnum AND fixnum always fits in a fixnum.
inline std::intptr_t Logand::op(std::int64_t a, std::int64_t b)
{   return int_to_handle(a & b);
}
7030 
7031 // logor
7032 
// OR of two bignums where lena >= lenb.  The shorter operand b behaves
// as if sign-extended: when b is negative its implicit high words are
// all ones so the result stops at lenb digits; otherwise a's high words
// pass through unchanged.
inline void ordered_biglogor(const std::uint64_t *a, std::size_t lena,
                             const std::uint64_t *b, std::size_t lenb,
                             std::uint64_t *r, std::size_t &lenr)
{   for (std::size_t i=0; i<lenb; i++)
        r[i] = a[i] | b[i];
    if (negative(b[lenb-1])) lenr = lenb;
    else
    {   for (std::size_t i=lenb; i<lena; i++) r[i] = a[i];
        lenr = lena;
    }
    truncate_negative(r, lenr);
}

// General OR: order the operands by length and delegate.
inline void biglogor(const std::uint64_t *a, std::size_t lena,
                     const std::uint64_t *b, std::size_t lenb,
                     std::uint64_t *r, std::size_t &lenr)
{   if (lena >= lenb) return ordered_biglogor(a, lena, b, lenb, r,
                                 lenr);
    else return ordered_biglogor(b, lenb, a, lena, r, lenr);
}
7053 
// OR of two bignums: needs at most max(lena, lenb) digits.
inline std::intptr_t Logor::op(std::uint64_t *a, std::uint64_t *b)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::size_t n;
    if (lena >= lenb) n = lena;
    else n = lenb;
    push(a); push(b);
    std::uint64_t *p = reserve(n);
    pop(b); pop(a);
    std::size_t final_n;
    biglogor(a, lena, b, lenb, p, final_n);
    return confirm_size(p, n, final_n);
}

// Mixed bignum/fixnum OR: wrap the fixnum as a one-digit bignum.
inline std::intptr_t Logor::op(std::uint64_t *a, std::int64_t b)
{   std::size_t lena = number_size(a);
    push(a);
    std::uint64_t *p = reserve(lena);
    pop(a);
    std::size_t final_n;
    std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
    biglogor(a, lena, bb, 1, p, final_n);
    return confirm_size(p, lena, final_n);
}

inline std::intptr_t Logor::op(std::int64_t a, std::uint64_t *b)
{   std::size_t lenb = number_size(b);
    push(b);
    std::uint64_t *p = reserve(lenb);
    pop(b);
    std::size_t final_n;
    std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
    biglogor(aa, 1, b, lenb, p, final_n);
    return confirm_size(p, lenb, final_n);
}

// Fixnum OR fixnum always fits in a fixnum.
inline std::intptr_t Logor::op(std::int64_t a, std::int64_t b)
{   return int_to_handle(a | b);
}
7093 
7094 // logxor
7095 
// XOR of two bignums where lena >= lenb.  The shorter operand b behaves
// as if sign-extended: negative b XORs a's high words with all ones
// (i.e. complements them), non-negative b leaves them unchanged.
inline void ordered_biglogxor(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *r, std::size_t &lenr)
{   std::size_t i;
    for (i=0; i<lenb; i++)
        r[i] = a[i] ^ b[i];
    if (negative(b[lenb-1]))
    {   for (; i<lena; i++)
            r[i] = ~a[i];
    }
    else
    {   for (; i<lena; i++)
            r[i] = a[i];
    }
    lenr = lena;
// The logxor operation can cause the inputs to shrink.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}

// General XOR: order the operands by length and delegate.
inline void biglogxor(const std::uint64_t *a, std::size_t lena,
                      const std::uint64_t *b, std::size_t lenb,
                      std::uint64_t *r, std::size_t &lenr)
{   if (lena >= lenb) return ordered_biglogxor(a, lena, b, lenb, r,
                                 lenr);
    else return ordered_biglogxor(b, lenb, a, lena, r, lenr);
}
7124 
// XOR of two bignums: needs at most max(lena, lenb) digits.
inline std::intptr_t Logxor::op(std::uint64_t *a, std::uint64_t *b)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::size_t n;
    if (lena >= lenb) n = lena;
    else n = lenb;
    push(a); push(b);
    std::uint64_t *p = reserve(n);
    pop(b); pop(a);
    std::size_t final_n;
    biglogxor(a, lena, b, lenb, p, final_n);
    return confirm_size(p, n, final_n);
}

// Mixed bignum/fixnum XOR: wrap the fixnum as a one-digit bignum.
inline std::intptr_t Logxor::op(std::uint64_t *a, std::int64_t b)
{   std::size_t lena = number_size(a);
    push(a);
    std::uint64_t *p = reserve(lena);
    pop(a);
    std::size_t final_n;
    std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
    biglogxor(a, lena, bb, 1, p, final_n);
    return confirm_size(p, lena, final_n);
}

inline std::intptr_t Logxor::op(std::int64_t a, std::uint64_t *b)
{   std::size_t lenb = number_size(b);
    push(b);
    std::uint64_t *p = reserve(lenb);
    pop(b);
    std::size_t final_n;
    std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
    biglogxor(aa, 1, b, lenb, p, final_n);
    return confirm_size(p, lenb, final_n);
}

// Fixnum XOR fixnum always fits in a fixnum.
inline std::intptr_t Logxor::op(std::int64_t a, std::int64_t b)
{   return int_to_handle(a ^ b);
}
7164 
7165 
// EQV (bitwise equivalence, ~(a^b)) of two bignums where lena >= lenb.
// The shorter operand b behaves as if sign-extended: negative b leaves
// a's high words unchanged (eqv with all-ones), non-negative b
// complements them (eqv with zero).
inline void ordered_biglogeqv(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *r, std::size_t &lenr)
{   std::size_t i;
    for (i=0; i<lenb; i++)
        r[i] = ~a[i] ^ b[i];
    if (negative(b[lenb-1]))
    {   for (; i<lena; i++)
            r[i] = a[i];
    }
    else
    {   for (; i<lena; i++)
            r[i] = ~a[i];
    }
    lenr = lena;
// The logeqv operation can cause the result to shrink.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}

// General EQV: order the operands by length and delegate.
inline void biglogeqv(const std::uint64_t *a, std::size_t lena,
                      const std::uint64_t *b, std::size_t lenb,
                      std::uint64_t *r, std::size_t &lenr)
{   if (lena >= lenb) return ordered_biglogeqv(a, lena, b, lenb, r,
                                 lenr);
    else return ordered_biglogeqv(b, lenb, a, lena, r, lenr);
}
7194 
// EQV of two bignums: needs at most max(lena, lenb) digits.
inline std::intptr_t Logeqv::op(std::uint64_t *a, std::uint64_t *b)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::size_t n;
    if (lena >= lenb) n = lena;
    else n = lenb;
    push(a); push(b);
    std::uint64_t *p = reserve(n);
    pop(b); pop(a);
    std::size_t final_n;
    biglogeqv(a, lena, b, lenb, p, final_n);
    return confirm_size(p, n, final_n);
}

// Mixed bignum/fixnum EQV: wrap the fixnum as a one-digit bignum.
inline std::intptr_t Logeqv::op(std::uint64_t *a, std::int64_t b)
{   std::size_t lena = number_size(a);
    push(a);
    std::uint64_t *p = reserve(lena);
    pop(a);
    std::size_t final_n;
    std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
    biglogeqv(a, lena, bb, 1, p, final_n);
    return confirm_size(p, lena, final_n);
}

inline std::intptr_t Logeqv::op(std::int64_t a, std::uint64_t *b)
{   std::size_t lenb = number_size(b);
    push(b);
    std::uint64_t *p = reserve(lenb);
    pop(b);
    std::size_t final_n;
    std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
    biglogeqv(aa, 1, b, lenb, p, final_n);
    return confirm_size(p, lenb, final_n);
}

// Fixnum EQV fixnum always fits in a fixnum.
inline std::intptr_t Logeqv::op(std::int64_t a, std::int64_t b)
{   return int_to_handle(~a ^ b);
}
7234 
7235 inline void bigrightshift(const std::uint64_t *a, std::size_t lena,
7236                           std::int64_t n,
7237                           std::uint64_t *r, std::size_t &lenr);
7238 
// Shift the lena-digit bignum a left by n bits into r, preserving sign.
// A negative n delegates to bigrightshift.  r must have room for
// lena + n/64 + 1 digits; lenr receives the trimmed length.
inline void bigleftshift(const std::uint64_t *a, std::size_t lena,
                         std::int64_t n,
                         std::uint64_t *r, std::size_t &lenr)
{   if (n == 0)
    {   internal_copy(a, lena, r);
        lenr = lena;
        return;
    }
    else if (n < 0)
    {   bigrightshift(a, lena, -n, r, lenr);
        return;
    }
    std::size_t words = n/64;
    std::size_t bits = n % 64;
    for (std::size_t i=0; i<words; i++) r[i] = 0;
    if (bits == 0)
    {   for (std::size_t i=0; i<lena; i++)
            r[i+words] = a[i];
        lenr = lena+words;
    }
    else
    {
// Each output digit combines two adjacent input digits; the top output
// digit is the sign extension (all ones for a negative a, zero
// otherwise) merged with the bits shifted out of a's top digit.  Note
// bits != 0 here, so the 64-bits shifts below are well defined.
        r[words] = a[0]<<bits;
        for (std::size_t i=1; i<lena; i++)
            r[i+words] = (a[i]<<bits) |
                         (a[i-1]>>(64-bits));
        r[words+lena] = (negative(a[lena-1]) ?
                         static_cast<std::uint64_t>(-1)<<bits :
                         0) | (a[lena-1]>>(64-bits));
        lenr = lena+words+1;
    }
// The shift may leave redundant leading digits; trim them.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}
7272 
7273 inline std::intptr_t rightshift_b(std::uint64_t *a, std::int64_t n);
7274 
op(std::uint64_t * a,std::int64_t n)7275 inline std::intptr_t LeftShift::op(std::uint64_t *a, std::int64_t n)
7276 {   if (n == 0) return copy_if_no_garbage_collector(a);
7277     else if (n < 0) return RightShift::op(a, -n);
7278     std::size_t lena = number_size(a);
7279     std::size_t nr = lena + (n/64) + 1;
7280     push(a);
7281     std::uint64_t *p = reserve(nr);
7282     pop(a);
7283     std::size_t final_n;
7284     bigleftshift(a, lena, n, p, final_n);
7285     return confirm_size(p, nr, final_n);
7286 }
7287 
op(std::int64_t aa,std::int64_t n)7288 inline std::intptr_t LeftShift::op(std::int64_t aa, std::int64_t n)
7289 {   if (n == 0) return int_to_handle(aa);
7290     else if (n < 0) return RightShift::op(aa, -n);
7291     std::size_t nr = (n/64) + 2;
7292     std::uint64_t *p = reserve(nr);
7293     std::size_t final_n;
7294     std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
7295     bigleftshift(a, 1, n, p, final_n);
7296     return confirm_size(p, nr, final_n);
7297 }
7298 
// Arithmetic right shift of the signed bignum a by n bits, writing the
// digits to r and the digit count to lenr. A negative n delegates to
// bigleftshift. The result needs at most lena - n/64 digits (and at
// least one).
inline void bigrightshift(const std::uint64_t *a, std::size_t lena,
                          std::int64_t n,
                          std::uint64_t *r, std::size_t &lenr)
{   if (n == 0)
    {   internal_copy(a, lena, r);
        lenr = lena;
        return;
    }
    else if (n < 0)
    {   bigleftshift(a, lena, -n, r, lenr);
        return;
    }
    std::size_t words = n/64;   // whole-digit part of the shift
    std::size_t bits = n % 64;  // residual bit-level part, 0..63
    if (words >= lena)
    {
// Everything significant has been shifted out: only the sign survives.
        r[0] = negative(a[lena-1]) ? -static_cast<std::uint64_t>(1) : 0;
        lenr = 1;
    }
    else if (bits == 0)
    {   for (std::size_t i=0; i<lena-words; i++)
            r[i] = a[i+words];
        lenr = lena-words;
    }
    else
    {   for (std::size_t i=0; i<lena-words-1; i++)
            r[i] = (a[i+words]>>bits) |
                   (a[i+words+1]<<(64-bits));
// The top digit needs an arithmetic shift so the sign bit propagates.
        r[lena-words-1] = ASR(a[lena-1], bits);
        lenr = lena-words;
    }
// Trim redundant leading digits.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}
7332 
op(std::uint64_t * a,std::int64_t n)7333 inline std::intptr_t RightShift::op(std::uint64_t *a, std::int64_t n)
7334 {   if (n == 0) return copy_if_no_garbage_collector(a);
7335     else if (n < 0) return LeftShift::op(a, -n);
7336     std::size_t lena = number_size(a);
7337     std::size_t nr;
7338     if (lena > static_cast<std::size_t>(n)/64) nr = lena - n/64;
7339     else nr = 1;
7340     push(a);
7341     std::uint64_t *p = reserve(nr);
7342     pop(a);
7343     std::size_t final_n;
7344     bigrightshift(a, lena, n, p, final_n);
7345     return confirm_size(p, nr, final_n);
7346 }
7347 
op(std::int64_t a,std::int64_t n)7348 inline std::intptr_t RightShift::op(std::int64_t a, std::int64_t n)
7349 {   if (n == 0) return int_to_handle(a);
7350     else if (n < 0) return LeftShift::op(a, -n);
7351 // Shifts of 64 and up obviously lose all the input data apart from its
7352 // sign, but so does a shift by 63.
7353     if (n >= 63) return int_to_handle(a>=0 ? 0 : -1);
7354 // Because C++ does not guarantee that right shifts on signed values
7355 // duplicate the sign bit I perform the "shift" here using division by
7356 // a power of 2. Because I have n <= 62 here I will not get overflow.
7357     std::int64_t q = static_cast<std::int64_t>(1)<<n;
7358     return int_to_handle((a & ~(q-1))/q);
7359 }
7360 
// Bit-length (integer-length) of a bignum, delegating to bignum_bits.
inline std::size_t IntegerLength::op(std::uint64_t *a)
{   return bignum_bits(a, number_size(a));
}
7364 
op(std::int64_t aa)7365 inline std::size_t IntegerLength::op(std::int64_t aa)
7366 {   std::uint64_t a;
7367     if (aa == 0 || aa == -1) return 0;
7368     else if (aa < 0) a = -static_cast<std::uint64_t>(aa) - 1;
7369     else a = aa;
7370     return static_cast<std::size_t>(64-nlz(a));
7371 }
7372 
// For a positive bignum this returns the 1-based position of the
// lowest set bit; for a negative one it complements first and so finds
// the lowest clear bit; for zero it returns 0.
inline std::size_t Low_bit::op(std::uint64_t *a)
{   std::size_t lena = number_size(a);
    if (negative(a[lena-1])) // count trailing 1 bits!
    {   std::size_t r=0, i=0;
// Skip whole digits that are all ones, 64 bits at a time.
        while (a[i++]==-static_cast<std::uint64_t>(1)) r += 64;
// Complement the first digit that is not all ones so its lowest zero
// bit becomes the lowest one bit, then isolate that bit with w & -w.
        std::uint64_t w = ~a[i-1];
        return static_cast<std::size_t>(64-nlz(w & (-w))+r);
    }
    else if (lena==1 && a[0]==0) return 0;
    else
    {   std::size_t r=0, i=0;
// Skip whole zero digits, 64 bits at a time.
        while (a[i++]==0) r += 64;
        std::uint64_t w = a[i-1];
        return static_cast<std::size_t>(64-nlz(w & (-w))+r);
    }
}
7389 
op(std::int64_t aa)7390 inline std::size_t Low_bit::op(std::int64_t aa)
7391 {   std::uint64_t a;
7392     if (aa == 0) return 0;
7393     else if (aa < 0) a = ~static_cast<std::uint64_t>(aa);
7394     else a = aa;
7395     a = a & (-a); // keeps only the lowest bit
7396     return static_cast<std::size_t>(64-nlz(a));
7397 }
7398 
op(std::uint64_t * a)7399 inline std::size_t Logcount::op(std::uint64_t *a)
7400 {   std::size_t lena = number_size(a);
7401     std::size_t r = 0;
7402     if (negative(a[lena-1]))
7403     {   for (std::size_t i=0; i<lena; i++) r += popcount(~a[i]);
7404     }
7405     else for (std::size_t i=0; i<lena; i++) r += popcount(a[i]);
7406     return r;
7407 }
7408 
op(std::int64_t a)7409 inline std::size_t Logcount::op(std::int64_t a)
7410 {   if (a < 0) return static_cast<std::size_t>(popcount(~a));
7411     else return static_cast<std::size_t>(popcount(a));
7412 }
7413 
op(std::uint64_t * a,std::size_t n)7414 inline bool Logbitp::op(std::uint64_t *a, std::size_t n)
7415 {   std::size_t lena = number_size(a);
7416     if (n >= 64*lena) return negative(a[lena-1]);
7417     return (a[n/64] & (static_cast<std::uint64_t>(1) << (n%64))) != 0;
7418 }
7419 
op(std::int64_t a,std::size_t n)7420 inline bool Logbitp::op(std::int64_t a, std::size_t n)
7421 {   if (n >= 64) return (a < 0);
7422     else return (a & (static_cast<std::uint64_t>(1) << n)) != 0;
7423 }
7424 
// Addition when the length of a is at least that of b.
7426 
// r := a + b for signed (2s complement) bignums with lena >= lenb.
// r must have room for lena+1 digits; lenr receives the trimmed
// length of the result.
inline void ordered_bigplus(const std::uint64_t *a, std::size_t lena,
                            const std::uint64_t *b, std::size_t lenb,
                            std::uint64_t *r, std::size_t &lenr)
{   arithlib_assert(lena >= lenb);
    std::uint64_t carry = 0;
    std::size_t i = 0;
// The lowest digits can be added without there being any carry-in.
    carry = add_with_carry(a[0], b[0], r[0]);
// Add the digits that (a) and (b) have in common
    for (i=1; i<lenb; i++)
        carry = add_with_carry(a[i], b[i], carry, r[i]);
// From there on up treat (b) as if it had its sign bit extended to the
// left.
    std::uint64_t topb = negative(b[lenb-1]) ? allbits : 0;
    for (; i<lena; i++)
        carry = add_with_carry(a[i], topb, carry, r[i]);
// And of course (a) must also be treated as being extended by its sign bit.
    std::uint64_t topa = negative(a[lena-1]) ? allbits : 0;
// The result calculated here is 1 word longer than (a), and addition
// can never carry further than that.
    r[i] = topa + topb + carry;
// However because I am using (2s complement) signed arithmetic the result
// could be shorter, so I will check for that and return the length that
// is actually needed.
    while (r[i]==0 && i>0 && positive(r[i-1])) i--;
    while (r[i]==allbits && i>0 && negative(r[i-1])) i--;
    lenr = i+1;
}
7455 
7456 // Add a small number to a bignum
7457 
bigplus_small(const std::uint64_t * a,std::size_t lena,std::int64_t n,std::uint64_t * r,std::size_t & lenr)7458 inline void bigplus_small(const std::uint64_t *a, std::size_t lena,
7459                           std::int64_t n,
7460                           std::uint64_t *r, std::size_t &lenr)
7461 {   std::uint64_t w[1];
7462     w[0] = static_cast<std::uint64_t>(n);
7463     ordered_bigplus(a, lena, w, 1, r, lenr);
7464 }
7465 
7466 // When I do a general addition I will not know which input is longer.
7467 
bigplus(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * r,std::size_t & lenr)7468 inline void bigplus(const std::uint64_t *a, std::size_t lena,
7469                     const std::uint64_t *b, std::size_t lenb,
7470                     std::uint64_t *r, std::size_t &lenr)
7471 {   if (lena >= lenb) return ordered_bigplus(a, lena, b, lenb, r,
7472                                  lenr);
7473     else return ordered_bigplus(b, lenb, a, lena, r, lenr);
7474 }
7475 
op(std::uint64_t * a,std::uint64_t * b)7476 inline std::intptr_t Plus::op(std::uint64_t *a, std::uint64_t *b)
7477 {   std::size_t lena = number_size(a);
7478     std::size_t lenb = number_size(b);
7479     std::size_t n;
7480     if (lena >= lenb) n = lena+1;
7481     else n = lenb+1;
7482     push(a); push(b);
7483     std::uint64_t *p = reserve(n);
7484     pop(b); pop(a);
7485     std::size_t final_n;
7486     bigplus(a, lena, b, lenb, p, final_n);
7487     return confirm_size(p, n, final_n);
7488 }
7489 
// At present I implement the op_ii, op_ib and op_bi operations
// by converting the integer argument to a 1-word bignum and dropping into
// the general bignum code. This will generally be a long way from the
// most efficient implementation, so at a later stage I will want to hone
// the code to make it better!
7495 
op(std::int64_t a,std::int64_t b)7496 inline std::intptr_t Plus::op(std::int64_t a, std::int64_t b)
7497 {
7498 // The two integer arguments will in fact each have been derived from a
7499 // tagged representation, and a consequence of that is that I can add
7500 // them and be certain I will not get arithmetic overflow. However the
7501 // resulting value may no longer be representable as a fixnum.
7502     std::int64_t c = a + b;
7503     if (fits_into_fixnum(c)) return int_to_handle(c);
7504 // Now because there had not been overflow I know that the bignum will
7505 // only need one word.
7506     std::uint64_t *r = reserve(1);
7507     r[0] = c;
7508     return confirm_size(r, 1, 1);
7509 }
7510 
op(std::int64_t a,std::uint64_t * b)7511 inline std::intptr_t Plus::op(std::int64_t a, std::uint64_t *b)
7512 {   std::uint64_t aa[1];
7513     aa[0] = a;
7514     std::size_t lenb = number_size(b);
7515     push(b);
7516     std::uint64_t *r = reserve(lenb+1);
7517     pop(b);
7518     std::size_t final_n;
7519     bigplus(aa, 1, b, lenb, r, final_n);
7520     return confirm_size(r, lenb+1, final_n);
7521 }
7522 
op(std::uint64_t * a,std::int64_t b)7523 inline std::intptr_t Plus::op(std::uint64_t *a, std::int64_t b)
7524 {   std::size_t lena = number_size(a);
7525     std::uint64_t bb[1];
7526     bb[0] = b;
7527     push(a);
7528     std::uint64_t *r = reserve(lena+1);
7529     pop(a);
7530     std::size_t final_n;
7531     bigplus(a, lena, bb, 1, r, final_n);
7532     return confirm_size(r, lena+1, final_n);
7533 }
7534 
bigplus_small(std::intptr_t aa,std::int64_t b)7535 inline std::intptr_t bigplus_small(std::intptr_t aa, std::int64_t b)
7536 {   std::uint64_t *a = vector_of_handle(aa);
7537     std::size_t lena = number_size(a);
7538     push(a);
7539     std::uint64_t *p = reserve(lena+1);
7540     pop(a);
7541     std::size_t final_n;
7542     bigplus_small(a, lena, b, p, final_n);
7543     return confirm_size(p, lena+1, final_n);
7544 }
7545 
7546 // For subtraction I implement both a-b and b-a. These work by
7547 // computing a + (~b) + 1 and (~a) + b + 1 respectively.
7548 
// r := a - b, computed as a + ~b + 1 so that the main loops are plain
// addition with a carry that starts at 1. Requires lena >= lenb; r
// must have room for lena+1 digits and lenr gets the trimmed length.
inline void ordered_bigsubtract(const std::uint64_t *a,
                                std::size_t lena,
                                const std::uint64_t *b, std::size_t lenb,
                                std::uint64_t *r, std::size_t &lenr)
{   arithlib_assert(lena >= lenb);
    std::uint64_t carry = 1;
// Add the digits that (a) and (b) have in common
    std::size_t i;
    for (i=0; i<lenb; i++)
        carry = add_with_carry(a[i], ~b[i], carry, r[i]);
// From there on up treat (b) as if it had its sign bit extended to the
// left.
    std::uint64_t topb = negative(~b[lenb-1]) ? allbits : 0;
    for (; i<lena; i++)
        carry = add_with_carry(a[i], topb, carry, r[i]);
// And of course (a) must also be treated as being extended by its sign bit.
    std::uint64_t topa = negative(a[lena-1]) ? allbits : 0;
// The result calculated here is 1 word longer than (a), and addition
// can never carry further than that.
    r[i] = topa + topb + carry;
// However because I am using (2s complement) signed arithmetic the result
// could be shorter, so I will check for that and return the length that
// is actually needed.
    while (r[i]==0 && i>0 && positive(r[i-1])) i--;
    while (r[i]==allbits && i>0 && negative(r[i-1])) i--;
    lenr = i+1;
}
7576 
// r := b - a, computed as ~a + b + 1. Requires lena >= lenb; r must
// have room for lena+1 digits and lenr gets the trimmed length.
inline void ordered_bigrevsubtract(const std::uint64_t *a,
                                   std::size_t lena,
                                   const std::uint64_t *b, std::size_t lenb,
                                   std::uint64_t *r, std::size_t &lenr)
{   arithlib_assert(lena >= lenb);
    std::uint64_t carry = 1;
// Add the digits that (a) and (b) have in common
    std::size_t i;
    for (i=0; i<lenb; i++)
        carry = add_with_carry(~a[i], b[i], carry, r[i]);
// From there on up treat (b) as if it had its sign bit extended to the
// left.
    std::uint64_t topb = negative(b[lenb-1]) ? allbits : 0;
    for (; i<lena; i++)
        carry = add_with_carry(~a[i], topb, carry, r[i]);
// And of course (a) must also be treated as being extended by its sign bit.
    std::uint64_t topa = negative(~a[lena-1]) ? allbits : 0;
// The result calculated here is 1 word longer than (a), and addition
// can never carry further than that.
    r[i] = topa + topb + carry;
// However because I am using (2s complement) signed arithmetic the result
// could be shorter, so I will check for that and return the length that
// is actually needed.
    while (r[i]==0 && i>0 && positive(r[i-1])) i--;
    while (r[i]==allbits && i>0 && negative(r[i-1])) i--;
    lenr = i+1;
}
7604 
7605 // Subtract a small number from a bignum
7606 
bigsubtract_small(const std::uint64_t * a,std::size_t lena,std::int64_t n,std::uint64_t * r,std::size_t & lenr)7607 inline void bigsubtract_small(const std::uint64_t *a,
7608                               std::size_t lena,
7609                               std::int64_t n,
7610                               std::uint64_t *r, std::size_t &lenr)
7611 {   std::uint64_t w[1];
7612     w[0] = static_cast<std::uint64_t>(n);
7613     ordered_bigsubtract(a, lena, w, 1, r, lenr);
7614 }
7615 
7616 // subtract a bignum from a small number
7617 
bigrevsubtract_small(const std::uint64_t * a,std::size_t lena,std::int64_t n,std::uint64_t * r,std::size_t & lenr)7618 inline void bigrevsubtract_small(const std::uint64_t *a,
7619                                  std::size_t lena,
7620                                  std::int64_t n,
7621                                  std::uint64_t *r, std::size_t &lenr)
7622 {   std::uint64_t w[1];
7623     w[0] = static_cast<std::uint64_t>(n);
7624     ordered_bigrevsubtract(a, lena, w, 1, r, lenr);
7625 }
7626 
7627 
bigsubtract(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * r,std::size_t & lenr)7628 inline void bigsubtract(const std::uint64_t *a, std::size_t lena,
7629                         const std::uint64_t *b, std::size_t lenb,
7630                         std::uint64_t *r, std::size_t &lenr)
7631 {   if (lena >= lenb) return ordered_bigsubtract(a, lena, b, lenb, r,
7632                                  lenr);
7633     else return ordered_bigrevsubtract(b, lenb, a, lena, r, lenr);
7634 }
7635 
op(std::uint64_t * a,std::uint64_t * b)7636 inline std::intptr_t Difference::op(std::uint64_t *a,
7637                                     std::uint64_t *b)
7638 {   std::size_t lena = number_size(a);
7639     std::size_t lenb = number_size(b);
7640     std::size_t n;
7641     if (lena >= lenb) n = lena+1;
7642     else n = lenb+1;
7643     push(a); push(b);
7644     std::uint64_t *p = reserve(n);
7645     pop(b); pop(a);
7646     std::size_t final_n;
7647     bigsubtract(a, lena, b, lenb, p, final_n);
7648     return confirm_size(p, n, final_n);
7649 }
7650 
op(std::int64_t a,std::int64_t b)7651 inline std::intptr_t Difference::op(std::int64_t a, std::int64_t b)
7652 {   std::uint64_t aa[1], bb[1];
7653     aa[0] = a;
7654     bb[0] = b;
7655     std::uint64_t *r = reserve(2);
7656     std::size_t final_n;
7657     bigsubtract(aa, 1, bb, 1, r, final_n);
7658     return confirm_size(r, 2, final_n);
7659 }
7660 
op(std::int64_t a,std::uint64_t * b)7661 inline std::intptr_t Difference::op(std::int64_t a, std::uint64_t *b)
7662 {   std::uint64_t aa[1];
7663     aa[0] = a;
7664     std::size_t lenb = number_size(b);
7665     push(b);
7666     std::uint64_t *r = reserve(lenb+1);
7667     pop(b);
7668     std::size_t final_n;
7669     bigsubtract(aa, 1, b, lenb, r, final_n);
7670     return confirm_size(r, lenb+1, final_n);
7671 }
7672 
op(std::uint64_t * a,std::int64_t b)7673 inline std::intptr_t Difference::op(std::uint64_t *a, std::int64_t b)
7674 {   std::size_t lena = number_size(a);
7675     std::uint64_t bb[1];
7676     bb[0] = b;
7677     push(a);
7678     std::uint64_t *r = reserve(lena+1);
7679     pop(a);
7680     std::size_t final_n;
7681     bigsubtract(a, lena, bb, 1, r, final_n);
7682     return confirm_size(r, lena+1, final_n);
7683 }
7684 
7685 
op(std::uint64_t * a,std::uint64_t * b)7686 inline std::intptr_t RevDifference::op(std::uint64_t *a,
7687                                        std::uint64_t *b)
7688 {   std::size_t lena = number_size(a);
7689     std::size_t lenb = number_size(b);
7690     std::size_t n;
7691     if (lena >= lenb) n = lena+1;
7692     else n = lenb+1;
7693     push(a); push(b);
7694     std::uint64_t *p = reserve(n);
7695     pop(b); pop(a);
7696     std::size_t final_n;
7697     bigsubtract(b, lenb, a, lena, p, final_n);
7698     return confirm_size(p, n, final_n);
7699 }
7700 
op(std::int64_t a,std::int64_t b)7701 inline std::intptr_t RevDifference::op(std::int64_t a, std::int64_t b)
7702 {   std::uint64_t aa[1], bb[1];
7703     aa[0] = a;
7704     bb[0] = b;
7705     std::uint64_t *r = reserve(2);
7706     std::size_t final_n;
7707     bigsubtract(bb, 1, aa, 1, r, final_n);
7708     return confirm_size(r, 2, final_n);
7709 }
7710 
op(std::int64_t a,std::uint64_t * b)7711 inline std::intptr_t RevDifference::op(std::int64_t a,
7712                                        std::uint64_t *b)
7713 {   std::uint64_t aa[1];
7714     aa[0] = a;
7715     std::size_t lenb = number_size(b);
7716     push(b);
7717     std::uint64_t *r = reserve(lenb+1);
7718     pop(b);
7719     std::size_t final_n;
7720     bigsubtract(b, lenb, aa, 1, r, final_n);
7721     return confirm_size(r, lenb+1, final_n);
7722 }
7723 
op(std::uint64_t * a,std::int64_t b)7724 inline std::intptr_t RevDifference::op(std::uint64_t *a,
7725                                        std::int64_t b)
7726 {   std::size_t lena = number_size(a);
7727     std::uint64_t bb[1];
7728     bb[0] = b;
7729     push(a);
7730     std::uint64_t *r = reserve(lena+1);
7731     pop(a);
7732     std::size_t final_n;
7733     bigsubtract(bb, 1, a, lena, r, final_n);
7734     return confirm_size(r, lena+1, final_n);
7735 }
7736 
7737 //=========================================================================
7738 //=========================================================================
7739 // multiplication, squaring and exponentiation.
7740 //=========================================================================
7741 //=========================================================================
7742 
7743 // This multiplication code took much longer to write and debug than I had
7744 // expected. Classical multiplication was straightforward to implement,
7745 // but then while Karatsuba is at its heart simple, fitting it into a
7746 // framework where inputs of unequal size can be combined leaves much
7747 // opportunity for mistakes to creep in. Along with that I view this as
7748 // speed-critical, so I take a number of special cases and implement them
7749 // directly rather than via the general code. The earlier drafts of this
// I had ended up with code that was even more tangled and untidy than the
// present version!
// I experimented with versions of classical multiplication that worked using
// pairs of digits at once (and that did not seem to help me), and for
// Karatsuba I tried both a version based on adding the high and low parts
// of input numbers and the version here that subtracts them.
7756 // Ensuring that the twos complement treatment of signs is supported adds
7757 // further potential confusion!
7758 
7759 
7760 
7761 //                       Karatsuba multiplication.
7762 //                       =========================
7763 
7764 // The multiplication code has a signature something like
7765 //   void mult(uint64_t *a, size_t lena,
7766 //             uint64_t *b, size_t lenb,
7767 //             uint64_t *c,
7768 //             uint64_t *work_vector=NULL)
// where a and b are vectors with lena and lenb words in them respectively.
7770 // c is a vector and lena+lenb words of a product will be written into it.
7771 // Note that sometimes the top digit will end up as either 0 or -1.
7772 // w must be a workspace vector of length lenb+2*log(lenb) [the log is to
7773 // the base 2]. It is not needed if lenb is very short.
7774 //
7775 
7776 // For use within the multiplication code I need variants on my
7777 // addition and subtraction code.
7778 
7779 // I want:
7780 //    kadd(a, lena, c, lenc);          // c += a
7781 
kadd(const std::uint64_t * a,std::size_t lena,std::uint64_t * c,std::size_t lenc,std::uint64_t carry=0)7782 inline std::uint64_t kadd(const std::uint64_t *a, std::size_t lena,
7783                           std::uint64_t *c, std::size_t lenc,
7784                           std::uint64_t carry=0)
7785 {   std::size_t i;
7786     for (i=0; i<lena; i++)
7787         carry = add_with_carry(a[i], c[i], carry, c[i]);
7788     while (carry!=0 && i<lenc)
7789     {   carry = add_with_carry(c[i], carry, c[i]);
7790         i++;
7791     }
7792     return carry;
7793 }
7794 
7795 // c = a - b.   must have length(a) >= length(b).
7796 
ksub(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c)7797 inline std::uint64_t ksub(const std::uint64_t *a, std::size_t lena,
7798                           const std::uint64_t *b, std::size_t lenb,
7799                           std::uint64_t *c)
7800 {   arithlib_assert(lena >= lenb);
7801     std::uint64_t borrow = 0;
7802     std::size_t i;
7803     for (i=0; i<lenb; i++)
7804         borrow = subtract_with_borrow(a[i], b[i], borrow, c[i]);
7805     for (; i<lena; i++)
7806         borrow = subtract_with_borrow(a[i], borrow, c[i]);
7807     return borrow;
7808 }
7809 
kneg(std::uint64_t * a,std::size_t lena)7810 inline void kneg(std::uint64_t *a, std::size_t lena)
7811 {   std::uint64_t carry = 0;
7812     for (std::size_t i=0; i<lena; i++)
7813         a[i] = add_with_carry(~a[i], carry, a[i]);
7814 }
7815 
7816 // c = |a - b| and return an indication of which branch of the absolute
7817 // value function was used, ie whether we had a>=b or a<b. If a==b so
7818 // the result is zero the value is not terribly important. Must be
7819 // called with the first argument at least as long as the second.
7820 
inline bool absdiff(const std::uint64_t *a, std::size_t lena,
                    const std::uint64_t *b, std::size_t lenb,
                    std::uint64_t *c)
{
// I will do a cheap comparison of a and b first, based on an understanding
// that lena >= lenb. The result will be of length lena.
    arithlib_assert(lena >= lenb);
    if (lenb < lena ||
        b[lenb-1]<=a[lena-1])
    {
// The cheap test suggests a >= b, so form (a-b) directly. If the
// subtraction generates a borrow the guess was wrong, so negate the
// result in place to get (b-a). For fully random inputs the cheap
// test is liable to be reliable; for values that have long sequences
// of 0 bits in their binary representation, eg values that are close
// to a power of 2 or ones that have a large power of 2 as a factor,
// the fallback may be activated more frequently.
        if (ksub(a, lena, b, lenb, c) != 0)
        {   kneg(c, lena);
            return true;     // Have computed b-a
        }
        else return false;   // have computed a-b
    }
    else
    {
// Here lenb==lena and the top digit of b is the larger, so form (b-a),
// again negating if the guess turns out wrong. Note that with
// lenb==lena the argument-length precondition of ksub still holds.
        if (ksub(b, lenb, a, lena, c) != 0)
        {   kneg(c, lenb);
            return false;    // a-b
        }
        else return true;    // b-a
    }
}
7852 
// I will have in-line code for a number of very small cases on the
// expectation that (a) these will arise especially often in many
// applications and (b) that the inline code will end up faster
// than general loops.
7857 
7858 // Now code that multiplies 2-digit numbers together.
7859 // One version treats them as unsigned, the second as signed.
7860 
// Unsigned 2x2-digit multiplication: (c3,c2,c1,c0) = (a1,a0)*(b1,b0).
// Here multiply64(x, y, hi, lo) delivers the double-length product
// hi:lo = x*y, and the five-argument form folds its third argument in
// as an addend: hi:lo = x*y + addend.
inline void mul2x2(std::uint64_t a1, std::uint64_t a0,
                   std::uint64_t b1, std::uint64_t b0,
                   std::uint64_t &c3, std::uint64_t &c2, std::uint64_t &c1,
                   std::uint64_t &c0)
{   std::uint64_t c1a, c2a, c2b, c3a;
// Form the four partial products, feeding previously accumulated
// digits in as the addend where the weights line up.
    multiply64(a0, b0, c1a, c0);
    multiply64(a0, b1, c1a, c2a, c1a);
    multiply64(a1, b0, c1a, c2b, c1);
    multiply64(a1, b1, c2a, c3a, c2a);
// Combine the two weight-2 partial sums; the carry lands in c3.
    c3a += add_with_carry(c2a, c2b, c2);
    c3 = c3a;
}
7873 
// Signed 2x2-digit multiplication: as mul2x2 but with the top digit of
// each input (a1, b1) and of the result (c3) treated as signed.
inline void mul2x2S(std::int64_t a1, std::uint64_t a0,
                    std::int64_t b1, std::uint64_t b0,
                    std::int64_t &c3, std::uint64_t &c2, std::uint64_t &c1,
                    std::uint64_t &c0)
{   std::uint64_t c1a;
    multiply64(a0, b0, c1a, c0);
    std::uint64_t c2a;
    multiply64(a0, static_cast<std::uint64_t>(b1), c1a, c2a, c1a);
    std::uint64_t c2b;
    multiply64(static_cast<std::uint64_t>(a1), b0, c1a, c2b, c1);
    std::int64_t c3a;
    signed_multiply64(a1, b1, c2a, c3a, c2a);
    c3a = static_cast<std::int64_t>(
              static_cast<std::uint64_t>(c3a) + add_with_carry(c2a, c2b, c2a));
// Do the arithmetic in unsigned mode in case of overflow problems.
// The cross products used the signed operand reinterpreted as
// unsigned; when that operand was in fact negative the product came
// out 2^64 times the other operand too high, so correct that here at
// digit position 2.
    if (a1 < 0) c3a = static_cast<std::int64_t>(
                              static_cast<std::uint64_t>(c3a) -
                              subtract_with_borrow(c2a, b0, c2a));
    if (b1 < 0) c3a = static_cast<std::int64_t>(
                              static_cast<std::uint64_t>(c3a) -
                              subtract_with_borrow(c2a, a0, c2a));
    c2 = c2a;
    c3 = c3a;
}
7898 
// Unsigned 3x2-digit multiplication:
// (c4,c3,c2,c1,c0) = (a2,a1,a0) * (b1,b0).
inline void mul3x2(std::uint64_t a2, std::uint64_t a1,
                   std::uint64_t a0,
                   std::uint64_t b1, std::uint64_t b0,
                   std::uint64_t &c4, std::uint64_t &c3, std::uint64_t &c2,
                   std::uint64_t &c1, std::uint64_t &c0)
{   std::uint64_t c3a;
// Start with the 2x2 product of the low digits ...
    mul2x2(a1, a0, b1, b0, c3, c2, c1, c0);
// ... then accumulate the two partial products that involve a2.
    multiply64(a2, b0, c2, c3a, c2);
    std::uint64_t carry = add_with_carry(c3, c3a, c3);
    multiply64(a2, b1, c3, c4, c3);
    c4 += carry;
}
7911 
// Unsigned 3x3-digit multiplication:
// (c5,c4,c3,c2,c1,c0) = (a2,a1,a0) * (b2,b1,b0).
inline void mul3x3(std::uint64_t a2, std::uint64_t a1,
                   std::uint64_t a0,
                   std::uint64_t b2, std::uint64_t b1, std::uint64_t b0,
                   std::uint64_t &c5, std::uint64_t &c4, std::uint64_t &c3,
                   std::uint64_t &c2, std::uint64_t &c1, std::uint64_t &c0)
{   std::uint64_t c4a, c3a;
    mul2x2(a1, a0, b1, b0, c3, c2, c1, c0);
    multiply64(a2, b0, c2, c3a, c2);
    std::uint64_t carry = add_with_carry(c3, c3a, c3);
    multiply64(a0, b2, c2, c3a, c2);
    carry += add_with_carry(c3, c3a, c3);
    multiply64(a2, b1, c3, c4, c3);
    carry = add_with_carry(c4, carry, c4);
    multiply64(a1, b2, c3, c4a, c3);
// NOTE(review): the sibling mul3x3S accumulates here with "carry +="
// whereas this line overwrites with "carry ="; confirm against the
// signed variant that a carry out of the previous addition into c4
// cannot coincide with one out of this addition before tidying.
    carry = add_with_carry(c4, c4a, c4);
// NOTE(review): the int64 casts on the next call (and when storing c5)
// look like leftovers from mul3x3S -- they round-trip through the
// unsigned parameters of multiply64 and so do not change the bits.
    multiply64(static_cast<std::int64_t>(a2),
               static_cast<std::int64_t>(b2), c4, c5, c4);
    c5 = static_cast<std::int64_t>(static_cast<std::uint64_t>
                                   (c5) + carry);
}
7932 
// Signed 3x3-digit multiplication: as mul3x3 but with a2, b2 and the
// top result digit c5 treated as signed.
inline void mul3x3S(std::uint64_t a2, std::uint64_t a1,
                    std::uint64_t a0,
                    std::uint64_t b2, std::uint64_t b1, std::uint64_t b0,
                    std::int64_t &c5, std::uint64_t &c4, std::uint64_t &c3,
                    std::uint64_t &c2, std::uint64_t &c1, std::uint64_t &c0)
{   std::uint64_t c4a, c3a;
    mul2x2(a1, a0, b1, b0, c3, c2, c1, c0);
    multiply64(a2, b0, c2, c3a, c2);
    std::uint64_t carry = add_with_carry(c3, c3a, c3);
    multiply64(a0, b2, c2, c3a, c2);
    carry += add_with_carry(c3, c3a, c3);
    multiply64(a2, b1, c3, c4, c3);
    carry = add_with_carry(c4, carry, c4);
    multiply64(a1, b2, c3, c4a, c3);
    carry += add_with_carry(c4, c4a, c4);
// The top partial product multiplies the two signed digits.
    signed_multiply64(static_cast<std::int64_t>(a2),
                      static_cast<std::int64_t>(b2), c4, c5, c4);
    c5 = static_cast<std::int64_t>(static_cast<std::uint64_t>
                                   (c5) + carry);
// The cross products above treated a2 and b2 as unsigned; when one of
// them was in fact negative those products are 2^64 times the other
// operand's digits too large, so subtract the correction with the
// digits of the other operand aligned at c3 and c4.
    if (negative(b2))
    {   std::uint64_t borrow = subtract_with_borrow(c3, a0, c3);
        borrow = subtract_with_borrow(c4, a1, borrow, c4);
        c5 = static_cast<std::int64_t>(
                 static_cast<std::uint64_t>(c5) - borrow);
    }
    if (negative(a2))
    {   std::uint64_t borrow = subtract_with_borrow(c3, b0, c3);
        borrow = subtract_with_borrow(c4, b1, borrow, c4);
        c5 = static_cast<std::int64_t>(
                 static_cast<std::uint64_t>(c5) - borrow);
    }
}
7965 
mul4x4(std::uint64_t a3,std::uint64_t a2,std::uint64_t a1,std::uint64_t a0,std::uint64_t b3,std::uint64_t b2,std::uint64_t b1,std::uint64_t b0,std::uint64_t & c7,std::uint64_t & c6,std::uint64_t & c5,std::uint64_t & c4,std::uint64_t & c3,std::uint64_t & c2,std::uint64_t & c1,std::uint64_t & c0)7966 inline void mul4x4(std::uint64_t a3, std::uint64_t a2,
7967                    std::uint64_t a1,
7968                    std::uint64_t a0,
7969                    std::uint64_t b3, std::uint64_t b2, std::uint64_t b1,
7970                    std::uint64_t b0,
7971                    std::uint64_t &c7, std::uint64_t &c6, std::uint64_t &c5,
7972                    std::uint64_t &c4,
7973                    std::uint64_t &c3, std::uint64_t &c2, std::uint64_t &c1,
7974                    std::uint64_t &c0)
7975 {   std::uint64_t w7, w6, w5a, w5b, w5c, w4a, w4b, w4c,
7976     w3a, w3b, w3c, w2a, w2b, w2c;
7977     mul2x2(a1, a0, b1, b0, w3a, w2a, c1, c0);
7978     mul2x2(a1, a0, b3, b2, w5a, w4a, w3b, w2b);
7979     mul2x2(a3, a2, b1, b0, w5b, w4b, w3c, w2c);
7980     mul2x2(a3, a2, b3, b2, w7, w6, w5c, w4c);
7981     std::uint64_t carry = add_with_carry(w2a, w2b, w2c, c2);
7982     carry = add_with_carry(w3a, w3b, w3c, carry, c3);
7983     carry = add_with_carry(w4a, w4b, w4c, carry, c4);
7984     carry = add_with_carry(w5a, w5b, w5c, carry, c5);
7985     carry = add_with_carry(w6, carry, c6);
7986     c7 = w7 + carry;
7987 }
7988 
7989 // c = a*b;
7990 
7991 // This forms a product digit by digit.
7992 
// Classical (schoolbook) multiplication: write the lena+lenb digit product
// of a (lena digits) and b (lenb digits) into c, digits being 64-bit words
// stored least significant first. Work proceeds by columns of the
// schoolbook layout so each output digit is completed and stored exactly
// once. The running accumulator is {hi,lo} plus "carry", which counts
// overflows out of hi - a column of many partial products can carry out
// more than one unit, hence a full word rather than a single bit.
inline void classical_multiply(const std::uint64_t *a,
                               std::size_t lena,
                               const std::uint64_t *b, std::size_t lenb,
                               std::uint64_t *c)
{   if (lena < lenb)      // arrange that a is the longer operand
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// (1) do the lowest degree term as a separate step
    std::uint64_t carry=0, hi, hi1, lo;
    multiply64(b[0], a[0], lo, c[0]);   // {lo, c[0]} = b[0]*a[0]
// Now a sequence of stages where at each the number of terms to
// be combined grows.
    hi = 0;
    for (std::size_t i=1; i<lenb; i++)
    {   carry = 0;
        for (std::size_t j=0; j<=i; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);  // {hi1,lo} = b[j]*a[i-j] + lo
            carry += add_with_carry(hi, hi1, hi);
        }
        c[i] = lo;   // this output digit is now complete
        lo = hi;     // shift the accumulator down one digit position
        hi = carry;
    }
// If the two inputs are not the same size I demand that lena>=lenb and
// there may be some slices to compute in the middle here.
// if lena==lenb the following loop does not get executed at all.
    for (std::size_t i=lenb; i<lena; i++)
    {   carry = 0;
        for (std::size_t j=0; j<lenb; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        c[i] = lo;
        lo = hi;
        hi = carry;
    }
// Now I will have some stages where the number of terms to be combined
// gradually decreases. If lenb==2 the following loop is not executed.
    for (std::size_t i=1; i<lenb-1; i++)
    {   carry = 0;
        for (std::size_t j=0; j<lenb-i; j++)
        {   multiply64(b[i+j], a[lena-j-1], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        c[lena+i-1] = lo;
        lo = hi;
        hi = carry;
    }
// Finally the very top term is computed.
    multiply64(b[lenb-1], a[lena-1], lo, hi1, c[lena+lenb-2]);
    c[lena+lenb-1] = hi + hi1;
}
8046 
8047 
8048 // c = c + a*b. Potentially carry all the way up to lenc.
8049 
// c = c + a*b: classical multiplication exactly as in classical_multiply
// above, but with the product accumulated onto what is already in c, and
// with any final carry rippled on up through c as far as index lenc-1.
// c must therefore be lenc (>= lena+lenb) digits long.
inline void classical_multiply_and_add(const std::uint64_t *a,
                                       std::size_t lena,
                                       const std::uint64_t *b, std::size_t lenb,
                                       std::uint64_t *c, std::size_t lenc)
{   if (lena < lenb)      // arrange that a is the longer operand
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// (1) do the lowest degree term as a separate step
    std::uint64_t carry=0, carry1, hi, hi1, lo;
    multiply64(b[0], a[0], c[0], lo, c[0]);  // {lo, c[0]} = b[0]*a[0] + c[0]
// Now a sequence of stages where at each the number of terms to
// be combined grows.
    hi = 0;
    for (std::size_t i=1; i<lenb; i++)
    {   carry = 0;
        for (std::size_t j=0; j<=i; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
// Fold the completed column digit into c[i], then shift the accumulator
// down one position, not forgetting the carry out of that addition.
        carry1 = add_with_carry(c[i], lo, c[i]);
        hi = add_with_carry(hi, carry1, lo) + carry;
    }
// If the two inputs are not the same size I demand that lena>=lenb and
// there may be some slices to compute in the middle here.
    for (std::size_t i=lenb; i<lena;
         i++)  //  If lenb==lena this loop is not executed
    {   carry = 0;
        for (std::size_t j=0; j<lenb; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        carry1 = add_with_carry(c[i], lo, c[i]);
        hi = add_with_carry(hi, carry1, lo) + carry;
    }
// Now I will have some stages where the number of terms to be combined
// gradually decreases.
    for (std::size_t i=1; i<lenb-1;
         i++) //  If lenb==2 this loop is not executed
    {   carry = 0;
        for (std::size_t j=0; j<lenb-i; j++)
        {   multiply64(b[i+j], a[lena-j-1], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        carry1 = add_with_carry(c[lena+i-1], lo, c[lena+i-1]);
        hi = add_with_carry(hi, carry1, lo) + carry;
    }
// Finally the very top term is computed, and any carry out of the top two
// digits is propagated upwards through c, stopping either when it dies
// away or at lenc.
    multiply64(b[lenb-1], a[lena-1], lo, hi1, lo);
    carry = add_with_carry(c[lena+lenb-2], lo, c[lena+lenb-2]);
    carry = add_with_carry(c[lena+lenb-1], hi+hi1, carry, c[lena+lenb-1]);
    for (std::size_t i=lena+lenb; carry!=0 && i<lenc; i++)
        carry = add_with_carry(c[i], carry, c[i]);
}
8104 
8105 // Now variants that use just a single digit first argument. These may be seen
8106 // as optimized cases.
8107 
classical_multiply(std::uint64_t a,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c)8108 inline void classical_multiply(std::uint64_t a,
8109                                const std::uint64_t *b, std::size_t lenb,
8110                                std::uint64_t *c)
8111 {   std::uint64_t hi=0;
8112     for (std::size_t j=0; j<lenb; j++)
8113         multiply64(a, b[j], hi, hi, c[j]);
8114     c[lenb] = hi;
8115 }
8116 
8117 // c = c + a*b and return any carry.
8118 
classical_multiply_and_add(std::uint64_t a,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c,std::size_t lenc)8119 inline void classical_multiply_and_add(std::uint64_t a,
8120                                        const std::uint64_t *b, std::size_t lenb,
8121                                        std::uint64_t *c, std::size_t lenc)
8122 {   std::uint64_t hi=0, lo;
8123     for (std::size_t j=0; j<lenb; j++)
8124     {   multiply64(a, b[j], hi, hi, lo);
8125         hi += add_with_carry(lo, c[j], c[j]);
8126     }
8127     std::uint64_t carry = add_with_carry(hi, c[lenb], c[lenb]);
8128     for (std::size_t i=lenb+1; carry!=0 && i<lenc; i++)
8129         carry = add_with_carry(c[i], carry, c[i]);
8130 }
8131 
// I make the variables that indicate when it is worth transitioning from
// classical multiplication to something that is asymptotically faster
// "static" rather than "inline", because if different overrides were
// provided via command-line definitions while different source files were
// being processed, that could otherwise cause linker clashes.
8137 
8138 
// Tuning parameters. K/KARATSUBA_CUTOFF is the operand length (in 64-bit
// digits) below which classical multiplication is used; K1/PARAKARA_CUTOFF
// is the length at which a top-level Karatsuba step will hand its two
// independent half-size sub-products to worker threads.

#if !defined K && !defined K_DEFINED
// I provide a default here but can override it at compile time
static const std::size_t K=18;
#define K_DEFINED 1
#endif

// When I have completed and measured things I am liable to make this a
// "const", but for now it is a simple variable so I can tinker with the
// value during testing and tuning.

#ifndef KARATSUBA_CUTOFF
// It may be defined globally as a severe override of what happens here!
static std::size_t KARATSUBA_CUTOFF = K;
#endif

#if !defined K1 && !defined K1_DEFINED
// I provide a default here but can override it at compile time.

static const std::size_t K1=170;
#define K1_DEFINED 1
#endif

// When I have completed and measured things I am liable to make this a
// "const", but for now it is a simple variable so I can tinker with the
// value during testing and tuning.

#ifndef PARAKARA_CUTOFF
// It may be defined globally as a severe override of what happens here!
// But also if the current host computer does not support at least three
// genuine concurrent activities I will not try use of threads because it
// would not be helpful!

static std::size_t PARAKARA_CUTOFF =
    std::thread::hardware_concurrency() >= 3 ? K1 : SIZE_MAX;
#endif

// Forward declarations for the size-dispatching multipliers used by the
// Karatsuba code below (their definitions appear later in the file).

inline void small_or_big_multiply(const std::uint64_t *a,
                                  std::size_t lena,
                                  const std::uint64_t *b, std::size_t lenb,
                                  std::uint64_t *c, std::uint64_t *w);

inline void small_or_big_multiply_and_add(const std::uint64_t *a,
        std::size_t lena,
        const std::uint64_t *b, std::size_t lenb,
        std::uint64_t *c, std::size_t lenc,
        std::uint64_t *w);
8185 
8186 // The key function here multiplies two numbers that are at least almost
8187 // the same length. The cases that can arise here are
8188 //      2n   2n       Easy and neat sub-division
8189 //      2n   2n-1     Treat the second number as if it has a padding zero
8190 //      2n-1 2n-1     Treat both numbers as if padded to size 2n
8191 // Observe that if the two numbers have different lengths then the longer
8192 // one is an even length, so the case (eg) 2n+1,2n will not arise.
8193 // This will also only be used if lenb >= KARATSUBA_CUTOFF.
8194 
8195 // When one multiplies {a1,a0}*{b1,b0} the three sub-multiplications to
8196 // be performed are
8197 //         a1*b1, a0*b0, |a0-a1|*|b0-b1|
8198 
8199 
// One Karatsuba step: c = a*b for (nearly) balanced operands - either
// lena==lenb, or lena even with lenb==lena-1 (the shorter operand acting
// as if padded with one leading zero digit). w is workspace. Writing
//     a = a1*2^(64n) + a0,   b = b1*2^(64n) + b0
// the three half-size products used are a1*b1, a0*b0 and |a1-a0|*|b1-b0|.
inline void karatsuba(const std::uint64_t *a, std::size_t lena,
                      const std::uint64_t *b, std::size_t lenb,
                      std::uint64_t *c, std::uint64_t *w)
{   arithlib_assert(lena == lenb ||
                    (lena%2 == 0 && lenb == lena-1));
    arithlib_assert(lena >= 2);
    std::size_t n = (lena+1)/2;    // size of a "half-number"
    std::size_t lenc = lena+lenb;
// lena-n and lenb-n will each be either n or n-1.
// absdiff() leaves |a1-a0| in w and |b1-b0| in w+n; comparing its two
// results tells whether the middle product (a1-a0)*(b0-b1) will turn out
// positive (signs differ) or negative (signs agree).
    if (absdiff(a, n, a+n, lena-n, w) !=
        absdiff(b, n, b+n, lenb-n, w+n))
    {
// Here I will collect
//    a1*b1    (a1*b0 + b1*a0 - a1*b1 - a0*b0)     a0*b0
// First write the middle part into place.
        small_or_big_multiply(w, n, w+n, n, c+n,
                              w+2*n);     // (a1-a0)*(b0-b1)
// Now I just need to add back in parts of the a1*b1 and a0*b0
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
// First insert the copy at the very top. Part can just be copied because I
// have not yet put anything into c there, the low half then has to be added
// in (and carries could propagate all the way up).
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w[i];
        kadd(w, n, c+2*n, lenc-2*n);
// Now add in the second copy
        kadd(w, lenc-2*n, c+n, lenc-n);
// Now I can deal with the a0*b0.
        small_or_big_multiply(a, n, b, n, w, w+2*n);               // a0*b0
        for (std::size_t i=0; i<n; i++) c[i] = w[i];
        kadd(w+n, n, c+n, lenc-n);
        kadd(w, 2*n, c+n, lenc-n);
    }
    else
    {
// This case is slightly more awkward because the key parts of the middle
// part are negated.
//    a1*b1    (-a1*b0 - b1*a0 + a1*b1 + a0*b0)     a0*b0
        small_or_big_multiply(w, n, w+n, n, c+n,
                              w+2*n);     // (a1-a0)*(b1-b0)
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w[i];
// Now I will do {c3,c2,c1} = {c3,w0,0} - {0,c2,c1) which has a mere negation
// step for the c1 digit, but is otherwise a reverse subtraction. Note I had
// just done c3 = w1 so that first term on the RHS is "really" {w1,w0,0}.
//      c1 = 0 - c1 [and generate borrow]
//      c2 = w0 - c2 - borrow [and generate borrow]
//      c3 = c3 - borrow
        std::uint64_t borrow = 0;
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(0, c[n+i], borrow, c[n+i]);
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(w[i], c[2*n+i], borrow, c[2*n+i]);
        for (std::size_t i=0; i<lenc-3*n && borrow!=0; i++)
            borrow = subtract_with_borrow(c[3*n+i], borrow, c[3*n+i]);
// Now I can proceed as before
        kadd(w, lenc-2*n, c+n, lenc-n);
        small_or_big_multiply(a, n, b, n, w, w+2*n);               // a0*b0
        for (std::size_t i=0; i<n; i++) c[i] = w[i];
        kadd(w+n, n, c+n, lenc-n);
        kadd(w, 2*n, c+n, lenc-n);
    }
}
8262 
8263 // The worker_thread() function is started in each of two threads, and
8264 // processes requests until a "quit" request is sent to it.
8265 
// Body run by each worker thread: wait for a multiplication request in the
// shared Worker_data, perform it, signal completion, repeat until the
// quit flag is seen. Synchronisation is via the four mutexes in
// wd->mutex[]: the worker starts out holding 2 and 3, and each cycle it
// acquires mutex[receive_count] (blocking until the driver releases it to
// hand over work) and releases mutex[receive_count^2] to report completion,
// with receive_count cycling 0,1,2,3.
// NOTE(review): the other half of this hand-shake (release_workers /
// wait_for_workers and the initial mutex ownership) is set up elsewhere in
// this file - confirm against that code before altering the protocol.
inline void worker_thread(Worker_data *wd)
{
#ifdef USE_MICROSOFT_MUTEX
    WaitForSingleObject(wd->mutex[2], 0xffffffff);
    WaitForSingleObject(wd->mutex[3], 0xffffffff);
#else
    wd->mutex[2].lock();
    wd->mutex[3].lock();
#endif
    wd->ready = true;   // report to the launching code that start-up is done
    int receive_count = 0;
    for (;;)
    {
#ifdef USE_MICROSOFT_MUTEX
        WaitForSingleObject(wd->mutex[receive_count], 0xffffffff);
#else
        wd->mutex[receive_count].lock();
#endif
        if (wd->quit_flag) return;
// This is where I do some work!
        small_or_big_multiply(wd->a, wd->lena,
                              wd->b, wd->lenb,
                              wd->c, wd->w);
#ifdef USE_MICROSOFT_MUTEX
        ReleaseMutex(wd->mutex[receive_count^2]);
#else
        wd->mutex[receive_count^2].unlock();
#endif
        receive_count = (receive_count + 1) & 3;
    }
}
8297 
// Karatsuba step for a product big enough that the two independent
// half-size products (alo*blo and ahi*bhi) are farmed out to the two
// worker threads, while the current thread computes the middle product
// |ahi-alo|*|bhi-blo|. w is workspace for this thread; w0 and w1 receive
// the workers' results (each also providing the worker's own workspace).
inline void top_level_karatsuba(const std::uint64_t *a,
                                std::size_t lena,
                                const std::uint64_t *b, std::size_t lenb,
                                std::uint64_t *c, std::uint64_t *w,
                                std::uint64_t *w0, std::uint64_t *w1)
{
// Here I have a HUGE case and I should use threads!
    arithlib_assert(lena == lenb ||
                    (lena%2 == 0 && lenb == lena-1));
    arithlib_assert(lena >= 2);
    DriverData *driverData = getDriverData();
    std::size_t n = (lena+1)/2;    // size of a "half-number"
    std::size_t lenc = lena+lenb;
// I start by arranging that the two threads that can do things in parallel
// can get access to data from here and that I will be able to retrieve
// results. And that the worker threads have workspace to make use of.
    driverData->wd_0.a = a;        // worker 0 computes alo*blo into w0
    driverData->wd_0.lena = n;
    driverData->wd_0.b = b;
    driverData->wd_0.lenb = n;
    driverData->wd_0.c = w0;
    driverData->wd_0.w = w0+2*n;

    driverData->wd_1.a = a+n;      // worker 1 computes ahi*bhi into w1
    driverData->wd_1.lena = lena-n;
    driverData->wd_1.b = b+n;
    driverData->wd_1.lenb = lenb-n;
    driverData->wd_1.c = w1;
    driverData->wd_1.w = w1+2*n;

// Now trigger the two threads to do some work for me. One will be forming
// alo*blo while the other computes ahi*bhi .
    driverData->release_workers();
// Now I will work on either |ahi-alo|*|bhi-blo|
// lena-n and lenb-n will each be either n or n-1.
    bool signs_differ = absdiff(a, n, a+n, lena-n, w) !=
                        absdiff(b, n, b+n, lenb-n, w+n);
    small_or_big_multiply(w, n, w+n, n, c+n,
                          w+2*n);     // (a1-a0)*(b0-b1)
// That has the product of differences written into the middle of c.
// Block until both workers have delivered their products.
    driverData->wait_for_workers();
    if (signs_differ)
    {
// Here I have
//     w0:             ;              ;           alo*blo [2*n]
//     w1:      ahi*bhi [lenc-2*n]    ;                   ;
//     c:     ?[lenc-n];     (ahi-alo)*(blo-bhi)[2n]      ;    ?[n]
//          =         ?; ahi*blo+bhi*alo -ahi*bhi-alo*blo ;    ?[n]
// so I need to add bits from w0 and w1 into c.
//
// First deal with ahi*bhi. The top half can be copied into the very top of
// the result, then I add in the bottom half.
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w1[i];
        kadd(w1, n, c+2*n, lenc-2*n);
// Now add in the second copy of ahi*bhi
        kadd(w1, lenc-2*n, c+n, lenc-n);
// Now something similar with alo*blo
        for (std::size_t i=0; i<n; i++) c[i] = w0[i];
        kadd(w0+n, n, c+n, lenc-n);
        kadd(w0, 2*n, c+n, lenc-n);
    }
    else
    {
// This case is slightly more awkward because the key parts of the middle
// part are negated.
//    a1*b1    (-a1*b0 - b1*a0 + a1*b1 + a0*b0)     a0*b0
// Call the desired result {c3,c2,c1,c0} then the middle product is in
// {c2,c1} and I can copy the top half of ahi*bhi into c3.
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w1[i];
// Now I will do {c3,c2,c1} = {c3,low(ahi*bhi),0} - {0,c2,c1) which has a
// mere negation step for the c1 digit, but is otherwise a reverse
// subtraction. Note I had just done c3 = high(ahi*bhi) so that first term on
// the RHS is "really" {{ahi*bhi},0}.
//      c1 = 0 - c1 [and generate borrow]
//      c2 = low(w0) - c2 - borrow [and generate borrow]
//      c3 = c3 - borrow
        std::uint64_t borrow = 0;
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(0, c[n+i], borrow, c[n+i]);
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(w1[i], c[2*n+i], borrow, c[2*n+i]);
        for (std::size_t i=0; i<lenc-3*n && borrow!=0; i++)
            borrow = subtract_with_borrow(c[3*n+i], borrow, c[3*n+i]);
        kadd(w1, lenc-2*n, c+n, lenc-n);
        for (std::size_t i=0; i<n; i++) c[i] = w0[i];
        kadd(w0+n, n, c+n, lenc-n);
        kadd(w0, 2*n, c+n, lenc-n);
    }
}
8387 
// c = c + a*b for (nearly) balanced operands, Karatsuba style; carries may
// propagate right up to c[lenc-1]. The structure parallels karatsuba()
// above, except that every contribution is accumulated into c rather than
// stored, and the negative middle-product case is handled by bitwise
// complementing a region of c so that a subtraction can be performed
// using only the add-in primitives.
inline void karatsuba_and_add(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *c, std::size_t lenc, std::uint64_t *w)
{   arithlib_assert(lena == lenb ||
                    (lena%2 == 0 && lenb == lena-1));
    arithlib_assert(lena >= 2);
    std::size_t n = (lena+1)/2;    // size of a "half-number"
    std::size_t lenc1 = lena+lenb; // number of digits the product itself has
    if (absdiff(a, n, a+n, lena-n, w) !=
        absdiff(b, n, b+n, lenb-n, w+n))
    {
// Here I will collect
//    a1*b1    (a1*b0 + b1*a0 - a1*b1 - a0*b0)     a0*b0
        small_or_big_multiply_and_add(w, n, w+n, n, c+n, lenc-n, w+2*n);
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
        kadd(w, lenc1-2*n, c+2*n, lenc-2*n);
        kadd(w, lenc1-2*n, c+n, lenc-n);
        small_or_big_multiply(a, n, b, n, w, w+2*n);               // a0*b0
        kadd(w, 2*n, c, lenc);
        kadd(w, 2*n, c+n, lenc-n);
    }
    else
    {
// This case is slightly more awkward because the key parts of the middle
// part are negated.
//    a1*b1    (-a1*b0 - b1*a0 + a1*b1 + a0*b0)     a0*b0
// To perform c=c-w; I go c=~c; c=c+w; c=~c; [that is a NOT rather than
// a MINUS there!].
        for (std::size_t i=n; i<lenc; i++) c[i] = ~c[i];
        small_or_big_multiply_and_add(w, n, w+n, n, c+n, lenc-n, w+2*n);
        for (std::size_t i=n; i<lenc; i++) c[i] = ~c[i];
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
        kadd(w, lenc1-2*n, c+2*n, lenc-2*n);
        kadd(w, lenc1-2*n, c+n, lenc-n);
        small_or_big_multiply(a, n, b, n, w, w+2*n);               // a0*b0
        kadd(w, 2*n, c, lenc);
        kadd(w, 2*n, c+n, lenc-n);
    }
}
8428 
// Here both inputs are of size at least KARATSUBA_CUTOFF. If their sizes
// match exactly I can use Karatsuba directly. I take the view that if
// the two are of sizes (2n) and (2n-1) then I will also use Karatsuba
// directly (treating the shorter input as if it had an initial zero padding
// digit). In all other cases I need to do a sequence of multiplications
// rather along the lines of "short" multiplication, treating the size of
// the smaller operand as the digit size.
8436 
// c = a*b where both operands have at least KARATSUBA_CUTOFF digits.
// Balanced (or off-by-one) cases go straight to karatsuba(); badly
// unbalanced ones are split into a sequence of balanced chunk products,
// much like long multiplication with the shorter operand b as the "digit".
inline void certainly_big_multiply(const std::uint64_t *a,
                                   std::size_t lena,
                                   const std::uint64_t *b, std::size_t lenb,
                                   std::uint64_t *c, std::uint64_t *w)
{   if (lena == lenb)
    {   karatsuba(a, lena, b, lenb, c, w);
        return;
    }
    if (lena < lenb)   // arrange that a is the longer operand
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// Now b is the shorter operand. The case (2n)*(2n-1) will be handled
// using Karatsuba merely by treating the smaller number as if padded with
// a leading zero.
    if (lena%2==0 && lenb==lena-1)
    {   karatsuba(a, lena, b, lenb, c, w);
        return;
    }
// If the two inputs are unbalanced in length I will perform multiple
// balanced operations each of which can be handled specially. I will
// try to make each subsidiary multiplication as big as possible.
// This will be lenb rounded up to an even number.
// I will be willing to do chunks that are of an even size that is
// either lenb or lenb+1.
    std::size_t len = lenb + (lenb & 1);
    const std::uint64_t *a1 = a;   // cursor into a for the current chunk
    std::uint64_t *c1 = c;         // matching cursor into the output
    std::size_t lena1 = lena;      // digits of a not yet consumed
// Multiply-and-add will be (slightly) more expensive than just Multiply,
// so I do a sequence of multiplications where their outputs will not overlap
// first, and then do the interleaved multiplications adding in.
// Pass 1: even-numbered chunks, each written directly into place.
    for (;;)
    {
// I may have rounded the size of b up by 1, and if I have I would generate
// 2*len-1 digits not 2*len and hence risk leaving a 1-word gap between filled
// in data. I zero that here to avoid trouble. However I must not do this
// for if the multiplication I am about to do will write in the very top
// digits of the final answer, because if I did that would be a sort of
// buffer overrun.
        if (len < lena1) c1[2*len-1] = 0;
        karatsuba(a1, len, b, lenb, c1, w);
        c1 += 2*len;
// I will keep going provided the next multiplication I will do will fully fit.
        if (lena1 < 3*len) break;
        a1 += 2*len;
        lena1 -= 2*len;
    }
    if (lena1 > 2*len)
    {   a1 += 2*len;
        lena1 -= 2*len;
// Do a shorter nice Multiply (without Add) to reach the end of input a.
        small_or_big_multiply(a1, lena1, b, lenb, c1, w);
    }
    else if (lena1!=len)
    {
// I may need to pad with zeros when the top digit to be generated will be
// put there using multiply_and_add.
        for (std::size_t i=c1-c; i<lena+lenb; i++) c[i] = 0;
    }
// Now I need to do much the same for the odd numbered digits of a, but
// adding the products in rather than writing them into place.
// Pass 2: odd-numbered chunks, accumulated on top of pass 1's results.
    a1 = a + len;
    c1 = c + len;
    std::size_t lenc1 = lena+lenb-len;  // room left above the c1 cursor
// I know that I had lena>lenb at the start. This means I have at
// least a part block to process here, but there is no guarantee that
// I have a whole one.
    lena1 = lena - len;
    for (;;)
    {   if (lena1 < len) break;
        karatsuba_and_add(a1, len, b, lenb, c1, lenc1, w);
        if (lena1 <= 2*len)
        {   lena1 = 0;
            break;
        }
        c1 += 2*len;
        lenc1 -= 2*len;
        a1 += 2*len;
        lena1 -= 2*len;
    }
// Any ragged tail of a is handled by a final (smaller) multiply-and-add.
    if (lena1!=0)
        small_or_big_multiply_and_add(a1, lena1, b, lenb, c1, lenc1, w);
}
8521 
// As certainly_big_multiply() but for the outermost call on a huge
// product: balanced chunks are multiplied via top_level_karatsuba() so the
// two worker threads are exploited (w0 and w1 are their buffers); the
// remaining structure - two passes over chunks of the longer operand -
// is the same as in certainly_big_multiply().
inline void top_level_certainly_big_multiply(const std::uint64_t *a,
        std::size_t lena,
        const std::uint64_t *b, std::size_t lenb,
        std::uint64_t *c, std::uint64_t *w,
        std::uint64_t *w0, std::uint64_t *w1)
{   if (lena == lenb)
    {   top_level_karatsuba(a, lena, b, lenb, c, w, w0, w1);
        return;
    }
    if (lena < lenb)   // arrange that a is the longer operand
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// Now b is the shorter operand. The case (2n)*(2n-1) will be handled
// using Karatsuba merely by treating the smaller number as if padded with
// a leading zero.
    if (lena%2==0 && lenb==lena-1)
    {   top_level_karatsuba(a, lena, b, lenb, c, w, w0, w1);
        return;
    }
// If the two inputs are unbalanced in length I will perform multiple
// balanced operations each of which can be handled specially. I will
// try to make each subsidiary multiplication as big as possible.
// This will be lenb rounded up to an even number.
// I will be willing to do chunks that are of an even size that is
// either lenb or lenb+1.
    std::size_t len = lenb + (lenb & 1);
    const std::uint64_t *a1 = a;   // cursor into a for the current chunk
    std::uint64_t *c1 = c;         // matching cursor into the output
    std::size_t lena1 = lena;      // digits of a not yet consumed
// Multiply-and-add will be (slightly) more expensive than just Multiply,
// so I do a sequence of multiplications where their outputs will not overlap
// first, and then do the interleaved multiplications adding in.
// Pass 1: even-numbered chunks, each written directly into place.
    for (;;)
    {
// I may have rounded the size of b up by 1, and if I have I would generate
// 2*len-1 digits not 2*len and hence risk leaving a 1-word gap between filled
// in data. I zero that here to avoid trouble. However I must not do this
// for if the multiplication I am about to do will write in the very top
// digits of the final answer, because if I did that would be a sort of
// buffer overrun.
        if (len < lena1) c1[2*len-1] = 0;
        top_level_karatsuba(a1, len, b, lenb, c1, w, w0, w1);
        c1 += 2*len;
// I will keep going provided the next multiplication I will do will fully fit.
        if (lena1 < 3*len) break;
        a1 += 2*len;
        lena1 -= 2*len;
    }
    if (lena1 > 2*len)
    {   a1 += 2*len;
        lena1 -= 2*len;
// Do a shorter nice Multiply (without Add) to reach the end of input a.
        small_or_big_multiply(a1, lena1, b, lenb, c1, w);
    }
    else if (lena1!=len)
    {
// I may need to pad with zeros when the top digit to be generated will be
// put there using multiply_and_add.
        for (std::size_t i=c1-c; i<lena+lenb; i++) c[i] = 0;
    }
// Now I need to do much the same for the odd numbered digits of a, but
// adding the products in rather than writing them into place.
// Pass 2: odd-numbered chunks, accumulated on top of pass 1's results.
    a1 = a + len;
    c1 = c + len;
    std::size_t lenc1 = lena+lenb-len;  // room left above the c1 cursor
// I know that I had lena>lenb at the start. This means I have at
// least a part block to process here, but there is no guarantee that
// I have a whole one.
    lena1 = lena - len;
    for (;;)
    {   if (lena1 < len) break;
        karatsuba_and_add(a1, len, b, lenb, c1, lenc1, w);
        if (lena1 <= 2*len)
        {   lena1 = 0;
            break;
        }
        c1 += 2*len;
        lenc1 -= 2*len;
        a1 += 2*len;
        lena1 -= 2*len;
    }
// Any ragged tail of a is handled by a final (smaller) multiply-and-add.
    if (lena1!=0)
        small_or_big_multiply_and_add(a1, lena1, b, lenb, c1, lenc1, w);
}
8607 
// c = c + a*b where both operands have at least KARATSUBA_CUTOFF digits.
// Balanced (or off-by-one) cases go straight to karatsuba_and_add().
// Because every contribution is accumulated anyway, the unbalanced case
// does not need the two-pass (write, then add) arrangement used in
// certainly_big_multiply(): chunks of len digits are simply processed
// left to right, each via karatsuba_and_add(), with any ragged tail
// handled by a final smaller multiply-and-add.
inline void certainly_big_multiply_and_add(const std::uint64_t *a,
        std::size_t lena,
        const std::uint64_t *b, std::size_t lenb,
        std::uint64_t *c, std::size_t lenc,
        std::uint64_t *w)
{   if (lena == lenb)
    {   karatsuba_and_add(a, lena, b, lenb, c, lenc, w);
        return;
    }
    if (lena < lenb)   // arrange that a is the longer operand
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// Now b is the shorter operand. The case (2n)*(2n-1) will be handled
// using Karatsuba merely by treating the smaller number as if padded with
// a leading zero.
    if (lena%2==0 && lenb==lena-1)
    {   karatsuba_and_add(a, lena, b, lenb, c, lenc, w);
        return;
    }
// If the two inputs are unbalanced in length I will perform multiple
// balanced operations each of which can be handled specially. I will
// try to make each subsidiary multiplication as big as possible.
// This will be lenb rounded up to an even number.
// I will be willing to do chunks that are of an even size that is
// either lenb or lenb+1.
    std::size_t len = lenb + (lenb & 1);
    const std::uint64_t *a1 = a;           // cursor into a
    std::uint64_t *c1 = c;                 // matching output cursor
    std::size_t lena1 = lena, lenc1 = lenc;
// because this is "certainly big" I know I can do at least one
// Karatsuba stage.
    for (;;)
    {   karatsuba_and_add(a1, len, b, lenb, c1, lenc1, w);
        c1 += len;
        lenc1 -= len;
        a1 += len;
        lena1 -= len;
// I will keep going provided the next multiplication I will do will fully fit.
        if (lena1 < len) break;
    }
// Do a shorter nice Multiply (without Add) to reach the end of input a.
    if (lena1 != 0)
        small_or_big_multiply_and_add(a1, lena1, b, lenb, c1, lenc1, w);
}
8653 
8654 // I am going to hope that the compiler turns this into a tail-call to
8655 // either certainly_big_multiply or classical_multiply with very
8656 // little overhead.
8657 
small_or_big_multiply(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c,std::uint64_t * w)8658 inline void small_or_big_multiply(const std::uint64_t *a,
8659                                   std::size_t lena,
8660                                   const std::uint64_t *b, std::size_t lenb,
8661                                   std::uint64_t *c, std::uint64_t *w)
8662 {   if (lena < KARATSUBA_CUTOFF || lenb < KARATSUBA_CUTOFF)
8663     {   if (lena==1) classical_multiply(a[0], b, lenb, c);
8664         else if (lenb==1) classical_multiply(b[0], a, lena, c);
8665         else classical_multiply(a, lena, b, lenb, c);
8666     }
8667     else certainly_big_multiply(a, lena, b, lenb, c, w);
8668 }
8669 
small_or_big_multiply_and_add(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c,std::size_t lenc,std::uint64_t * w)8670 inline void small_or_big_multiply_and_add(const std::uint64_t *a,
8671         std::size_t lena,
8672         const std::uint64_t *b, std::size_t lenb,
8673         std::uint64_t *c, std::size_t lenc,
8674         std::uint64_t *w)
8675 {   if (lena < KARATSUBA_CUTOFF || lenb < KARATSUBA_CUTOFF)
8676     {   if (lena==1) classical_multiply_and_add(a[0], b, lenb, c, lenc);
8677         else if (lenb==1) classical_multiply_and_add(b[0], a, lena, c, lenc);
8678         else classical_multiply_and_add(a, lena, b, lenb, c, lenc);
8679     }
8680     else certainly_big_multiply_and_add(a, lena, b, lenb, c, lenc, w);
8681 }
8682 
// KARA_FIXED_LENGTH_LIMIT: Can multiply inputs with up to this number of
//                     64-bit digits using the fixed workspace.
// KARA_WORKSPACE_SIZE: Length of the "w" work-vector needed for the above
//                     which is a bit over twice the length of the inputs.
//
// These need to be such that it is OK to put three arrays of length
// KARA_WORKSPACE_SIZE*sizeof(uint64_t) on the stack without that feeling
// embarrassing. The settings I use here can use around 50 Kbytes of stack.
8691 
// Largest input length (in 64-bit digits) handled with stack workspace.
INLINE_VAR const std::size_t KARA_FIXED_LENGTH_LIMIT = 1000;
// Length of each stack work-vector: a bit over twice the input limit.
INLINE_VAR const std::size_t KARA_WORKSPACE_SIZE = 2050;
8694 
8695 // These two functions allocate workspace for Karatsuba on the stack and
8696 // are called when the inputs are short enough for that to feel reasonable.
8697 
allocate_one_array(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c)8698 inline void allocate_one_array(const std::uint64_t *a,
8699                                std::size_t lena,
8700                                const std::uint64_t *b, std::size_t lenb,
8701                                std::uint64_t *c)
8702 {   std::uint64_t kara_workspace[KARA_WORKSPACE_SIZE];
8703     certainly_big_multiply(a, lena, b, lenb, c, kara_workspace);
8704 }
8705 
allocate_three_arrays(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c)8706 inline void allocate_three_arrays(const std::uint64_t *a,
8707                                   std::size_t lena,
8708                                   const std::uint64_t *b, std::size_t lenb,
8709                                   std::uint64_t *c)
8710 {   std::uint64_t kara_workspace[KARA_WORKSPACE_SIZE];
8711     std::uint64_t kara_workspace0[KARA_WORKSPACE_SIZE];
8712     std::uint64_t kara_workspace1[KARA_WORKSPACE_SIZE];
8713     top_level_certainly_big_multiply(a, lena, b, lenb, c,
8714                                      kara_workspace, kara_workspace0, kara_workspace1);
8715 }
8716 
// Pack a pair of small operand lengths (m, n) into one integer so that a
// switch statement can dispatch on both lengths at once.
inline constexpr int BY(int m, int n)
{   return 4*n + m;
}
8720 
8721 // Finally I can provide the top-level entrypoint that accepts signed
8722 // integers that may not be the same size.
8723 
8724 // This is the main entrypoint to the integer multiplication code. It
8725 // takes two signed numbers and forms their product.
8726 
inline void bigmultiply(const std::uint64_t *a, std::size_t lena,
                        const std::uint64_t *b, std::size_t lenb,
                        std::uint64_t *c, std::size_t &lenc)
{
// Signed product of bignums a (lena digits) and b (lenb digits) into c,
// which must have space for lena+lenb digits. On exit lenc holds the
// trimmed length of the result.
//
// If a and/or b are negative then I can treat their true values as
//    a = sa + va      b = sb + vb
// where sa and sb are the signs - represented here as 0 for a positive
// number and -2^(64*len) for a negative one. va and vb are then the simple
// bit-patterns for a and b but now interpreted as unsigned values. So if
// instead of using 64-bit digits I was using 8 bit ones, the value -3
// would be stored as 0xfd and that would be split up as -128 + 253.
// Then a*b = sa*sb + sa*vb + sb*va + va*vb.
// The last item there is just the product of a and b when treated as
// unsigned values, and so is what I compute first here rather simply.
// If sa and/or sb is non-zero it is just the negative of a power of 2^64,
// and so I can correct the unsigned product into a signed one by (sometimes)
// subtracting a shifted version of a or b from it.
// If both arguments are tiny I write out the code in-line. The timings I
// have taken suggest that this makes a significant difference to costs, and
// I view it as plausible that "rather small" cases will often dominate.
    if (lena <= 4 && lenb <= 4) switch (lena + 4*lenb)
        {
// Length 2 result
            case BY(1, 1):
            {   std::int64_t c1;
                std::uint64_t c0;
                signed_multiply64(a[0], b[0], c1, c0);
                c[0] = c0;
// Trim the top digit when it carries no information beyond the sign.
                if (shrinkable(c1, c0)) lenc = 1;
                else
                {   c[1] = static_cast<std::uint64_t>(c1);
                    lenc = 2;
                }
                return;
            }

// Length 3 result
            case BY(1, 2):
                std::swap(a, b);
            // drop through.
            case BY(2, 1):
            {   if (b[0]==0)
                {   c[0] = 0;
                    lenc = 1;
                    return;
                }
                std::int64_t c2;
                std::uint64_t c1, c0;
// Low partial product unsigned; the top one signed so that the sign of
// the 2-digit operand a is respected.
                multiply64(b[0], a[0], c1, c0);
                c[0] = c0;
                signed_multiply64(b[0], a[1], c1, c2, c1);
// If the 1-digit operand was negative, correct by subtracting a<<64.
                if (negative(b[0]))
                    c2 = static_cast<std::int64_t>(
                             static_cast<std::uint64_t>(c2) -
                             subtract_with_borrow(c1, a[0], c1));
                c[1] = c1;
// If you have an input like 2^63 it will be represented as a 2-word
// bignum {0,2^63} so that the most significant bit of the most significant
// digits is 0 to show that it is positive. If you multiply two such numbers
// the direct result you get is {0, 0, 2^62, 0} and the top non-zero digit
// now does not have its top bit set. So TWO words can be trimmed from the
// top. This issue may not arise in the length-1 by length-2 case here, but
// I leave the test in to feel safe and tidy.
                if (shrinkable(c2, c1))
                {   if (shrinkable(c1, c0)) lenc = 1;
                    else lenc = 2;
                }
                else
                {   c[2] = static_cast<std::uint64_t>(c2);
                    lenc = 3;
                }
                return;
            }

// Length 4 result
            case BY(2, 2):
            {   std::int64_t c3;
                std::uint64_t c2, c1;
                mul2x2S(static_cast<std::int64_t>(a[1]), a[0],
                        static_cast<std::int64_t>(b[1]), b[0],
                        c3, c2, c1, c[0]);
                c[1] = c1;
                c[2] = c2;
                if (shrinkable(c3, c2))
                {   if (shrinkable(c2, c1)) lenc = 2;
                    else lenc = 3;
                }
                else
                {   c[3] = static_cast<std::uint64_t>(c3);
                    lenc = 4;
                }
                return;
            }

            case BY(1, 3):
                std::swap(a, b);
            // drop through.
            case BY(3, 1):
            {   if (b[0]==0)
                {   c[0] = 0;
                    lenc = 1;
                    return;
                }
                std::int64_t c3;
                std::uint64_t c2, c1;
// Two unsigned partial products then a signed one for the top digit.
                multiply64(b[0], a[0], c1, c[0]);
                multiply64(b[0], a[1], c1, c2, c1);
                signed_multiply64(b[0], a[2], c2, c3, c2);
// If the 1-digit operand was negative, subtract a<<64 to correct.
                if (negative(b[0]))
                {   std::uint64_t borrow = subtract_with_borrow(c1, a[0], c1);
                    borrow = subtract_with_borrow(c2, a[1], borrow, c2);
                    c3 = static_cast<std::int64_t>(
                             static_cast<std::uint64_t>(c3) - borrow);
                }
                c[1] = c1;
                c[2] = c2;
                if (shrinkable(c3, c2))
                {   if (shrinkable(c2, c1)) lenc = 2;
                    else lenc = 3;
                }
                else
                {   c[3] = static_cast<std::uint64_t>(c3);
                    lenc = 4;
                }
                return;
            }

// Length 5 result
            case BY(1, 4):
                std::swap(a, b);
            // drop through.
            case BY(4, 1):
            {   if (b[0]==0)
                {   c[0] = 0;
                    lenc = 1;
                    return;
                }
                std::int64_t c4;
                std::uint64_t c3, c2, c1;
                multiply64(b[0], a[0], c1, c[0]);
                multiply64(b[0], a[1], c1, c2, c1);
                multiply64(b[0], a[2], c2, c3, c2);
                signed_multiply64(b[0], a[3], c3, c4, c3);
// Negative 1-digit operand: subtract a<<64.
                if (negative(b[0]))
                {   std::uint64_t borrow = subtract_with_borrow(c1, a[0], c1);
                    borrow = subtract_with_borrow(c2, a[1], borrow, c2);
                    borrow = subtract_with_borrow(c3, a[2], borrow, c3);
                    c4 = static_cast<std::int64_t>(
                             static_cast<std::uint64_t>(c4) - borrow);
                }
                c[1] = c1;
                c[2] = c2;
                c[3] = c3;
                if (shrinkable(c4, c3))
                {   if (shrinkable(c3, c2)) lenc = 3;
                    else lenc = 4;
                }
                else
                {   c[4] = static_cast<std::uint64_t>(c4);
                    lenc = 5;
                }
                return;
            }

            case BY(2, 3):
                std::swap(a, b);
            // drop through.
            case BY(3, 2):
            {   std::int64_t c4;
                std::uint64_t c3, c3a, c2;
// 2x2 unsigned product of the low parts, then fold in a[2]*b.
                mul2x2(a[1], a[0], b[1], b[0],
                       c3, c2, c[1], c[0]);
                multiply64(a[2], b[0], c2, c3a, c2);
                std::uint64_t carry = add_with_carry(c3, c3a, c3);
                signed_multiply64(static_cast<std::int64_t>(a[2]),
                                  static_cast<std::int64_t>(b[1]), c3, c4, c3);
                c4 = static_cast<std::int64_t>(
                         static_cast<std::uint64_t>(c4) + carry);
// Sign corrections: subtract shifted copies of the other operand when
// either input was negative.
                if (negative(b[1]))
                {   std::uint64_t borrow = subtract_with_borrow(c2, a[0], c2);
                    borrow = subtract_with_borrow(c3, a[1], borrow, c3);
                    c4 = static_cast<std::int64_t>(
                             static_cast<std::uint64_t>(c4) - borrow);
                }
                if (negative(a[2]))
                {   std::uint64_t borrow = subtract_with_borrow(c3, b[0], c3);
                    c4 = static_cast<std::int64_t>(
                             static_cast<std::uint64_t>(c4) - borrow);
                }
                c[2] = c2;
                c[3] = c3;
                if (shrinkable(c4, c3))
                {   if (shrinkable(c3, c2)) lenc = 3;
                    else lenc = 4;
                }
                else
                {   c[4] = static_cast<std::uint64_t>(c4);
                    lenc = 5;
                }
                return;
            }

// Length 6 results
            case BY(2, 4):
            case BY(4, 2):
// I do not have to implement all the cases here - any that I either choose
// not to or have not got around to can merely go "break;" and join in the
// generic path.
                break;

            case BY(3, 3):
            {   std::int64_t c5;
                std::uint64_t c4, c3, c2, c1;
                mul3x3S(a[2], a[1], a[0], b[2], b[1], b[0],
                        c5, c4, c3, c2, c1, c[0]);
                c[1] = c1;
                c[2] = c2;
                c[3] = c3;
                c[4] = c4;
                if (shrinkable(c5, c4))
                {   if (shrinkable(c4, c3)) lenc = 4;
                    else lenc = 5;
                }
                else
                {   c[5] = static_cast<std::uint64_t>(c5);
                    lenc = 6;
                }
                return;
            }

// Length 7 results
            case BY(3, 4):
            case BY(4, 3):
// As above, cases that have not been coded here do not cause failure,
// they just lead to that case being handled by the general (loopy) code.
                break;


// Length 8 result
            case BY(4, 4):
            {   std::uint64_t c7, c6, c5, c4;
// Unsigned 4x4 product, then subtract shifted operands for negatives.
                mul4x4(a[3], a[2], a[1], a[0],
                       b[3], b[2], b[1], b[0],
                       c7, c6, c5, c4, c[3], c[2], c[1], c[0]);
                if (negative(a[3]))
                {   std::uint64_t borrow = subtract_with_borrow(c4, b[0], c4);
                    borrow = subtract_with_borrow(c5, b[1], borrow, c5);
                    borrow = subtract_with_borrow(c6, b[2], borrow, c6);
                    c7 = static_cast<std::int64_t>(
                             static_cast<std::uint64_t>(c7) - b[3] - borrow);
                }
                if (negative(b[3]))
                {   std::uint64_t borrow = subtract_with_borrow(c4, a[0], c4);
                    borrow = subtract_with_borrow(c5, a[1], borrow, c5);
                    borrow = subtract_with_borrow(c6, a[2], borrow, c6);
                    c7 = static_cast<std::int64_t>(
                             static_cast<std::uint64_t>(c7) - a[3] - borrow);
                }
                c[4] = c4;
                c[5] = c5;
                c[6] = c6;
                if (shrinkable(c7, c6))
                {   if (shrinkable(c6, c5)) lenc = 6;
                    else lenc = 7;
                }
                else
                {   c[7] = static_cast<std::uint64_t>(c7);
                    lenc = 8;
                }
                return;
            }

            default:
// The default label should never be activated!
                ;
        }

// If the smaller input is reasonably small I will merely use classical
// multiplication.
// It is necessary to make a special case for multiplication by a 1-word
// bignum for two reasons. (a) multiplication by zero yields a zero result
// regardless of the magnitude of the second operand, and (b) one of
// my implementations of classical_multiplication must be called in
// a separate overloaded version to multiply by just one digit.
    if (lena < KARATSUBA_CUTOFF || lenb < KARATSUBA_CUTOFF)
    {   if (lena==1)
        {   if (a[0]==0)
            {   c[0] = 0;
                lenc = 1;
                return;
            }
            else classical_multiply(a[0], b, lenb, c);
        }
        else if (lenb==1)
        {   if (b[0]==0)
            {   c[0] = 0;
                lenc = 1;
                return;
            }
            else classical_multiply(b[0], a, lena, c);
        }
        else classical_multiply(a, lena, b, lenb, c);
// I do NOT return here because for a non-zero result I will need to adjust
// if one or both of the input numbers were negative.
    }
    else
    {
// For many smaller cases I will just use some static pre-allocated workspace
// and hence avoid potential storage management overheads.
        if (lena <= KARA_FIXED_LENGTH_LIMIT ||
            lenb <= KARA_FIXED_LENGTH_LIMIT)
        {   if (lena < PARAKARA_CUTOFF ||
                lenb < PARAKARA_CUTOFF)
                allocate_one_array(a, lena, b, lenb, c);
            else allocate_three_arrays(a, lena, b, lenb, c);
        }
        else
        {   push(a); push(b);
            std::size_t lenw;
            if (lena < lenb) lenw = lena;
            else lenw = lenb;
            for (std::size_t i=lenw; i>8; i=i/2) lenw++;
// I give myself workspace as long as the shorter input + log of that. The
// extra logarithmic bit is because each time I split a number into its top
// and bottom parts I may have an odd number and so the workspace needed
// gets rounded up by a constant amount for each level of division.
            std::uint64_t *w = reserve(2*lenw);
            if (lena < PARAKARA_CUTOFF ||
                lenb < PARAKARA_CUTOFF)
            {   pop(b); pop(a);
                certainly_big_multiply(a, lena, b, lenb, c, w);
            }
            else
            {   std::uint64_t *w0 = reserve(4*lenw);
                std::uint64_t *w1 = reserve(4*lenw);
                pop(b); pop(a);
                top_level_certainly_big_multiply(a, lena, b, lenb, c,
                                                 w, w0, w1);
                abandon(w1);
                abandon(w0);
            }
            abandon(w);
        }
    }
// Now adapt for the situation where one or both of the inputs had been
// negative.
    if (negative(a[lena-1]))
    {   std::uint64_t borrow = 0;
        for (std::size_t i=0; i<lenb; i++)
            borrow = subtract_with_borrow(c[i+lena], b[i], borrow, c[i+lena]);
    }
    if (negative(b[lenb-1]))
    {   std::uint64_t borrow = 0;
        for (std::size_t i=0; i<lena; i++)
            borrow = subtract_with_borrow(c[i+lenb], a[i], borrow, c[i+lenb]);
    }
// The actual value may be 1 or 2 words shorter than this. So test the top
// digit of c and if necessary reduce lenc.
// Also note that the pending  result is at least of length 2 here because
// various small cases had been processed in-line earlier.
// eg {0, 0x8000000000000000} times itself is {0, 0, 0x4000000000000000, 0}
// and both leading zeros can be trimmed.
    lenc = lena + lenb;
    if (lenc > 1 && shrinkable(c[lenc-1], c[lenc-2]))
    {   lenc--;
        if (lenc > 1 && shrinkable(c[lenc-1], c[lenc-2])) lenc--;
    }
}
9093 
9094 //===========================================================================
9095 //===========================================================================
9096 
op(std::uint64_t * a,std::uint64_t * b)9097 inline std::intptr_t Times::op(std::uint64_t *a, std::uint64_t *b)
9098 {   std::size_t lena = number_size(a);
9099     std::size_t lenb = number_size(b);
9100     std::size_t n = lena+lenb;
9101     push(a); push(b);
9102     std::uint64_t *p = reserve(n);
9103     pop(b); pop(a);
9104     std::size_t final_n;
9105 // I might like to optimise the 2*2 case here or even 2*3 and 3*3?
9106     bigmultiply(a, lena, b, lenb, p, final_n);
9107     return confirm_size(p, n, final_n);
9108 }
9109 
op(std::int64_t a,std::int64_t b)9110 inline std::intptr_t Times::op(std::int64_t a, std::int64_t b)
9111 {   std::int64_t hi;
9112     std::uint64_t lo;
9113     signed_multiply64(a, b, hi, lo);
9114     if ((hi==0 && positive(lo)) ||
9115         (hi==-1 && negative(lo)))
9116     {   if (fits_into_fixnum(static_cast<std::int64_t>(lo)))
9117             return int_to_handle(static_cast<std::int64_t>(lo));
9118         std::uint64_t *r = reserve(1);
9119         r[0] = lo;
9120         return confirm_size(r, 1, 1);
9121     }
9122     std::uint64_t *r = reserve(2);
9123     r[0] = lo;
9124     r[1] = hi;
9125     return confirm_size(r, 2, 2);
9126 }
9127 
// Product of a signed 64-bit integer and a bignum. The unsigned product
// is formed digit-by-digit and then corrected for the signs of each input.
inline std::intptr_t Times::op(std::int64_t a, std::uint64_t *b)
{   std::size_t lenb = number_size(b);
// push/pop protect b across reserve(), which may allocate.
    push(b);
    std::uint64_t *c = reserve(lenb+1);
    pop(b);
    std::uint64_t hi = 0;
// Unsigned product of the bit-pattern of a with each digit of b.
    for (std::size_t i=0; i<lenb; i++)
        multiply64(a, b[i], hi, hi, c[i]);
    c[lenb] = hi;
// If a was negative, correct by subtracting b shifted up one digit:
// adding ~b[i] with an initial carry of 1 is that subtraction in
// two's-complement form.
    if (negative(a))
    {   std::uint64_t carry = 1;
        for (std::size_t i=0; i<lenb; i++)
            carry = add_with_carry(c[i+1], ~b[i], carry, c[i+1]);
    }
// If b was negative, subtract a shifted up by the full length of b:
// only the top digit is affected.
    if (negative(b[lenb-1])) c[lenb] -= a;
    std::size_t lenc = lenb+1;
// Trim a redundant leading digit (zero or all-ones sign extension).
    truncate_positive(c, lenc);
    truncate_negative(c, lenc);
    return confirm_size(c, lenb+1, lenc);
}
9148 
// Multiplication is commutative: delegate to the (int64, bignum) case.
inline std::intptr_t Times::op(std::uint64_t *a, std::int64_t b)
{   return Times::op(b, a);
}
9152 
9153 // For big multi-digit numbers squaring can be done almost twice as fast
9154 // as general multiplication.
9155 // eg (a0,a1,a2,a3)^2 can be expressed as
9156 // a0^2+a1^2+a2^2+a3^2 + 2*(a0*a1+a0*a2+a0*a3+a1*a2+a1*a3+a2*a3)
9157 // where the part that has been doubled uses symmetry to reduce the work.
9158 //
9159 // For negative inputs I can form the product first treating the inputs
9160 // as if they had been unsigned, and then subtract 2*2^w*a from the result.
9161 //
9162 // I think my view here is that I should still be willing to move across
9163 // to Karatsuba, but only at a distinctly larger threshold than for
9164 // simple multiplication. Just where that threshold should be i snot really
9165 // clear to me, but for now I am setting it as 3 times the point at which
9166 // ordinary multiplications moves on from classical methods.
9167 
// Square the bignum a (lena digits) into r, setting lenr to the trimmed
// result length. Exploits symmetry so the cross products a[i]*a[j] (i<j)
// are computed once and doubled.
inline void bigsquare(std::uint64_t *a, std::size_t lena,
                      std::uint64_t *r, std::size_t &lenr)
{
// Beyond a threshold hand the work to the general multiplier, which can
// use Karatsuba.
    if (lena > 3*KARATSUBA_CUTOFF)
    {   bigmultiply(a, lena, a, lena, r, lenr);
        return;
    }
    for (std::size_t i=0; i<2*lena; i++) r[i] = 0;
    std::uint64_t carry;
    lenr = 2*lena;
// Accumulate the off-diagonal products a[i]*a[j] for j>i.
    for (std::size_t i=0; i<lena; i++)
    {   std::uint64_t hi = 0;
// Note that all the terms I add in here will need to be doubled in the
// final accounting.
        for (std::size_t j=i+1; j<lena; j++)
        {   std::uint64_t lo;
            multiply64(a[i], a[j], hi, hi, lo);
            hi += add_with_carry(lo, r[i+j], r[i+j]);
        }
        r[i+lena] = hi;
    }
// Double the part that has been computed so far.
    carry = 0;
    for (std::size_t i=0; i<2*lena; i++)
    {   std::uint64_t w = r[i];
        r[i] = (w << 1) | carry;
        carry = w >> 63;
    }
// Now add in the bits that do not get doubled, i.e. the diagonal
// products a[i]^2.
    carry = 0;
    std::uint64_t hi = 0;
    for (std::size_t i=0; i<lena; i++)
    {   std::uint64_t lo;
        multiply64(a[i], a[i], r[2*i], hi, lo);
        carry = add_with_carry(lo, carry, r[2*i]);
        carry = add_with_carry(hi, r[2*i+1], carry, r[2*i+1]);
    }
// Now if the input had been negative I have a correction to apply...
// I subtract 2a from the top half of the result (adding ~w with an
// initial carry of 1 performs the subtraction of the doubled digits w).
    if (negative(a[lena-1]))
    {   std::uint64_t carry = 1;
        int fromprev = 0;
        for (std::size_t i=0; i<lena; i++)
        {   std::uint64_t d = a[i];
            std::uint64_t w = (d<<1) | fromprev;
            fromprev = static_cast<int>(d>>63);
            carry = add_with_carry(r[lena+i], ~w, carry, r[lena+i]);
        }
    }
// The actual value may be 1 word shorter than this.
//  test top digit or r and if necessary reduce lenr.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}
9221 
op(std::uint64_t * a)9222 inline std::intptr_t Square::op(std::uint64_t *a)
9223 {   std::size_t lena = number_size(a);
9224     std::size_t n = 2*lena;
9225     push(a);
9226     std::uint64_t *p = reserve(n);
9227     pop(a);
9228     std::size_t final_n;
9229     bigsquare(a, lena, p, final_n);
9230     return confirm_size(p, n, final_n);
9231 }
9232 
op(std::int64_t a)9233 inline std::intptr_t Square::op(std::int64_t a)
9234 {   std::uint64_t hi, lo;
9235     multiply64(a, a, hi, lo);
9236     if (a < 0) hi -= 2u*static_cast<std::uint64_t>(a);
9237 // Now I have a 128-bit product of the inputs
9238     if ((hi == 0 && positive(lo)) ||
9239         (hi == static_cast<std::uint64_t>(-1) && negative(lo)))
9240     {   if (fits_into_fixnum(static_cast<std::int64_t>(lo)))
9241             return int_to_handle(static_cast<std::int64_t>(lo));
9242         else
9243         {   std::uint64_t *p = reserve(1);
9244             p[0] = lo;
9245             return confirm_size(p, 1, 1);
9246         }
9247     }
9248     std::uint64_t *p = reserve(2);
9249     p[0] = lo;
9250     p[1] = hi;
9251     return confirm_size(p, 2, 2);
9252 }
9253 
// Integer square root of a multi-digit bignum via Newton iteration on
// Bignum values. Single-digit inputs are delegated to the int64 overload.
inline std::intptr_t Isqrt::op(std::uint64_t *a)
{   std::size_t lena = number_size(a);
    if (lena == 1) return Isqrt::op(static_cast<std::int64_t>(a[0]));
// The root has about half as many digits as the input.
    std::size_t lenx = (lena+1)/2;
    push(a);
    std::uint64_t *x = reserve(lenx);
    pop(a);
    for (std::size_t i=0; i<lenx; i++) x[i] = 0;
// Compute a starting bit position: roughly half the bit-length of a.
    std::size_t bitstop = a[lena-1]==0 ? 0 : 64 - nlz(a[lena-1]);
    bitstop /= 2;
    if ((lena%2) == 0) bitstop += 32;
    x[lenx-1] = static_cast<std::uint64_t>(1) << bitstop;
    if (bitstop == 63) x[lenx-1]--; // ensure it is still positive!
// I now have a first approximation to the square root as a number that is
// a power of 2 with about half the bit-length of a. I will degenerate into
// using generic arithmetic here even though that may have extra costs.
//
// I could perhaps reasonably use uint64_t arithmetic for a first few
// iterations, only looking at the most significant digit of the input.
// That would save time, however at present I do not expect this function
// to be time critical in any plausible application, and so I will keep
// things simple(er).
    Bignum biga(true, vector_to_handle(a));
    Bignum bigx(true, confirm_size(x, lenx, lenx));
// I will do the first step outside the loop to guarantee that my
// approximation is an over-estimate before I try the end-test.
//         bigx = (bigx + biga/bigx) >> 1;
// The push/pop mess here feels extreme and I should probably re-code this
// using lower level interfaces.
    push(bigx.val); push(biga.val);
    Bignum w1 = biga/bigx;
    pop(biga.val); pop(bigx.val);
    push(bigx.val); push(biga.val);
    w1 = bigx + w1;
    pop(biga.val); pop(bigx.val);
    push(bigx.val); push(biga.val);
    bigx = w1 >> 1;
    pop(biga.val); pop(bigx.val);
// Newton iteration: x <- (x + a/x)/2 until the value stops decreasing.
    for (;;)
    {   push(bigx.val); push(biga.val);
        w1 = biga/bigx;
        pop(biga.val); pop(bigx.val);
        push(bigx.val); push(biga.val);
        w1 = bigx + w1;
        pop(biga.val); pop(bigx.val);
        push(bigx.val); push(biga.val);
        Bignum y = w1 >> 1;
        pop(biga.val); pop(bigx.val);
        if (y >= bigx) break;
        bigx = y;
    }
// The Bignum "biga" encapsulated my argument: when its destructor is called
// I do not want the input vector "a" to be clobbered, so I clobber the
// bignum first to break the link. Ditto bigx.
    biga.val = 0;
    std::intptr_t r = bigx.val;
    bigx.val = 0;
    return r;
}
9313 
op(std::int64_t aa)9314 inline std::intptr_t Isqrt::op(std::int64_t aa)
9315 {   if (aa <= 0) return int_to_bignum(0);
9316     std::uint64_t a = static_cast<std::uint64_t>(aa);
9317     std::size_t w = 64 - nlz(a);
9318     std::uint64_t x0 = a >> (w/2);
9319 // The iteration here converges to sqrt(a) from above, but I believe that
9320 // when the value stops changing it will be at floor(sqrt(a)). There are
9321 // some cases where the sequence ends up alternating between two adjacent
9322 // values. Because my input is at most 2^63-1 the number of iterations
9323 // written here will always suffice.
9324     std::uint64_t x1 = (x0 + a/x0)/2;
9325     std::uint64_t x2 = (x1 + a/x1)/2;
9326     if (x2 >= x1) return unsigned_int_to_bignum(x1);
9327     std::uint64_t x3 = (x2 + a/x2)/2;
9328     if (x3 >= x2) return unsigned_int_to_bignum(x2);
9329     std::uint64_t x4 = (x3 + a/x3)/2;
9330     if (x4 >= x3) return unsigned_int_to_bignum(x3);
9331     std::uint64_t x5 = (x4 + a/x4)/2;
9332     if (x5 >= x4) return unsigned_int_to_bignum(x4);
9333     return unsigned_int_to_bignum(x5);
9334 }
9335 
9336 // This raises a bignum to a positive integer power. If the power is n then
9337 // the size of the output may be n*lena. The two vectors v and w are workspace
9338 // and must both be of size (at least) the size that the result could end
// up as. Well with greater subtlety we can see that the sum of their sizes
// must be at least the size of the result, but it is not clear that any
// useful space saving can be found down that path.
9342 
bigpow(std::uint64_t * a,std::size_t lena,std::uint64_t n,std::uint64_t * v,std::uint64_t * w,std::uint64_t * r,std::size_t & lenr,std::size_t maxlenr)9343 inline void bigpow(std::uint64_t *a, std::size_t lena,
9344                    std::uint64_t n,
9345                    std::uint64_t *v,
9346                    std::uint64_t *w,
9347                    std::uint64_t *r, std::size_t &lenr, std::size_t maxlenr)
9348 {   if (n == 0)
9349     {   r[0] = 0;
9350         lenr = 1;
9351         return;
9352     }
9353     internal_copy(a, lena, v);
9354     std::size_t lenv = lena;
9355     w[0] = 1;
9356     std::size_t lenw = 1;
9357     while (n > 1)
9358     {   if (n%2 == 0)
9359         {   bigsquare(v, lenv, r, lenr);
9360             arithlib_assert(lenr <= maxlenr);
9361             internal_copy(r, lenr, v);
9362             lenv = lenr;
9363             n = n/2;
9364         }
9365         else
9366         {   bigmultiply(v, lenv, w, lenw, r, lenr);
9367             arithlib_assert(lenr <= maxlenr);
9368             internal_copy(r, lenr, w);
9369             lenw = lenr;
9370             bigsquare(v, lenv, r, lenr);
9371             arithlib_assert(lenr <= maxlenr);
9372             internal_copy(r, lenr, v);
9373             lenv = lenr;
9374             n = (n-1)/2;
9375         }
9376     }
9377     bigmultiply(v, lenv, w, lenw, r, lenr);
9378 }
9379 
9380 // In cases where n is too large this can fail. At present I deal with that
9381 // with arithlib_assert() statements rather than any comfortable scheme for
9382 // reporting the trouble.
9383 
9384 // The code that dispatches into here should have filtered cases such that
9385 // the exponent n is not 0, 1 or 2 here.
9386 
op(std::uint64_t * a,std::int64_t n)9387 inline std::intptr_t Pow::op(std::uint64_t *a, std::int64_t n)
9388 {   std::size_t lena = number_size(a);
9389 //  1^(-n) == 1,
9390 //  (-1)^(-n) == 1 if n is even or -1 if n is odd.
9391 //  a^(-n) == 0 when a is not -1, 0 or 1.
9392     if (n < 0)
9393     {   int z = 0;
9394         if (lena == 0)
9395         {   if (static_cast<std::int64_t>(a[0]) == 1) z = 1;
9396             else if (static_cast<std::int64_t>(a[0]) == -1)
9397                 z = (n%1==0 ? 1 : -1);
9398             else arithlib_assert(a[0] != 0u);
9399         }
9400 // 0^(-n) is an error
9401 // 1^(-n) = 1
9402 // (-1)^(-n) = +1 or -1 depending on whether n is odd or even
9403 // x^(-n) = 0 otherwise.
9404         return int_to_bignum(z);
9405     }
9406 // 6^n = 0
9407     std::size_t bitsa = bignum_bits(a, lena);
9408     std::uint64_t hi, bitsr;
9409     multiply64(n, bitsa, hi, bitsr);
9410     arithlib_assert(hi ==
9411                     0); // Check that size is at least somewhat sane!
9412 // I estimate the largest size that my result could be, but then add
9413 // an extra word because the internal working of multiplication can
9414 // write a zero beyond its true result - eg if you are multiplying a pair
9415 // of 1-word numbers together it will believe that the result could be 2
9416 // words wide even if in fact you know it will not be.
9417     std::uint64_t lenr1 = 2 + bitsr/64;
9418     std::size_t lenr = static_cast<std::size_t>(lenr1);
9419 // if size_t was more narrow than 64-bits I could lose information in
9420 // truncating from uint64_t to size_t.
9421     arithlib_assert(lenr == lenr1);
9422     std::uint64_t olenr = lenr;
9423     push(a);
9424     std::uint64_t *r = reserve(lenr);
9425     push(r);
9426     std::uint64_t *v = reserve(lenr);
9427     push(v);
9428     std::uint64_t *w = reserve(lenr);
9429     pop(v); pop(r); pop(a);
9430     bigpow(a, lena, static_cast<std::uint64_t>(n), v, w, r, lenr, lenr);
9431     arithlib_assert(lenr <= olenr);
9432     abandon(w);
9433     abandon(v);
9434     return confirm_size(r, olenr, lenr);
9435 }
9436 
9437 // Again the cases n = 0, 1 and 2 have been filtered out
9438 
op(std::int64_t a,std::int64_t n)9439 inline std::intptr_t Pow::op(std::int64_t a, std::int64_t n)
9440 {   if (n < 0)
9441     {   int z = 0;
9442         if (a == 1) z = 1;
9443         else if (a == -1) z = (n%1==0 ? 1 : 0);
9444         else arithlib_assert(a != 0);
9445         return int_to_handle(z);
9446     }
9447     if (a == 0) return int_to_handle(0);
9448     else if (a == 1) return int_to_handle(a);
9449     else if (n == 0) return int_to_handle(1);
9450     std::uint64_t absa = (a < 0 ? -static_cast<std::uint64_t>
9451                           (a) : static_cast<std::uint64_t>(a));
9452     std::size_t bitsa = 64 - nlz(absa);
9453     std::uint64_t hi, bitsr;
9454     multiply64(n, bitsa, hi, bitsr);
9455     arithlib_assert(hi ==
9456                     0); // Check that size is at least somewhat sane!
9457     std::uint64_t lenr1 = 2 + bitsr/64;
9458     if (bitsr < 64) // Can do all the work as machine integers.
9459     {   std::int64_t result = 1;
9460         for (;;)
9461         {   if (n%2 != 0) result *= a;
9462             if ((n = n/2) == 0) break;
9463             a *= a;
9464         }
9465         return int_to_bignum(result);
9466     }
9467     std::size_t lenr = static_cast<std::size_t>(lenr1);
9468 // if size_t was more narrow than 64-bits I could lose information in
9469 // truncating from uint64_t to size_t.
9470     arithlib_assert(lenr == lenr1);
9471     std::uint64_t olenr = lenr;
9472     std::uint64_t *r = reserve(lenr);
9473     push(r);
9474     std::uint64_t *v = reserve(lenr);
9475     push(v);
9476     std::uint64_t *w = reserve(lenr);
9477     pop(v); pop(r);
9478     std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
9479     bigpow(aa, 1, static_cast<std::uint64_t>(n), v, w, r, lenr, lenr);
9480     arithlib_assert(lenr <= olenr);
9481     abandon(w);
9482     abandon(v);
9483     return confirm_size(r, olenr, lenr);
9484 }
9485 
op(std::uint64_t * a,double n)9486 inline double Pow::op(std::uint64_t *a, double n)
9487 {   return std::pow(Double::op(a), n);
9488 }
9489 
op(std::int64_t a,double n)9490 inline double Pow::op(std::int64_t a, double n)
9491 {   return std::pow(Double::op(a), n);
9492 }
9493 
9494 //=========================================================================
9495 //=========================================================================
9496 
9497 
9498 //
9499 // Division with the main number representation being 2s complement
9500 // turns out to have many messy special cases! Here are some of the
9501 // underlying issues:
9502 // . Inputs may have had initial 0 or -1 digits pre-pended to allow
9503 //   for positive values with their top bit set and negative ones with
9504 //   it clear. So if I had 8-bit words the number 128 would have an
9505 //   unsigned representation of [0x80], but it has to be stored as a
9506 //   two digit number [0x00,0x80]. Similarly some negative numbers
9507 //   need an initial 0xff attached just so that it can be seen that they
9508 //   are negative.
9509 // . If a result (quotient or remainder) is as above then space can be
9510 //   needed for the prefix digit.
9511 // . Long division needs to have a dividend with at least 3 digits
9512 //   (after scaling) and a divisor with at least 2. That means that various
9513 //   small cases have to be treated specially.
9514 // . An operation as basic as taking the absolute value of an integer
9515 //   generally involves allocating memory, and I would like to avoid that
9516 //   as much as I can.
9517 // . quotient and remainder operations are very similar indeed, but I ought
//   to be able to save memory allocation in one or the other. Specifically
9519 //   if I am computing a remainder I can discard quotient digits as I go
9520 //   rather than having anywhere to put them.
9521 // . On many useful platforms I will have an integer type that is 128 bits
9522 //   wide and I can use that for a 128-by-64 division operation that is
9523 //   really helpful when working with 64-bit digits. It is possible that
9524 //   if I do not have 128-bit arithmetic available it would be best to
9525 //   treat my numbers as being in base 2^32 so that 64-by-32 division is
9526 //   the order of the day as a basic primitive.
9527 // . For Lisp purposes I will have "fixnums" as well as "bignums" so special
9528 //   paths for arithmetic that involves values -2^59 to 2^59-1 will be
9529 //   required.
9530 //
9531 // Well perhaps I am fussing about all the above. But my first drafts of this
// code had not thought through all the cases carefully enough!
9533 
9534 
9535 // Divide the bignum a by the b, returning a quotient or a remainder or
9536 // both. Note that at this stage a may still be negative! The value b is
// passed in sign and magnitude form as {b, b_negative}
9538 
// Divide the (possibly negative, 2s complement) bignum a by the unsigned
// single-word value b whose sign is supplied separately in b_negative.
// The quotient and/or remainder are freshly allocated and handed back via
// the reference parameters, under the control of want_q and want_r.
inline void unsigned_short_division(std::uint64_t *a,
                                    std::size_t lena,
                                    std::uint64_t b, bool b_negative,
                                    bool want_q, std::uint64_t *&q,
                                    std::size_t &olenq, std::size_t &lenq,
                                    bool want_r, std::uint64_t *&r,
                                    std::size_t &olenr, std::size_t &lenr)
{   std::uint64_t hi = 0;
    bool a_negative = false;
    std::uint64_t *aa;
    if (negative(a[lena-1]))
    {   a_negative = true;
// Take absolute value of a if necessary. The push/pop pairs protect live
// pointers across reserve(), which may allocate storage (this follows the
// convention used throughout this file).
        push(a);
        aa = reserve(lena);
        pop(a);
        internal_negate(a, lena, aa);
        a = aa;
    }
// Now both a and b are positive so I can do the division fairly simply.
// Allocate space for the quotient if I need that, and then do standard
// short division.
    std::size_t i=lena-1;
    if (want_q)
    {   olenq = lena;
        push(a);
        q = reserve(olenq);
        pop(a);
    }
// Standard short division working down from the most significant digit;
// hi carries the running remainder from digit to digit.
    for (;;)
    {   std::uint64_t d;
        divide64(hi, a[i], b, d, hi);
        if (want_q) q[i] = d;
        if (i == 0) break;
        i--;
    }
// If the original a had been negative I allocated space to store its
// absolute value, and I can discard that now.
    if (a_negative) abandon(aa);
    if (want_q)
    {   lenq = lena;
// The quotient will be negative if divisor and dividend had different signs.
        if (a_negative != b_negative)
        {   internal_negate(q, lenq, q);
            truncate_negative(q, lenq);
        }
        else truncate_positive(q, lenq);
    }
    if (want_r)
    {
// The remainder is now present as an unsigned value in hi. The sign it
// must end up having must match the sign of a (the dividend). Furthermore
// the remainder will be strictly smaller than b, and the largest possible
// value for b is 0xffffffffffffffff. The remainder may need to be returned
// as a 2-digit bignum.
        if (want_q) push(q);
        if (a_negative)
        {   hi = -hi;
// A negative remainder whose top bit is clear needs a -1 pad digit so
// that its 2s complement representation reads as negative.
            if (positive(hi) && hi!=0)
            {   olenr = lenr = 2;
                r = reserve(olenr);
                r[0] = hi;
                r[1] = -1;
            }
            else
            {   olenr = lenr = 1;
                r = reserve(olenr);
                r[0] = hi;
            }
        }
// Similarly a positive remainder with its top bit set needs a 0 pad digit.
        else
        {   if (negative(hi))
            {   olenr = lenr = 2;
                r = reserve(olenr);
                r[0] = hi;
                r[1] = 0;
            }
            else
            {   olenr = lenr = 1;
                r = reserve(olenr);
                r[0] = hi;
            }
        }
        if (want_q) pop(q);
    }
}
9625 
signed_short_division(std::uint64_t * a,std::size_t lena,std::int64_t b,bool want_q,std::uint64_t * & q,std::size_t & olenq,std::size_t & lenq,bool want_r,std::uint64_t * & r,std::size_t & olenr,std::size_t & lenr)9626 inline void signed_short_division(std::uint64_t *a, std::size_t lena,
9627                                   std::int64_t b,
9628                                   bool want_q, std::uint64_t *&q,
9629                                   std::size_t &olenq, std::size_t &lenq,
9630                                   bool want_r, std::uint64_t *&r,
9631                                   std::size_t &olenr, std::size_t &lenr)
9632 {   if (b > 0) unsigned_short_division(a, lena,
9633                                            static_cast<std::uint64_t>(b),
9634                                            false,
9635                                            want_q, q, olenq, lenq,
9636                                            want_r, r, olenr, lenr);
9637     else unsigned_short_division(a, lena, -static_cast<std::uint64_t>(b),
9638                                      true,
9639                                      want_q, q, olenq, lenq,
9640                                      want_r, r, olenr, lenr);
9641 }
9642 
// Forward declaration: the full definition appears further down this file,
// but division() just below needs to be able to call it.
inline void unsigned_long_division(std::uint64_t *a,
                                   std::size_t &lena,
                                   std::uint64_t *b, std::size_t &lenb,
                                   bool want_q, std::uint64_t *q,
                                   std::size_t &olenq, std::size_t &lenq);
9648 
9649 // The following is a major entrypoint to the division code. (a) and (b) are
9650 // vectors of digits such that the top digit of a number is treated as signed
9651 // and the lower ones as unsigned. To cope with this there will sometimes
9652 // be a sort of initial padder digit. The two boolean values indicate whether
9653 // either or both of quotient and remainder are required. if want_q is set
9654 // then this creates a new vector for q and return it via q/lenq. Similarly
9655 // for want_r. The inputs a and b can be bignums of any size and are allowed
9656 // to be positive or negative - this sorts everything out.
9657 
9658 // Divide a by b to obtain a quotient q and a remainder r.
9659 
inline void division(std::uint64_t *a, std::size_t lena,
                     std::uint64_t *b, std::size_t lenb,
                     bool want_q, std::uint64_t *&q, std::size_t &olenq, std::size_t &lenq,
                     bool want_r, std::uint64_t *&r, std::size_t &olenr, std::size_t &lenr)
{   arithlib_assert(want_q || want_r);
// First I will filter out a number of cases where the divisor is "small".
// I only want to proceed into the general case code if it is a "genuine"
// big number with at least two digits. This bit of the code is messier
// than one might have imagined because of the 2s complement representation
// I use and the fact that extreme values that almost fit in a single
// digit can end up as 2-digit values with a degenerate top digit.
//
// The first case is when the single digit of b is a signed value in the
// range -2^63 to 2^63-1.
    if (lenb == 1)
    {
// At present I cause an attempt to divide by zero to crash with an
// arithlib_assert failure if I have built in debug mode or to do who
// knows what (possibly raise an exception) otherwise. This maybe needs
// review. I wonder if I should throw a "division by zero" exception?
        arithlib_assert(b[0] != 0); // would be division by zero
        signed_short_division(a, lena, static_cast<std::int64_t>(b[0]),
                              want_q, q, olenq, lenq,
                              want_r, r, olenr, lenr);
        return;
    }
// Next I have b in the range 2^63 to 2^64-1. Such values can be represented
// in uint64_t.
    else if (lenb == 2 && b[1]==0)
    {   unsigned_short_division(a, lena, b[0], false,
                                want_q, q, olenq, lenq,
                                want_r, r, olenr, lenr);
        return;
    }
// Now for b in -2^64 to -2^63-1. The 2s complement representation will be
// of the form (-1,nnn) with nnn an unsigned 64-bit value.
    else if (lenb == 2 && b[1]==static_cast<std::uint64_t>(-1))
    {
// -b(0) is an unsigned representation of the absolute value of b. There is
// one special case when -b(0) is zero, and that corresponds to division
// by -2^64, so I will need to detect that and turn the division into a
// combination of shift and negate operations.
        if (b[0] == 0)
        {   if (want_q)
            {   lenq = lena;
                olenq = lena;
                push(a);
                q = reserve(lena);
                pop(a);
// The next line took me some while to arrive at!
                std::uint64_t carry = !negative(a[lena-1]) || a[0]==0 ? 1 : 0;
                for (std::size_t i=1; i<lena; i++)
                    carry = add_with_carry(~a[i], carry, q[i-1]);
                q[lena-1] = negative(a[lena-1]) ? 0 : -static_cast<std::uint64_t>(1);
                truncate_positive(q, lenq);
                truncate_negative(q, lenq);
            }
            if (want_r)
            {
// The remainder here is just the bottom digit of a, possibly padded with
// an extra digit so that its 2s complement sign matches the sign of a.
                std::uint64_t rr = a[0], padr = 0;
                lenr = 1;
                if (negative(a[lena-1]) && positive(rr))
                {   padr = -1;
                    lenr++;
                }
                else if (positive(a[lena-1]) && negative(rr))
                {   padr = 0;
                    lenr++;
                }
                if (want_q) push(q);
                r = reserve(lenr);
                if (want_q) pop(q);
                olenr = lenr;
                r[0] = rr;
                if (lenr != 1) r[1] = padr;
            }
            return;
        }
        unsigned_short_division(a, lena, -b[0], true,
                                want_q, q, olenq, lenq,
                                want_r, r, olenr, lenr);
        return;
    }
// Now the absolute value of b will be at least 2 digits of 64-bits with the
// high digit non-zero. I need to make a copy of it because I will scale
// it during long division.
    std::uint64_t *bb = NULL;
    std::size_t lenbb = lenb;
    bool b_negative = negative(b[lenb-1]);
    if (b_negative)
    {
// In the case that b is negative I will want its absolute value. Especially
// in a multi-thread world I must not disturb or overwrite the input vector,
// so I create a temporary copy of b to negate. In my full 2s complement
// representation negating -2^(64*n-1) would lead to what was supposed to be
// a positive value but it would have its top bit set so it would require
// an extra leading 0. Because the value I generate here is to be treated
// as unsigned this leading top bit does not matter and so the absolute value
// of b fits in the same amount of space that b did with no risk of overflow.
        push(a); push(b);
        bb = reserve(lenb);
        pop(b); pop(a);
        olenr = lenb;
        internal_negate(b, lenb, bb);
        if (bb[lenbb-1] == 0) lenbb--;
    }
    else if (b[lenb-1] == 0) lenbb--;
    arithlib_assert(lenbb >= 2);
// Now I should look at the dividend. If it is shorter than the divisor
// then I know that the quotient will be zero and the dividend will be the
// remainder. If I had made this test before normalizing the divisor I could
// have needed to worry about the case of (-2^(64n-1))/(2^(64n-1)) where the
// divisor would have had an initial padding zero so would have shown up
// as longer than the dividend but the quotient would have needed to come out
// as 1. But here with the divisor made tidy this test is safe!
    if (lena < lenbb)
    {   if (want_q)
        {   q = reserve(1);
            olenq = 1;
            q[0] = 0;
            lenq = 1;
        }
        if (want_r)
        {   push(a);
            r = reserve(lena);
            pop(a);
            olenr = lena;
            internal_copy(a, lena, r);
            lenr = lena;
        }
        if (b_negative) abandon(bb);
        return;
    }
// Now lena >= lenb >= 2 and I will need to do a genuine long division. This
// will need me to allocate some workspace.
//
// Because I will scale the divisor I need that to be a copy of the
// original data even if that has been positive and so I had not copied
// it yet. I delay creation of that copy until now because that lets me
// avoid a spurious allocation in the various small cases.
    if (!b_negative)
    {   push(a); push(b);
        bb = reserve(lenb);
        pop(b); pop(a);
        olenr = lenb;
        internal_copy(b, lenbb, bb);
    }
#ifdef DEBUG_OVERRUN
    if (debug_arith) arithlib_assert(bb[olenr] == 0xaaaaaaaaaaaaaaaa);
#endif
// If I actually return the quotient I may need to add a leading 0 or -1 to
// make its 2s complement representation valid. Hence the "+2" rather than
// the more obvious "+1" here.
    if (want_q)
    {   lenq = lena - lenb + 2;
        push(a); push(b); push(bb);
        q = reserve(lenq);
        pop(bb); pop(b); pop(a);
        olenq = lenq;
    }
// I will need space where I store something that starts off as a scaled
// copy of the dividend and gradually have values subtracted from it until
// it ends up as the remainder.
    lenr = lena;
    push(a); push(b); push(bb);
    if (want_q) push(q);
    r = reserve(lenr+1);
    if (want_q) pop(q);
    pop(bb); pop(b); pop(a);
    bool a_negative = negative(a[lena-1]);
    if (a_negative) internal_negate(a, lena, r);
    else internal_copy(a, lena, r);
    unsigned_long_division(r, lenr, bb, lenbb, want_q, q, olenq, lenq);
#ifdef DEBUG_OVERRUN
    if (debug_arith) arithlib_assert(r[lena+1] == 0xaaaaaaaaaaaaaaaa);
#endif
// While performing the long division I will have had three vectors that
// were newly allocated. r starts off containing a copy of a but ends up
// holding the remainder. It is rather probable that this remainder will
// often be a distinctly shorter vector than a was. The vector q is only
// created and used if want_q was set, and it ends up holding the quotient.
// finally bb holds the absolute value of the divisor but scaled up by a
// power of 2 so that its leading digit has its top bit set. Well the actual
// remainder is smaller than the divisor and so it will be a closer fit into
// bb than r. So copy it into there so that the allocate/abandon and
// size confirmation code is given less extreme things to cope with.
    arithlib_assert(lenr<=lenb);
    if (want_r) internal_copy(r, lenr, bb);
    abandon(r);
    if (want_q)
    {   if (negative(q[lenq-1]))
        {   arithlib_assert(lenq < olenq);
            q[lenq++] = 0;
        }
// The quotient is negative when dividend and divisor had opposite signs.
        if (a_negative != b_negative)
        {   internal_negate(q, lenq, q);
            truncate_negative(q, lenq);
        }
        else truncate_positive(q, lenq);
    }
//  else abandon(q);
    if (want_r)
    {   r = bb;
        if (negative(r[lenr-1]))
        {   arithlib_assert(lenr < olenr);
            r[lenr++] = 0;
        }
// The remainder takes the sign of the dividend.
        if (a_negative)
        {   internal_negate(r, lenr, r);
            truncate_negative(r, lenr);
        }
        else truncate_positive(r, lenr);
    }
    else abandon(bb);
}
9874 
9875 // During long division I will scale my numbers by shifting left by an
9876 // amount s. I do that in place. The shift amount will be such that
9877 // the divisor ends up with the top bit of its top digit set. The
9878 // dividend will need to extend into an extra digit, and I deal with that
9879 // by returning the overflow word as a result of the scaling function. Note
9880 // that the shift amount will be in the range 0-63.
9881 
9882 
inline std::uint64_t scale_for_division(std::uint64_t *r,
                                        std::size_t lenr,
                                        int s)
{
// Shift the lenr-digit number r left by s bits (0 <= s <= 63), in place,
// returning the bits that overflow out of the top digit. The s == 0 case
// is handled separately: not merely because then no data needs to move at
// all, but because the general path would evaluate a right shift by 64,
// which is undefined behaviour in C++.
    if (s == 0) return 0;
    std::uint64_t overflow = 0;
    std::size_t k = 0;
    while (k < lenr)
    {   std::uint64_t d = r[k];
        r[k++] = (d << s) | overflow;
        overflow = d >> (64-s);
    }
    return overflow;
}
9901 
9902 // r = r - b*q*base^(lena-lenb-1).
9903 
inline void multiply_and_subtract(std::uint64_t *r, std::size_t lenr,
                                  std::uint64_t q0,
                                  std::uint64_t *b, std::size_t lenb)
{   arithlib_assert(lenr > lenb);
// The subtraction is performed by adding the ones-complement of each digit
// of b*q0 with an initial carry-in of 1, ie by adding the 2s complement.
    std::uint64_t hi = 0, lo, carry = 1;
    for (std::size_t i=0; i<lenb; i++)
    {   multiply64(b[i], q0, hi, hi, lo);
// lo is now the next digit of b*q, and hi needs to be carried up to the
// next one.
        carry = add_with_carry(r[i+lenr-lenb-1], ~lo, carry,
                               r[i+lenr-lenb-1]);
    }
    r[lenr-1] = r[lenr-1] + ~hi + carry;
}
9918 
9919 // add_back_correction() is used when a quotient digit was mis-predicted by
// 1 and I detect that when I calculate r = r - b*q and end up with a negative
9921 // result. I fix things up by decrementing q and going
9922 //         r = r + (b<<(lenr-lenb-1))
9923 
add_back_correction(std::uint64_t * r,std::size_t lenr,std::uint64_t * b,std::size_t lenb)9924 inline void add_back_correction(std::uint64_t *r, std::size_t lenr,
9925                                 std::uint64_t *b, std::size_t lenb)
9926 {   arithlib_assert(lenr > lenb);
9927     std::uint64_t carry = 0;
9928     for (std::size_t i=0; i<lenb; i++)
9929         carry = add_with_carry(r[i+lenr-lenb-1], b[i], carry,
9930                                r[i+lenr-lenb-1]);
9931     r[lenr-1] += carry;
9932 }
9933 
inline std::uint64_t next_quotient_digit(std::uint64_t *r,
        std::size_t &lenr,
        std::uint64_t *b, std::size_t lenb)
{   arithlib_assert(lenr > lenb);
    arithlib_assert(lenb >= 2);
    arithlib_assert(b[lenb-1] != 0);
    std::uint64_t q0, r0;
// Estimate the next quotient digit from the top two digits of the dividend
// and the top digit of the divisor. This is the classical scheme (cf Knuth
// TAOCP vol 2, Algorithm 4.3.1D); the divisor will have been scaled so
// that its top bit is set, which keeps the estimate close.
    if (r[lenr-1] == b[lenb-1])
    {   q0 = static_cast<std::uint64_t>(-1);
        r0 = r[lenr-2] + b[lenb-1];
// Here perhaps q0 is still an over-estimate by 1?
    }
    else
    {   divide64(r[lenr-1], r[lenr-2], b[lenb-1], q0, r0);
// At this stage q0 may be correct or it may be an over-estimate by 1 or 2,
// but never any worse than that.
//
// The tests on the next lines should detect all cases where q0 was in error
// by 2 and most when it was in error by 1.
        std::uint64_t hi, lo;
        multiply64(q0, b[lenb-2], hi, lo);
        if (hi > r0 ||
            (hi == r0 && lo > r[lenr-3])) q0--;
    }
//
// Now I want to go "r = r - b*q0*2^(64*(lenr-lenb));" so that r
// is set to an accurate remainder after using q0 as (part of) the
// quotient. This may carry an overshoot into atop and if so I will need
// to reduce q0 again and compensate.
//
    multiply_and_subtract(r, lenr, q0, b, lenb);
// A negative top digit here signals that q0 was still 1 too large, so
// decrement it and add one copy of the divisor back in.
    if (negative(r[lenr-1]))
    {   q0--;
        add_back_correction(r, lenr, b, lenb);
    }
    lenr--;  // a is now one digit shorter.
    return q0;
}
9972 
9973 // r is an unsigned number. Shift right (in place) by s bits, where s
9974 // is in the range 0 - 63. The bits shifted out to the right should all
9975 // be zero.
9976 
unscale_for_division(std::uint64_t * r,std::size_t & lenr,int s)9977 inline void unscale_for_division(std::uint64_t *r, std::size_t &lenr,
9978                                  int s)
9979 {   if (s != 0)
9980     {   std::uint64_t carry = 0;
9981         std::size_t i = lenr-1;
9982         for (;;)
9983         {   std::uint64_t w = r[i];
9984             r[i] = (w >> s) | carry;
9985             carry = w << (64-s);
9986             if (i == 0) break;
9987             i--;
9988         }
9989         arithlib_assert(carry==0);
9990     }
9991     truncate_unsigned(r, lenr);
9992 }
9993 
9994 // This function does long division on unsigned values, computing the
9995 // quotient (a/b). In doing so it updates (a) so that at the end it holds
9996 // the remainder. It only fills in a value for the quotient q if want_q is
9997 // true. Note also that this code will scale (b) so that the top bit of its
9998 // highest digit is a "1", so b must be an array that can be overwritten
9999 // without disturbing code elsewhere.
10000 
// Note: olenq is accepted for signature compatibility with the forward
// declaration but is not used in this function body.
inline void unsigned_long_division(std::uint64_t *a,
                                   std::size_t &lena,
                                   std::uint64_t *b, std::size_t &lenb,
                                   bool want_q, std::uint64_t *q,
                                   std::size_t &olenq, std::size_t &lenq)
{   arithlib_assert(lenb >= 2);
    arithlib_assert(lena >= lenb);
// I will multiply a and b by a scale factor that gets the top digit of "b"
// reasonably large. The value stored in "a" can become one digit longer,
// but there is space to store that.
//
// The scaling is done here using a shift, which seems cheaper to sort out
// than multiplication by a single-digit value.
    arithlib_assert(b[lenb-1] != 0);
    int ss = nlz(b[lenb-1]);
// When I scale the dividend expands into an extra digit but the scale
// factor has been chosen so that the divisor does not.
    a[lena] = scale_for_division(a, lena, ss);
    lena++;
    arithlib_assert(scale_for_division(b, lenb, ss) == 0);
    lenq = lena-lenb; // potential length of quotient.
    std::size_t m = lenq-1;
// Generate quotient digits from the most significant end downwards; each
// call also reduces the dividend (a) towards the eventual remainder.
    for (;;)
    {   std::uint64_t qd = next_quotient_digit(a, lena, b, lenb);
// If I am only computing the remainder I do not need to store the quotient
// digit that I have just found.
        if (want_q) q[m] = qd;
        if (m == 0) break;
        m--;
    }
    unscale_for_division(a, lena, ss);
// The quotient is correct now but has been computed as an unsigned value
// so if its top digit has its top bit set I need to prepend a zero;
    if (want_q)
    {   if (negative(q[lenq-1])) q[lenq++] = 0;
        else truncate_unsigned(q, lenq);
    }
// The same applies to the remainder now left behind in a.
    if (negative(a[lena-1])) a[lena++] = 0;
    else truncate_unsigned(a, lena);
}
10041 
10042 // Use unsigned_long_division when all that is required is the remainder.
10043 // Here a>b and b is at least 2 words. The code corrupts b and replaces
10044 // a with remainder(a, b).
10045 
unsigned_long_remainder(std::uint64_t * a,std::size_t & lena,std::uint64_t * b,std::size_t & lenb)10046 inline void unsigned_long_remainder(std::uint64_t *a,
10047                                     std::size_t &lena,
10048                                     std::uint64_t *b, std::size_t &lenb)
10049 {   std::size_t w;
10050     unsigned_long_division(a, lena, b, lenb,
10051                            false, NULL, w, w);
10052 }
10053 
// Quotient of two bignums. division() is asked for the quotient only
// (want_q=true, want_r=false), so r is never filled in and only q is
// turned into a result handle via confirm_size.
inline std::intptr_t Quotient::op(std::uint64_t *a, std::uint64_t *b)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    division(a, lena, b, lenb,
             true, q, olenq, lenq,
             false, r, olenr, lenr);
    return confirm_size(q, olenq, lenq);
}
10064 
// Quotient of a bignum by a fixnum. The 64-bit divisor is placed in a
// one-word stack buffer bb so the general division() code can be used;
// only the quotient is requested.
inline std::intptr_t Quotient::op(std::uint64_t *a, std::int64_t b)
{   std::size_t lena = number_size(a);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
    division(a, lena, bb, 1,
             true, q, olenq, lenq,
             false, r, olenr, lenr);
    return confirm_size(q, olenq, lenq);
}
10075 
// A fixnum divided by a bignum ought always to yield 0, except that
// maybe -0x8000000000000000 / {0,0x8000000000000000} => -1
10078 
op(std::int64_t a,std::uint64_t * b)10079 inline std::intptr_t Quotient::op(std::int64_t a, std::uint64_t *b)
10080 {   if (number_size(b)==1 &&
10081         b[0]==-static_cast<std::uint64_t>(a)) return int_to_handle(-1);
10082     return int_to_handle(0);
10083 }
10084 
10085 // unpleasantly -0x8000000000000000 / -1 => a bignum
10086 
op(std::int64_t a,std::int64_t b)10087 inline std::intptr_t Quotient::op(std::int64_t a, std::int64_t b)
10088 {   if (b==-1 && a == MIN_FIXNUM) return int_to_bignum(-a);
10089     else return int_to_handle(a / b);
10090 }
10091 
// Remainder of two bignums. division() is asked for the remainder only
// (want_q=false, want_r=true), so q is never filled in and only r is
// turned into a result handle.
inline std::intptr_t Remainder::op(std::uint64_t *a, std::uint64_t *b)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    division(a, lena, b, lenb,
             false, q, olenq, lenq,
             true, r, olenr, lenr);
    return confirm_size(r, olenr, lenr);
}
10102 
// Remainder of a bignum by a fixnum. The 64-bit divisor goes into a
// one-word stack buffer bb; only the remainder is requested from
// division().
inline std::intptr_t Remainder::op(std::uint64_t *a, std::int64_t b)
{   std::size_t lena = number_size(a);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
    division(a, lena, bb, 1,
             false, q, olenq, lenq,
             true, r, olenr, lenr);
    return confirm_size(r, olenr, lenr);
}
10113 
op(std::int64_t a,std::uint64_t * b)10114 inline std::intptr_t Remainder::op(std::int64_t a, std::uint64_t *b)
10115 {   if (number_size(b)==1 &&
10116         b[0]==-static_cast<std::uint64_t>(a)) return int_to_handle(0);
10117     return int_to_handle(a);
10118 }
10119 
op(std::int64_t a,std::int64_t b)10120 inline std::intptr_t Remainder::op(std::int64_t a, std::int64_t b)
10121 {   return int_to_handle(a % b);
10122 }
10123 
10124 
10125 
10126 #ifdef LISP
10127 
10128 // In LISP mode I provide a function that returns both quotient and
10129 // remainder. In the other two modes I support the same idea but
10130 // as a function that delivers the quotient as its result and saves
10131 // the remainder via an additional argument.
10132 
10133 }
10134 
10135 namespace arithlib_implementation
10136 {
10137 
// LISP-mode divide of two bignums: computes quotient and remainder in one
// division() call and returns them as a pair (cons qq rr).
inline std::intptr_t Divide::op(std::uint64_t *a, std::uint64_t *b)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    division(a, lena, b, lenb,
             true, q, olenq, lenq,
             true, r, olenr, lenr);
// NOTE(review): r is confirmed with confirm_size but q with
// confirm_size_x -- presumably the "_x" form handles the vector that was
// not the most recently allocated; confirm against division()/reserve().
    std::intptr_t rr = confirm_size(r, olenr, lenr);
    std::intptr_t qq = confirm_size_x(q, olenq, lenq);
#ifdef ZAPPA
// In ZAPPA builds cons() returns a struct; .v is the underlying handle.
    return cons(qq, rr).v;
#else
    return cons(qq, rr);
#endif
}
10154 
// LISP-mode divide of a bignum by a fixnum. The divisor is put in a
// one-word stack buffer so the general division() code applies; both the
// quotient and the remainder are returned as a pair.
inline std::intptr_t Divide::op(std::uint64_t *a, std::int64_t bb)
{   std::size_t lena = number_size(a);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    std::uint64_t b[1] = {static_cast<std::uint64_t>(bb)};
    division(a, lena, b, 1,
             true, q, olenq, lenq,
             true, r, olenr, lenr);
    std::intptr_t rr = confirm_size(r, olenr, lenr);
    std::intptr_t qq = confirm_size_x(q, olenq, lenq);
#ifdef ZAPPA
// In ZAPPA builds cons() returns a struct; .v is the underlying handle.
    return cons(qq, rr).v;
#else
    return cons(qq, rr);
#endif
}
10171 
// LISP-mode divide of a fixnum by a bignum. The dividend is put in a
// one-word stack buffer; quotient and remainder come back as a pair.
inline std::intptr_t Divide::op(std::int64_t aa, std::uint64_t *b)
{   std::size_t lenb = number_size(b);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
    division(a, 1, b, lenb,
             true, q, olenq, lenq,
             true, r, olenr, lenr);
    std::intptr_t rr = confirm_size(r, olenr, lenr);
    std::intptr_t qq = confirm_size_x(q, olenq, lenq);
#ifdef ZAPPA
// In ZAPPA builds cons() returns a struct; .v is the underlying handle.
    return cons(qq, rr).v;
#else
    return cons(qq, rr);
#endif
}
10188 
// LISP-mode divide of two fixnums. Both values go into one-word stack
// buffers and the general division() code produces the (quotient,
// remainder) pair.
inline std::intptr_t Divide::op(std::int64_t aa, std::int64_t bb)
{   std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
    std::uint64_t b[1] = {static_cast<std::uint64_t>(bb)};
    division(a, 1, b, 1,
             true, q, olenq, lenq,
             true, r, olenr, lenr);
    std::intptr_t rr = confirm_size(r, olenr, lenr);
    std::intptr_t qq = confirm_size_x(q, olenq, lenq);
#ifdef ZAPPA
// In ZAPPA builds cons() returns a struct; .v is the underlying handle.
    return cons(qq, rr).v;
#else
    return cons(qq, rr);
#endif
}
10205 
10206 #else
10207 
// Non-LISP variant: the quotient is the function result and the
// remainder is delivered through the reference parameter rem.
inline std::intptr_t Divide::op(std::uint64_t *a, std::uint64_t *b,
                                std::intptr_t &rem)
{   std::size_t lena = number_size(a);
    std::size_t lenb = number_size(b);
    std::uint64_t *q, *r;
    std::size_t olenq, olenr, lenq, lenr;
    division(a, lena, b, lenb,
             true, q, olenq, lenq,
             true, r, olenr, lenr);
    rem = confirm_size(r, olenr, lenr);
    return confirm_size_x(q, olenq, lenq);
}
10220 
10221 #endif
10222 
10223 // a = a - b*q.
10224 
inline bool reduce_for_gcd(std::uint64_t *a, std::size_t lena,
                           std::uint64_t q,
                           std::uint64_t *b, std::size_t lenb)
{   arithlib_assert(lena == lenb || lena == lenb+1);
// Subtract q*b from a digit by digit, propagating the product high word
// (hi) and the subtraction borrow separately. Returns true if the result
// went negative (ie q was an overestimate of the true quotient).
    std::uint64_t hi = 0, hi1, lo, borrow = 0;
    for (std::size_t i=0; i<lenb; i++)
    {   multiply64(b[i], q, hi1, lo);
        hi1 += subtract_with_borrow(a[i], hi, a[i]);
        borrow = subtract_with_borrow(a[i], lo, borrow, a[i]);
        hi = hi1;
    }
// In the cases where this is used the difference |a - q*b| should be
// less than a. Well if q was computed accurately then it will be less
// than b. And if q is large it will at least be much less than a. So I
// am confident that testing the top bit of a[lena-1] after the subtraction
// will be a reliable test for overshoot. I might want to formalize this
// argument a bit better!
    if (lena > lenb) a[lena-1] = a[lena-1] - hi - borrow;
    return negative(a[lena-1]);
}
10245 
// The next function performs a = a - b*(q<<shift), but
// it computes it more as a = a - (b<<shift)*q.
// It will be used with 0 < shift < 64, ie only when a genuine shift
// between digits is required.
inline bool shifted_reduce_for_gcd(std::uint64_t *a, std::size_t lena,
                                   std::uint64_t q,
                                   std::uint64_t *b, std::size_t lenb,
                                   int shift)
{   arithlib_assert(lena == lenb+1 || lena == lenb+2);
// As reduce_for_gcd, but the digits of b are read through
// shifted_digit(), which presents b as if shifted left by "shift" bits,
// hence the loop covers lenb+1 digits. Returns true on overshoot.
    std::uint64_t hi = 0, hi1, lo, borrow = 0;
    for (std::size_t i=0; i<=lenb; i++)
    {   multiply64(shifted_digit(b, lenb, shift, i), q, hi1, lo);
        hi1 += subtract_with_borrow(a[i], hi, a[i]);
        borrow = subtract_with_borrow(a[i], lo, borrow, a[i]);
        hi = hi1;
    }
// In the cases where this is used the difference |a - q*b| should be
// less than a. Well if q was computed accurately then it will be less
// than b. And if q is large it will at least be much less than a. So I
// am confident that testing the top bit of a[lena-1] after the subtraction
// will be a reliable test for overshoot. I might want to formalize this
// argument a bit better!
    if (lena > lenb+1) a[lena-1] = a[lena-1] - hi - borrow;
    return negative(a[lena-1]);
}
10271 
10272 // Here we compute r = u*a - v*b, where lenr >= min(lena, lenb). This
10273 // is for use in Lehmer reductions.
10274 // In general this will be used as in
10275 //    ua_minus_vb(a, u1, b, v1, temp);
10276 //    ua_minus_vb(a, u2, b, v2, a);
10277 //    copy from temp to b
10278 // so note that the destination may be the same vector as one of the inputs.
10279 // This will only be used when a and b are almost the same length. I leave
10280 // a result of length lena even though I very much expect that in at least
10281 // almost all cases the result will be almost 128 bits smaller!
10282 
inline bool ua_minus_vb(std::uint64_t *a, std::size_t lena,
                        std::uint64_t u,
                        std::uint64_t *b, std::size_t lenb,
                        std::uint64_t v,
                        std::uint64_t *r, std::size_t &lenr)
{   arithlib_assert(lena == lenb || lena == lenb+1);
// Two double-length products are formed per digit; ca/cb carry the high
// words forward while borrow tracks the subtraction of the low words.
    std::uint64_t hia, loa, ca = 0, hib, lob, cb = 0, borrow = 0;
    for (std::size_t i=0; i<lenb; i++)
    {   multiply64(a[i], u, hia, loa);
// hia is the high part of a product so carrying 1 into it can not cause it
// to overflow. Just!
        hia += add_with_carry(loa, ca, loa);
        multiply64(b[i], v, hib, lob);
        hib += add_with_carry(lob, cb, lob);
        borrow = subtract_with_borrow(loa, lob, borrow, r[i]);
        ca = hia;
        cb = hib;
    }
    lenr = lenb;
// I want to report whether u*a-v*b was negative. To do that I will
// first note that the result that I am computing should be less than the
// value of a, so I do not get too much messy overflow. I will look at the
// borrow out from the top word of the result.
    if (lena > lenb)
    {   multiply64(a[lena-1], u, hia, loa);
        hia += add_with_carry(loa, ca, loa);
        borrow = subtract_with_borrow(loa, cb, borrow, r[lena-1]);
        lenr = lena;
        return negative(hia - borrow);
    }
    return negative(ca - cb - borrow);
}
10315 
10316 // Since the code here is quite short I will also provide a version
10317 // for r = -u*a + b*v;
10318 // Again this supposes that a is at least as long as b.
10319 
inline bool minus_ua_plus_vb(std::uint64_t *a, std::size_t lena,
                             std::uint64_t u,
                             std::uint64_t *b, std::size_t lenb,
                             std::uint64_t v,
                             std::uint64_t *r, std::size_t &lenr)
{   arithlib_assert(lena == lenb || lena == lenb+1);
// Mirror image of ua_minus_vb: the b*v product is the minuend and the
// a*u product the subtrahend, so the subtraction operands are swapped.
    std::uint64_t hia, loa, ca = 0, hib, lob, cb = 0, borrow = 0;
    for (std::size_t i=0; i<lenb; i++)
    {   multiply64(a[i], u, hia, loa);
        hia += add_with_carry(loa, ca, loa);
        multiply64(b[i], v, hib, lob);
        hib += add_with_carry(lob, cb, lob);
        borrow = subtract_with_borrow(lob, loa, borrow, r[i]);
        ca = hia;
        cb = hib;
    }
    lenr = lenb;
    if (lena > lenb)
    {   multiply64(a[lena-1], u, hia, loa);
        hia += add_with_carry(loa, ca, loa);
        borrow = subtract_with_borrow(cb, loa, borrow, r[lena-1]);
        lenr = lena;
// It will be perfectly reasonable for hia to be zero and borrow to be zero
// and hence the overall result positive.
        return negative(- hia - borrow);
    }
    return negative(cb - ca - borrow);
}
10348 
10349 // gcd_reduction starts with a > b and |b| >=2. It must reset a and
10350 // b (and their lengths) to be smaller. The basic Euclidean algorithm
10351 // would go
10352 //    a = a % b;   // otherwise a = a-q*b; for some useful value of q
10353 //                 // and then if q was "too large" go a = |a|;
10354 //    swap(a, b);
10355 // but a Lehmer-style scheme can go distinctly faster overall.
10356 
// NOTE(review): the olena/olenb parameters are not referenced in this
// body -- presumably retained for interface symmetry with the caller
// (Gcd::op); confirm before removing.
inline void gcd_reduction(std::uint64_t *&a, std::size_t &lena,
                          std::uint64_t *&b, std::size_t &lenb,
                          std::size_t &olena, std::size_t &olenb,
                          std::uint64_t *&temp, std::size_t &lentemp)
{
// I will start by collecting high bits from a and b. If I collect the
// contents of the top 3 words (ie 192 bits in all) then I will be able
// to normalize that to get 128 bits to work with however the top bits
// of a and b lie within the words.
    std::uint64_t a0=a[lena-1], a1=a[lena-2], a2=(lena>2 ? a[lena-3] : 0);
    int lza = nlz(a0);
    std::uint64_t b0=b[lenb-1], b1=b[lenb-2], b2=(lenb>2 ? b[lenb-3] : 0);
    int lzb = nlz(b0);
// I will sort out how many more bits are involved in a than in b. If
// this number is large I will invent a number q of the form q=q0*2^q1
// with q0 using almost all of 64 bits and go "a = a - q*b;". This
// will involve "virtually shifting" b left by q1 bits so what I actually
// do is "a = a - q0*(b<<q1);". It will be obvious that the idea is that
// q should be chosen so that the new value of a is as small as possible.
// Given that q will be an estimate for the correct quotient and so may
// occasionally be incorrect I will allow that it might in fact be
// too large. In such a case the value of a computed will end up negative,
// in which case I do a final step that goes "a = -a;" to fix that. If I
// manage to make q a round-to-nearest approximation to the quotient this
// might happen a significant fraction of the time, ideally getting me
// 1 extra bit in reduction in the size of a for each step.
// If the estimated quotient is accurate enough then this will leave
// a < b and by swapping a and b we have a new pair ready to continue from.
    std::int64_t diff = 64*(lena-lenb) + lzb - lza;
// If however the length-difference between a and b is small a subtraction
// "a = a - q0*(b<<0);" would often find q0 rather small and completing
// the remainder sequence would take many steps. So in such cases I take
// the top 128 bits of a and (128-diff) bits from b and start forming
// a remainder sequence using 128-bit arithmetic until a term in it
// fits in 64-bits. If the last 2 terms in that remainder sequence are
// p and q (with p having >64 bits and q <= 64 bits) I can have
//    p = |Ua - Vb|,    q = |-Wa + Xb|.
// where U, V, W and X should all fit in 64-bits. That gives me a new
// pair of values - expected to be up to 128-bits shorter - to continue
// my remainder sequence. Because my stopping condition for the
// approximate remainder sequence is not guaranteed perfect I can not
// be certain that q < p, so I will need to compare the values and
// swap as appropriate.
//
    if (diff < 60)
    {
// Try for Lehmer. The pair of values that will be 2-word surrogates
// for a and b here will be the top 128 bits of a and however many bits of
// b align with that. However if a has only 2 digits then I must NOT shift it
// left, because that would make it seem to have a power of 2 as a factor
// beyond any real such factors.
// It could be that lenb < lena, but because a and b differ in length
// by at most 60 bits in that case lenb==lena-1. So adjust values so as to
// align.
        if (lena != lenb)
        {   b2 = b1;
            b1 = b0;
            b0 = 0;
        }
        if (lena > 2)
        {   a0 = a0<<lza;
            if (lza!=0) a0 |= (a1>>(64-lza));
            a1 = a1<<lza;
            if (lza!=0) a1 |= (a2>>(64-lza));
            b0 = b0<<lza;
            if (lza!=0) b0 |= (b1>>(64-lza));
            b1 = b1<<lza;
            if (lza!=0) b1 |= (b2>>(64-lza));
        }
// I will maintain an identity
//          a = ua*A + va*B
//          b = ub*A + vb*B
// where A and B are the initial values in my remainder sequence and a and b
// are working ones calculated along the way. Note horribly well here that
// I am keeping these values as signed... but the code I have above that
// calculates u*a-b*v will take unsigned inputs!
        std::int64_t ua = 1, va = 0, ub = 0, vb = 1;
        while (b0!=0 || b1!=0)
        {   std::uint64_t q;
// Here I want to set q = {a0,a1}/{b0,b1}, and I expect that the answer
// is a reasonably small integer. But it could potentially be huge.
// At least I have filtered away the possibility {b0,b1}={0,0}.
// I will grab the top 64 bits of a and the top corresponding bits of b,
// because then I can do a (cheap) 64-by-64 division.
            int lza1 = a0==0 ? 64+nlz(a1) : nlz(a0);
            int lzb1 = b0==0 ? 64+nlz(b1) : nlz(b0);
            if (lzb1 > lza1+60) break; // quotient will be too big
            std::uint64_t ahi, bhi;
            if (lza1 == 0) ahi = a0;
            else if (lza1 < 64) ahi = (a0<<lza1) | (a1>>(64-lza1));
            else if (lza1 == 64) ahi = a1;
            else ahi = a1<<(lza1-64);
            if (lza1 == 0) bhi = b0;
            else if (lza1 < 64) bhi = (b0<<lza1) | (b1>>(64-lza1));
            else if (lza1 == 64) bhi = b1;
            else bhi = b1<<(lza1-64);
            if (bhi == 0) break;
// q could end up an over-estimate for the true quotient because bhi has
// been truncated and so under-represents b. If that happens then a-q*b will
// end up negative.
            q = ahi/bhi;
            if (negative(q)) break;
            arithlib_assert(q != 0);
// Now I need to go
//              ua -= q*va;
//              ub -= q*vb;
//              {a0,a1} -= q*{b0,b1}
// Then if a is negative I will negate a and ua and ub.
// Finally, if (as I mostly expect) now a<b I swap a<=>b, ua<=>ub and va<=>vb
// If I would get an overflow in updating ua or ub I will break out of the
// loop.
            std::int64_t h;
            std::uint64_t l1, l2;
            signed_multiply64(q, va, h, l1);
            if (static_cast<std::uint64_t>(h) + (l1>>63) != 0) break;
// There could be overflow in the following subtraction... So I check
// if that was about to happen and break out of the loop if so.
            if (ua >= 0)
            {   if (ua - INT64_MAX > static_cast<std::int64_t>(l1)) break;
            }
            else if (ua - INT64_MIN < static_cast<std::int64_t>(l1)) break;
            signed_multiply64(q, vb, h, l2);
            if (static_cast<std::uint64_t>(h) + (l2>>63) != 0) break;
            if (ub >= 0)
            {   if (ub - INT64_MAX > static_cast<std::int64_t>(l2)) break;
            }
            else if (ub - INT64_MIN < static_cast<std::int64_t>(l2)) break;
// I must either update both or neither of ua, ub.
            ua -= l1;
            ub -= l2;
            std::uint64_t hi, lo;
            multiply64(q, b1, hi, lo);
            hi += subtract_with_borrow(a1, lo, a1);
            std::uint64_t borrow = subtract_with_borrow(a0, hi, a0);
            borrow += subtract_with_borrow(a0, q*b0, a0);
// Now borrow!=0 if a had become negative
            if (borrow != 0)
            {
// Negate the 2-word value {a0,a1} and flip the signs of its cofactors.
                if ((a1 = -a1) == 0) a0 = -a0;
                else a0 = ~a0;
                ua = -ua;
                ub = -ub;
            }
            if (b0 > a0 ||
                (b0 == a0 && b1 > a1))
            {   std::swap(a0, b0);
                std::swap(a1, b1);
                std::swap(ua, va);
                std::swap(ub, vb);
            }
        }
// Ahah now I am almost done. I want to go
//          a' = |ua*a + ub*b|;
//          b' = |va*a + vb*b|;
//          if (a' > b') [a, b] = [a', b'];
//          else [a, b] = [b', a'];
// and in the first two lines I need to be aware that one or the other
// (but not both) of ua and ub will be negative so I really have a subtraction,
// and similarly for va, vb.
        if (temp == NULL)
        {   push(a);
            push(b);
            temp = reserve(lena>lenb ? lena : lenb);
            pop(b);
            pop(a);
        }
        if (ub < 0)
        {   arithlib_assert(ua >= 0);
            if (ua_minus_vb(a, lena, ua, b, lenb, -ub, temp, lentemp))
                internal_negate(temp, lentemp, temp);
        }
        else
        {   arithlib_assert(ua <= 0);
            if (minus_ua_plus_vb(a, lena, -ua, b, lenb, ub, temp, lentemp))
                internal_negate(temp, lentemp, temp);
        }
        truncate_unsigned(temp, lentemp);
        if (vb < 0)
        {   arithlib_assert(va >= 0);
            if (ua_minus_vb(a, lena, va, b, lenb, -vb, a, lena))
                internal_negate(a, lena, a);
        }
        else
        {   arithlib_assert(va <= 0);
            if (minus_ua_plus_vb(a, lena, -va, b, lenb, vb, a, lena))
                internal_negate(a, lena, a);
        }
        truncate_unsigned(a, lena);
        internal_copy(temp, lentemp, b);
        lenb = lentemp;
        return;
    }
// If I drop through to here I will do a simple reduction. This happens
// either if the initial quotient a/b is huge (over 2^60) or if as I start
// setting up for the Lehmer step I find I can not make enough progress
// with that to be useful. For instance if the next two steps would have
// q=1 and then q=<huge> I can not combine in the huge step to make Lehmer
// style progress and I should drop down and do the "q=1" reduction first
// (followed by the next huge one).
//
// This is the "a = a - q*b;" case.
// Collect the top 128 bits of both a and b.
    b0 = b0<<lzb;
    if (lzb!=0) b0 |= (b1>>(64-lzb));
    b1 = b1<<lzb;
    if (lzb!=0) b1 |= (b2>>(64-lzb));
    a0 = a0<<lza;
    if (lza!=0) a0 |= (a1>>(64-lza));
    a1 = a1<<lza;
    if (lza!=0) a1 |= (a2>>(64-lza));
    a2 = a2<<lza;
// When I have done this b0 will have its top bit set and I will
// want to have a0<b0 because I will be dividing {a0,a1}/b0 and I want the
// quotient to fit within a single 64-bit word.
    if (a0 >= b0)
    {   a2 = (a2>>1) | (a1<<63);
        a1 = (a1>>1) | (a0<<63);
        a0 = a0>>1;
        lza = lza-1;
        diff = diff+1;
    }
    std::uint64_t q, r;
// I want to get as close an approximation to the full quotient as I can,
// and a "correction" of the form {a0,a1} -= a0*b1/b0 should do the trick.
    multiply64(a0, b1, q, r);
    divide64(q, r, b0, q, r);
    r = a1 - q;
    if (r > a1) a0--;
    a1 = r;
    divide64(a0, a1, b0, q, r);
// Now I want to go "a = a - q*b*2^(diff-64);". The "-64" there is because
// the quotient I computed in q is essentially to be viewed as a fraction.
// So if diff<=64 I will need to do something special.
    if (diff <= 64)
    {   std::size_t bits_to_lose = 64 - diff;
// I will shift q right, but doing so in such a way that I try to round to
// nearest.
        if (bits_to_lose != 0)
        {   q = q >> (bits_to_lose-1);
            q = (q >> 1) + (q & 1);
        }
// Now just do "a = a-q*b;", then ensure that the result is positive
// and clear away any leading zeros left in its representation.
        if (reduce_for_gcd(a, lena, q, b, lenb))
            internal_negate(a, lena, a);
        truncate_unsigned(a, lena);
    }
    else
    {
// Here I need to do a reduction but the quotient in the step is very large
// so I will use the value of q I have as basically the top 60+ bits of the
// quotient I need but with "diff" bits stuck on the end. If diff is a
// multiple of 64 then this is merely a shift by some whole number of words.
        if ((diff%64) == 0)
        {   std::size_t diffw = diff/64;
            if (reduce_for_gcd(a+diffw-1, lena+1-diffw, q, b, lenb))
                internal_negate(a, lena, a);
            truncate_unsigned(a, lena);
        }
        else
        {   std::size_t diffw = diff/64;
            diff = diff%64;
            if (shifted_reduce_for_gcd(a+diffw-1, lena+1-diffw,
                                       q, b, lenb, diff))
                internal_negate(a, lena, a);
            truncate_unsigned(a, lena);
        }
    }
}
10625 
10626 // A bit of stray commentary here:
10627 // The simplest GCD scheme is direct Euclidean with the central loop
10628 // being
10629 //     q = a/b;
10630 //     {a, b} = {b, a - q*b};
10631 // There are those who observe that on average the quotient q will be
10632 // small, so they replace this with
10633 //     {a, b} = {a, a - b};
10634 //     swap a and b if necessary so that a>=b.
10635 // This takes more steps but each is a subtraction not a division/remainder
10636 // operation and so might sometimes be a win.
10637 // A "least-remainder" scheme is
10638 //     q = (a + b/2)/b;
10639 //     {a, b} = {b, |a - q*b|};
10640 // where the calculation of q just means round the quotient to nearest
10641 // rather than truncate it towards zero. At the cost of the extra absolute
10642 // value calculation this will reduce the number of steps. I believe that
10643 // using the Euclidean scheme each step shrinks the inputs by an average of
10644 // about 1.7 bits, while the least remainder scheme shrinks values by
10645 // 2.4 or 2.5 bits per step, ie it saves around 30% of the steps, albeit at
10646 // the cost of some absolute value calculations, which could go some way to
10647 // balance out the savings.
10648 // The quotient q will in general be small. In the case where it is very large
10649 // then calculating it becomes tedious. So in such cases it will make sense
10650 // to calculate a leading-digit approximation to it and reduce using that. A
// step of that nature would be essentially what was happening in long division
10652 // anyway, but now if the guessed quotient is not perfect all will be well
10653 // because subsequent reduction steps will correct for it automatically.
// A Lehmer-style scheme will be useful when the first several quotients in a
10655 // sequence will all be small - it consolidates big-number arithmetic over
10656 // what are logically multiple individual reduction steps.
10657 
10658 
// Gcd of two bignum arguments. The result is returned as a handle and is
// always non-negative; it may be a fixnum or a freshly allocated bignum.
inline std::intptr_t Gcd::op(std::uint64_t *a, std::uint64_t *b)
{   if (number_size(b) == 1) return Gcd::op(a,
                                                static_cast<std::int64_t>(b[0]));
// I will start by making copies of |a| and |b| that I can overwrite
// during the calculation and use part of in my result.
    std::size_t lena = number_size(a), lenb = number_size(b);
    std::size_t olena = lena, olenb = lenb;
    if (olena == olenb &&
        // See comments later for an explanation of this!
        negative(a[lena-1]) && negative(b[lenb-1]) &&
        a[lena-1] == b[lenb-1]) olena++;
// push/pop protect the raw pointers across reserve(), which may trigger
// garbage collection in a managed host system.
    push(a); push(b);
    std::uint64_t *av = reserve(olena);
    pop(b); pop(a);
    if (negative(a[lena-1])) internal_negate(a, lena, av);
    else internal_copy(a, lena, av);
    push(av); push(b);
    std::uint64_t *bv = reserve(olenb);
    pop(b); pop(av);
    if (negative(b[lenb-1])) internal_negate(b, lenb, bv);
    else internal_copy(b, lenb, bv);
    a = av;
    b = bv;
// Arrange that a holds the (unsigned) larger of the two values.
    if (big_unsigned_greaterp(b, lenb, a, lena))
    {   std::swap(a, b);
        std::swap(lena, lenb);
        std::swap(olena, olenb);
    }
// Now a >= b and both numbers are in freshly allocated memory. I will
// remember the sizes of these two arrays.
// Remove any leading zero digits, and if that reduces the situation to
// a 1-word case handle that specially.
    if (b[lenb-1] == 0) lenb--;
    if (a[lena-1] == 0) lena--;
    if (lenb == 1)
    {   std::uint64_t bb = b[0];
        std::uint64_t hi = 0, q;
// Long-by-short remainder: after this loop hi == a mod bb.
        for (std::size_t i=lena-1;; i--)
        {   divide64(hi, a[i], bb, q, hi);
            if (i == 0) break;
        }
// Finish with the word-sized Euclidean algorithm.
        while (hi != 0)
        {   std::uint64_t cc = bb % hi;
            bb = hi;
            hi = cc;
        }
        abandon(a);
        abandon(b);
        return unsigned_int_to_bignum(bb);
    }
// In some cases performing a reduction will require a workspace vector.
// I will only allocate this as and when first needed.
    std::uint64_t *temp = NULL;
    std::size_t lentemp = lena;
// Now at last a and b are genuine unsigned vectors without leading digits
// and with a > b. The next line is the key iteration in this whole procedure.
    while (lenb != 1)
    {   gcd_reduction(a, lena, b, lenb, olena, olenb, temp, lentemp);
        if (big_unsigned_greaterp(b, lenb, a, lena))
        {   std::swap(a, b);
            std::swap(lena, lenb);
            std::swap(olena, olenb);
        }
    }
    if (temp != NULL) abandon(temp);
// One possibility is that b==0 and then a holds the GCD. There is a
// pathological case where an input was -2^(64*n-1), which fits within n
// words, and the GCD ends up as +2^(64*n-1) which needs an extra word.
// If the other input had been bigger I can copy my result into it and
// survive. The very messy situation would be if both inputs were
// -2^(64*n-1) so had I worked in the obvious way I would not have enough
// space for the result. To allow for this I arrange that if both inputs
// start off the same size (and ideally I would check if both had a value
// of the form -2^(64*n-1), but doing that check is probably more expensive
// than occasionally over-allocating memory!) I enlarge one of the inputs by
// one word.
    if (b[0] == 0)
    {   if (negative(a[lena-1]))
        {   if (lena == olena)
            {   internal_copy(a, lena, b);
                abandon(a);
                a = b;
                olena = olenb;
            }
            else abandon(b);
#ifdef DEBUG_OVERRUN
            if (debug_arith)
            {   arithlib_assert(a[olena] == 0xaaaaaaaaaaaaaaaaU);
            }
#endif
// Append a zero digit so the top bit of the result no longer makes the
// value look negative.
            a[lena++] = 0;
#ifdef DEBUG_OVERRUN
            if (debug_arith)
            {   arithlib_assert(a[olena] == 0xaaaaaaaaaaaaaaaaU);
            }
#endif
        }
        else abandon(b);
        return confirm_size(a, olena, lena);
    }
// If b is not zero here then it represents a value up to 2^64-1, and I can
// complete the GCD by doing a long-by-short remainder and then a short-num
// GCD...
    std::uint64_t bb = b[0];
    abandon(b);
    std::uint64_t hi = 0, q;
    for (std::size_t i=lena-1;; i--)
    {   divide64(hi, a[i], bb, q, hi);
        if (i == 0) break;
    }
    abandon(a);
    while (hi != 0)
    {   std::uint64_t cc = bb % hi;
        bb = hi;
        hi = cc;
    }
    return unsigned_int_to_bignum(bb);
}
10777 
op(std::uint64_t * a,std::int64_t bb)10778 inline std::intptr_t Gcd::op(std::uint64_t *a, std::int64_t bb)
10779 {
10780 // This case involved doing a long-by-short remainder operation and then
10781 // it reduces to the small-small case. The main problem is the handling of
10782 // negative inputs.
10783     std::uint64_t b = bb < 0 ? -bb : bb;
10784     std::size_t lena = number_size(a);
10785     bool signa = negative(a[lena-1]);
10786     std::uint64_t hi = 0, q;
10787     for (std::size_t i=lena-1;; i--)
10788     {   divide64(hi, (signa ? ~a[i] : a[i]), b, q, hi);
10789         if (i == 0) break;
10790     }
10791 // Now if a had been positive we have hi=a%b. If a had been negative we
10792 // have (~a)%b == (-a-1)%b which is about |a|%b -1
10793     if (signa) hi = (hi+1)%b;
10794     return Gcd::op(b, hi);
10795 }
10796 
// gcd is symmetric, so delegate to the (bignum, fixnum) case.
inline std::intptr_t Gcd::op(std::int64_t a, std::uint64_t *b)
{   return Gcd::op(b, a);
}
10800 
op(std::int64_t a,std::int64_t b)10801 inline std::intptr_t Gcd::op(std::int64_t a, std::int64_t b)
10802 {
10803 // Take absolute values of both arguments.
10804     std::uint64_t aa = a < 0 ? -static_cast<std::uint64_t>(a) : a;
10805     std::uint64_t bb = b < 0 ? -static_cast<std::uint64_t>(b) : b;
10806 // Ensure that aa >= bb
10807     if (bb > aa) std::swap(aa, bb);
10808 // Do simple Euclidean algorithm
10809     while (bb != 0)
10810     {   std::uint64_t cc = aa % bb;
10811         aa = bb;
10812         bb = cc;
10813     }
10814 // A messy case is gcd(-MIX_FIXNUM, MIN_FIXNUM) which yields -MIN_FIXNUM
10815 // which is liable to be MAX_FIXNUM+1 and so has to be returned as a bignum.
10816     return unsigned_int_to_bignum(aa);
10817 }
10818 
10819 // I think I have space-leaks within my code for LCM. For use in
10820 // a system with garbage collection that will not matter (hoorah) but at
10821 // some stage I need to come back here and look harder and tidy things up.
10822 
// lcm of two bignums, computed as |a * (b / gcd(a,b))|. See the note
// above: intermediate handles are not all abandoned, so this may leak
// space in a non-garbage-collected setting.
inline std::intptr_t Lcm::op(std::uint64_t *a, std::uint64_t *b)
{   push(a); push(b);   // protect raw pointers across possible GC in Gcd::op
    std::intptr_t g = Gcd::op(a, b);
    pop(b);
    std::intptr_t q = op_dispatch2<Quotient,std::intptr_t>
                      (vector_to_handle(b),
                       g);
    pop(a);
    q = op_dispatch2<Times,std::intptr_t>(vector_to_handle(a), q);
    return op_dispatch1<Abs,std::intptr_t>(q);
}
10834 
// lcm of a bignum and a fixnum. Generic dispatch is used because the
// quotient and product can each be either big or small.
inline std::intptr_t Lcm::op(std::uint64_t *a, std::int64_t b)
{   push(a);   // protect a across possible GC in Gcd::op
    std::intptr_t g = Gcd::op(a, b);
    std::intptr_t q = op_dispatch2<Quotient,std::intptr_t>(int_to_handle(
                          b), g);
    pop(a);
    q = op_dispatch2<Times,std::intptr_t>(vector_to_handle(a), q);
    return op_dispatch1<Abs,std::intptr_t>(q);
}
10844 
// lcm is symmetric, so delegate to the (bignum, fixnum) case.
inline std::intptr_t Lcm::op(std::int64_t a, std::uint64_t *b)
{   return Lcm::op(b, a);
}
10848 
op(std::int64_t a,std::int64_t b)10849 inline std::intptr_t Lcm::op(std::int64_t a, std::int64_t b)
10850 {   std::intptr_t g = Gcd::op(a, b);
10851 // The GCD can only be a bignum if a = b = MIN_FIXNUM.
10852     if (stored_as_fixnum(g))
10853     {   b = b / int_of_handle(g);
10854         std::intptr_t q = Times::op(a, b); // possibly a bignum now
10855         return op_dispatch1<Abs,std::intptr_t>(q);
10856     }
10857     else return unsigned_int_to_bignum(-static_cast<std::uint64_t>
10858                                            (MIN_FIXNUM));
10859 }
10860 
10861 #ifdef CSL
10862 // Support for calculations modulo some integer value...
10863 
10864 // While initially developing this bit of code I will assume C++17 and
10865 // hence that inline variables are supported.
10866 
// Tags recording how the current modulus is stored.
static const int modulus_32 = 0;    // modulus fits in 32 bits
static const int modulus_64 = 1;    // modulus fits in 64 bits
static const int modulus_big = 2;   // modulus is a bignum
// On Windows these thread-locals may introduce serious overhead. I
// will worry about that later if needbe.
thread_local inline int modulus_size = 0;              // one of the tags above
thread_local inline std::uint64_t small_modulus = 2;   // valid unless modulus_big
// Backing store for a big modulus: the bignum header word followed by its
// digits (see SetModulus::op, which copies from n[-1]).
thread_local inline std::vector<std::uint64_t>
large_modulus_vector;
10876 
// Pointer to the digits of the saved big modulus. The backing vector
// stores the bignum header word first, so step past it.
inline std::uint64_t *large_modulus()
{   return 1 + reinterpret_cast<std::uint64_t *>
           (large_modulus_vector.data());
}
10881 
// Return the current modulus as a handle: a fresh bignum copy when it is
// big, otherwise a fixnum.
inline std::intptr_t value_of_current_modulus()
{   if (modulus_size == modulus_big)
    {   std::size_t n = number_size(large_modulus());
        std::uint64_t *r = reserve(n);
        std::memcpy(r, large_modulus(), n*sizeof(std::uint64_t));
        return confirm_size(r, n, n);
    }
    else return int_to_handle(small_modulus);
}
10891 
op(std::int64_t n)10892 inline std::intptr_t SetModulus::op(std::int64_t n)
10893 {   if (n <= 1) return (std::intptr_t)aerror1("Invalid arg to set-modulus",
10894                             int_to_handle(n));
10895     std::intptr_t r = value_of_current_modulus();
10896     small_modulus = n;
10897     if (n <= 0xffffffffU) modulus_size = modulus_32;
10898     else modulus_size = modulus_64;
10899     return r;
10900 }
10901 
op(std::uint64_t * n)10902 inline std::intptr_t SetModulus::op(std::uint64_t *n)
10903 {   if (Minusp::op(n))
10904         return (std::intptr_t)aerror1("Invalid arg to set-modulus", vector_to_handle(n));
10905     std::intptr_t r = value_of_current_modulus();
10906     std::size_t lenn = number_size(n);
10907     std::size_t bytes = (lenn+1)*sizeof(std::uint64_t);
10908     if (bytes > large_modulus_vector.size())
10909         large_modulus_vector.resize(bytes);
10910     std::memcpy(large_modulus_vector.data(), &n[-1], bytes);
10911     modulus_size = modulus_big;
10912     return r;
10913 }
10914 
op(std::int64_t a)10915 inline std::intptr_t ModularNumber::op(std::int64_t a)
10916 {   if (a >= 0)
10917     {   if (modulus_size == modulus_big) return int_to_handle(a);
10918         else return int_to_handle(a % small_modulus);
10919     }
10920     if (modulus_size == modulus_big) return Plus::op(large_modulus(), a);
10921     else
10922     {   a = a % small_modulus;
10923         if (a < 0) a += small_modulus;
10924         return int_to_handle(a);
10925     }
10926 }
10927 
// Reduce a bignum to its residue in [0, modulus). NOTE(review): this
// assumes the current modulus is big -- a bignum argument alongside a
// small modulus would misuse large_modulus(); presumably callers ensure
// that cannot arise. Confirm.
inline std::intptr_t ModularNumber::op(std::uint64_t *a)
{   if (Minusp::op(a))
    {   std::intptr_t r = Remainder::op(a, large_modulus());
// A negative remainder is shifted into range by adding the modulus.
        if (Minusp::op(r))
        {   std::intptr_t r1 = op_dispatch1<Plus,std::intptr_t>(r,
                               large_modulus());
            abandon(r);
            return r1;
        }
        else return r;
    }
    else return Remainder::op(a, large_modulus());
}
10941 
op(std::int64_t a,std::int64_t b)10942 inline std::intptr_t ModularPlus::op(std::int64_t a, std::int64_t b)
10943 {   std::uint64_t ua = a, ub = b;
10944 // Because a and b are 64-bit signed values and they should be positive,
10945 // their sum will fit within a 64-bit unsigned integer, but if the modulus
10946 // is large it could be just a 1-word bignum...
10947     if (modulus_size == modulus_big)
10948     {   std::uint64_t r = ua + ub;
10949         if (number_size(large_modulus()) == 1 &&
10950             r >= large_modulus()[0]) r -= large_modulus()[0];
10951         return unsigned_int_to_bignum(r);
10952     }
10953     std::uint64_t r = ua + ub;
10954     if (r >= small_modulus) r -= small_modulus;
10955     return int_to_handle(static_cast<std::int64_t>(r));
10956 }
10957 
// Modular sum of a fixnum and a bignum residue.
inline std::intptr_t ModularPlus::op(std::int64_t a, std::uint64_t *b)
{
// One of the inputs here is a bignum, and that can only be valid if we
// have a large modulus.
    if (modulus_size != modulus_big) return (std::intptr_t)aerror1("bad arg for modular-plus",
                vector_to_handle(b));
    std::intptr_t r = Plus::op(a, b);
// Assuming both inputs are already reduced, at most one subtraction of
// the modulus brings the sum back into [0, m).
    if (op_dispatch1<Geq,bool>(r, large_modulus()))
    {   std::intptr_t r1 = op_dispatch1<Difference,std::intptr_t>(r,
                           large_modulus());
        abandon(r);
        return r1;
    }
    else return r;
}
10973 
// Addition is commutative, so delegate to the (fixnum, bignum) case.
inline std::intptr_t ModularPlus::op(std::uint64_t *a, std::int64_t b)
{   return ModularPlus::op(b, a);
}
10977 
// Modular sum of two bignum residues; only valid with a big modulus.
inline std::intptr_t ModularPlus::op(std::uint64_t *a,
                                     std::uint64_t *b)
{   if (modulus_size != modulus_big)
        return (std::intptr_t)aerror1("bad arg for modular-plus",
                vector_to_handle(a));
    std::intptr_t r = Plus::op(a, b);
// Assuming the inputs were reduced, a single conditional subtraction of
// the modulus restores the range [0, m).
    if (op_dispatch1<Geq, bool>(r, large_modulus()))
    {   std::intptr_t r1 = op_dispatch1<Difference,std::intptr_t>(r,
                           large_modulus());
        abandon(r);
        return r1;
    }
    else return r;
}
10992 
// Modular difference of two fixnum residues.
inline std::intptr_t ModularDifference::op(std::int64_t a,
        std::int64_t b)
{   if (a >= b) return int_to_handle(a - b);
// a < b: the true difference is negative, so add the modulus back in.
    if (modulus_size == modulus_big) return Plus::op(large_modulus(),
                                                a - b);
    else return int_to_handle(small_modulus - b + a);
}
11000 
op(std::int64_t a,std::uint64_t * b)11001 inline std::intptr_t ModularDifference::op(std::int64_t a,
11002         std::uint64_t *b)
11003 {   if (modulus_size != modulus_big)
11004         return (std::intptr_t)aerror1("bad arg for modular-plus",
11005                 vector_to_handle(b));
11006     std::intptr_t r = Difference::op(b, a);
11007     std::intptr_t r1 = op_dispatch1<RevDifference,std::intptr_t>(r,
11008                        large_modulus());
11009     abandon(r);
11010     return r1;
11011 }
11012 
op(std::uint64_t * a,std::int64_t b)11013 inline std::intptr_t ModularDifference::op(std::uint64_t *a,
11014         std::int64_t b)
11015 {   if (modulus_size != modulus_big)
11016         return (std::intptr_t)aerror1("bad arg for modular-plus",
11017                 vector_to_handle(a));
11018     return Difference::op(a, b);
11019 }
11020 
op(std::uint64_t * a,std::uint64_t * b)11021 inline std::intptr_t ModularDifference::op(std::uint64_t *a,
11022         std::uint64_t *b)
11023 {   if (modulus_size != modulus_big)
11024         return (std::intptr_t)aerror1("bad arg for modular-plus",
11025                 vector_to_handle(a));
11026     if (Geq::op(a, b)) return Difference::op(a, b);
11027     std::intptr_t r = Difference::op(b, a);
11028     std::intptr_t r1 = op_dispatch1<RevDifference,std::intptr_t>(r,
11029                        large_modulus());
11030     abandon(r);
11031     return r1;
11032 }
11033 
11034 
// Modular product of two fixnum residues.
inline std::intptr_t ModularTimes::op(std::int64_t a, std::int64_t b)
{   switch (modulus_size)
    {   case modulus_32:
// Assuming both operands are reduced (so below 2^32), their product fits
// exactly in 64 bits.
            return int_to_handle(static_cast<std::uint64_t>(a)
                                 *static_cast<std::uint64_t>
                                 (b) % small_modulus);
        case modulus_64:
// Form the full 128-bit product and reduce it with a 128/64 division.
        {   std::uint64_t hi, lo, q, r;
            multiply64(static_cast<std::uint64_t>(a),
                       static_cast<std::uint64_t>(b), hi,
                       lo);
            divide64(hi, lo, small_modulus, q, r);
            return int_to_handle(r);
        }
        default:
        case modulus_big:
// Big modulus: multiply exactly then take the remainder.
        {   std::intptr_t w = Times::op(a, b);
            std::intptr_t r = op_dispatch1<Remainder,std::intptr_t>(w,
                              large_modulus());
            abandon(w);
            return r;
        }
    }
}
11059 
// Modular product where one operand is a bignum. NOTE(review): unlike
// ModularPlus this does not check modulus_size before using
// large_modulus() -- presumably callers guarantee a big modulus here;
// confirm.
inline std::intptr_t ModularTimes::op(std::int64_t a,
                                      std::uint64_t *b)
{   std::intptr_t w = Times::op(a, b);
    std::intptr_t r = op_dispatch1<Remainder,std::intptr_t>(w,
                      large_modulus());
    abandon(w);
    return r;
}
11068 
// Multiplication is commutative, so delegate to the (fixnum, bignum)
// case.
inline std::intptr_t ModularTimes::op(std::uint64_t *a,
                                      std::int64_t b)
{   return ModularTimes::op(b, a);
}
11073 
op(std::uint64_t * a,std::uint64_t * b)11074 inline std::intptr_t ModularTimes::op(std::uint64_t *a,
11075                                       std::uint64_t *b)
11076 {   std::intptr_t w = Times::op(a, b);
11077     std::intptr_t r = op_dispatch1<Remainder,std::intptr_t>(w,
11078                       large_modulus());
11079     abandon(w);
11080     return r;
11081 }
11082 
11083 
// Modular exponentiation is not yet implemented for any argument
// combination; all four overloads report an error.

inline std::intptr_t ModularExpt::op(std::int64_t a, std::int64_t b)
{   return (std::intptr_t)aerror("incomplete");
}

inline std::intptr_t ModularExpt::op(std::int64_t a, std::uint64_t *b)
{   return (std::intptr_t)aerror("incomplete");
}

inline std::intptr_t ModularExpt::op(std::uint64_t *a, std::int64_t b)
{   return (std::intptr_t)aerror("incomplete");
}

inline std::intptr_t ModularExpt::op(std::uint64_t *a,
                                     std::uint64_t *b)
{   return (std::intptr_t)aerror("incomplete");
}
11100 
11101 
// Modular division is not yet implemented for any argument combination;
// all four overloads report an error.

inline std::intptr_t ModularQuotient::op(std::int64_t a,
        std::int64_t b)
{   return (std::intptr_t)aerror("incomplete");
}

inline std::intptr_t ModularQuotient::op(std::int64_t a,
        std::uint64_t *b)
{   return (std::intptr_t)aerror("incomplete");
}

inline std::intptr_t ModularQuotient::op(std::uint64_t *a,
        std::int64_t b)
{   return (std::intptr_t)aerror("incomplete");
}

inline std::intptr_t ModularQuotient::op(std::uint64_t *a,
        std::uint64_t *b)
{   return (std::intptr_t)aerror("incomplete");
}
11121 
11122 
op(std::int64_t a)11123 inline std::intptr_t ModularMinus::op(std::int64_t a)
11124 {   if (a == 0) return int_to_handle(a);
11125     if (modulus_size == modulus_big)
11126         return Difference::op(large_modulus(), a);
11127     else return int_to_handle(small_modulus - a);
11128 }
11129 
// Modular negation of a bignum residue, m - a. A bignum argument is only
// meaningful when the modulus itself is big.
inline std::intptr_t ModularMinus::op(std::uint64_t *a)
{   if (modulus_size != modulus_big)
        return (std::intptr_t)aerror1("bad argument for modular-minus", vector_to_handle(a));
    return Difference::op(large_modulus(), a);
}
11135 
// Placeholder for the modular inverse when the modulus is a bignum; not
// yet implemented.
inline std::intptr_t general_modular_reciprocal(std::intptr_t a)
{   return (std::intptr_t)aerror("not coded yet");
}
11139 
// Modular inverse of a fixnum via the extended Euclidean algorithm, for
// the case of a word-sized modulus. Reports an error if aa and the
// modulus are not coprime.
inline std::intptr_t ModularReciprocal::op(std::int64_t aa)
{   if (aa <= 0) return (std::intptr_t)aerror1("bad argument to modular-reciprocal",
                             int_to_handle(aa));
    else if (modulus_size == modulus_big)
        return general_modular_reciprocal(int_to_handle(aa));
    std::int64_t a = small_modulus,
                 b = aa,
                 x = 0,
                 y = 1;
// Loop invariant: x*aa == a and y*aa == b (mod small_modulus), so when
// b reaches 1 the coefficient y is the inverse of aa.
    while (b != 1)
    {   std::uint64_t w, t;
        if (b == 0)
            return (std::intptr_t)aerror2("non-prime modulus in modular-reciprocal",
                    int_to_handle(small_modulus),
                    int_to_handle(aa));
        w = a / b;
// t round-trips signed values through uint64_t; the conversions are
// value-preserving modulo 2^64.
        t = b;
        b = a - b*w;
        a = t;
        t = y;
        y = x - y*w;
        x = t;
    }
// NOTE(review): with a modulus at or above 2^63 these signed variables
// could overflow -- confirm whether modulus_64 values that large can
// reach this path.
    if (y < 0) y += small_modulus;
    return int_to_handle(y);
}
11166 
// Modular inverse of a bignum argument: only sensible with a big modulus,
// and the general routine is not yet implemented.
inline std::intptr_t ModularReciprocal::op(std::uint64_t *a)
{   return general_modular_reciprocal(vector_to_handle(a));
}
11170 
11171 #endif // CSL
11172 
11173 } // end of namespace arithlib_implementation
11174 
11175 // I want a namespace that the user can activate via "using" that only
11176 // gives access to things that ought to be exported by this library. So
11177 // arithlib_implementation is to be thought of as somewhat low level and
11178 // private, while just plain arithlib may be enough for the typical C++
11179 // user who is just going to be using the "Bignum" type.
11180 //
11181 // [The issue of whether I have everything I need included in this list
11182 //  remains uncertain, however a user can either add to the section here
11183 //  or use the arithlib_implementation namespace directly in case of upset]
11184 
11185 namespace arithlib
11186 {
11187 using arithlib_implementation::operator"" _Z;
11188 using arithlib_implementation::Bignum;
11189 
11190 using arithlib_implementation::mersenne_twister;
11191 using arithlib_implementation::reseed;
11192 using arithlib_implementation::uniform_uint64;
11193 using arithlib_implementation::uniform_positive;
11194 using arithlib_implementation::uniform_signed;
11195 using arithlib_implementation::random_upto_bits_bignum;
11196 
11197 using arithlib_implementation::display;
11198 using arithlib_implementation::fix_bignum;
11199 }
11200 
11201 // I am putting in names that CSL uses here...
11202 
11203 namespace arithlib_lowlevel
11204 {
11205 using arithlib_implementation::Plus;
11206 using arithlib_implementation::Difference;
11207 using arithlib_implementation::RevDifference;
11208 using arithlib_implementation::Times;
11209 using arithlib_implementation::Quotient;
11210 using arithlib_implementation::Remainder;
11211 using arithlib_implementation::Divide;
11212 using arithlib_implementation::Gcd;
11213 using arithlib_implementation::Lcm;
11214 using arithlib_implementation::Logand;
11215 using arithlib_implementation::Logor;
11216 using arithlib_implementation::Logxor;
11217 using arithlib_implementation::Logeqv;
11218 using arithlib_implementation::Zerop;
11219 using arithlib_implementation::Onep;
11220 using arithlib_implementation::Minusp;
11221 using arithlib_implementation::Evenp;
11222 using arithlib_implementation::Oddp;
11223 using arithlib_implementation::Eqn;
11224 using arithlib_implementation::Neqn;
11225 using arithlib_implementation::Geq;
11226 using arithlib_implementation::Greaterp;
11227 using arithlib_implementation::Leq;
11228 using arithlib_implementation::Lessp;
11229 using arithlib_implementation::Add1;
11230 using arithlib_implementation::Sub1;
11231 using arithlib_implementation::Minus;
11232 using arithlib_implementation::Abs;
11233 using arithlib_implementation::Square;
11234 using arithlib_implementation::Isqrt;
11235 using arithlib_implementation::Lognot;
11236 using arithlib_implementation::Pow;
11237 using arithlib_implementation::LeftShift;
11238 using arithlib_implementation::RightShift;
11239 using arithlib_implementation::IntegerLength;
11240 using arithlib_implementation::Low_bit;
11241 using arithlib_implementation::Logbitp;
11242 using arithlib_implementation::Logcount;
11243 using arithlib_implementation::Float;
11244 using arithlib_implementation::Double;
11245 using arithlib_implementation::Frexp;
11246 #ifdef CSL
11247 using arithlib_implementation::ModularPlus;
11248 using arithlib_implementation::ModularDifference;
11249 using arithlib_implementation::ModularTimes;
11250 using arithlib_implementation::ModularExpt;
11251 using arithlib_implementation::ModularQuotient;
11252 using arithlib_implementation::ModularMinus;
11253 using arithlib_implementation::ModularReciprocal;
11254 using arithlib_implementation::ModularNumber;
11255 using arithlib_implementation::SetModulus;
11256 #endif // CSL
11257 
11258 using arithlib_implementation::bignum_to_string;
11259 using arithlib_implementation::bignum_to_string_length;
11260 using arithlib_implementation::bignum_to_string_hex;
11261 using arithlib_implementation::bignum_to_string_hex_length;
11262 using arithlib_implementation::bignum_to_string_octal;
11263 using arithlib_implementation::bignum_to_string_octal_length;
11264 using arithlib_implementation::bignum_to_string_binary;
11265 using arithlib_implementation::bignum_to_string_binary_length;
11266 
11267 using arithlib_implementation::round_double_to_int;
11268 using arithlib_implementation::trunc_double_to_int;
11269 using arithlib_implementation::floor_double_to_int;
11270 using arithlib_implementation::ceiling_double_to_int;
11271 
11272 #ifdef softfloat_h
11273 using arithlib_implementation::Float128;
11274 using arithlib_implementation::Frexp128;
11275 using arithlib_implementation::round_float128_to_int;
11276 using arithlib_implementation::trunc_float128_to_int;
11277 using arithlib_implementation::floor_float128_to_int;
11278 using arithlib_implementation::ceiling_float128_to_int;
11279 // These next few are just raw float128_t operations.
11280 using arithlib_implementation::f128_0;
11281 using arithlib_implementation::f128_half;
11282 using arithlib_implementation::f128_mhalf;
11283 using arithlib_implementation::f128_1;
11284 using arithlib_implementation::f128_m1;
11285 using arithlib_implementation::f128_zero;
11286 using arithlib_implementation::f128_infinite;
11287 using arithlib_implementation::f128_nan;
11288 using arithlib_implementation::frexp;
11289 using arithlib_implementation::ldexp;
11290 using arithlib_implementation::modf;
11291 #endif // softfloat_h
11292 
11293 //using arithlib_implementation::negative;
11294 //using arithlib_implementation::number_size;
11295 
11296 using arithlib_implementation::cast_to_float;
11297 }
11298 
11299 #endif // __arithlib_hpp
11300 
11301 // end of arithlib.hpp
11302