1 // Big Number arithmetic. A C Norman, 2019-21
2
3 // To use this, go "#include "arithlib.hpp".
4
5 #ifndef __arithlib_hpp
6 #define __arithlib_hpp 1
7
8 // To do:
9 // Write full documentation! [Meanwhile there is a reasonably extended
10 // commentary included as comments here, and a file arithtest.cpp that
11 // can accompany it and illustrate its use]
12
13 /**************************************************************************
14 * Copyright (C) 2019-21, Codemist. A C Norman *
15 * *
16 * Redistribution and use in source and binary forms, with or without *
17 * modification, are permitted provided that the following conditions are *
18 * met: *
19 * *
20 * * Redistributions of source code must retain the relevant *
21 * copyright notice, this list of conditions and the following *
22 * disclaimer. *
23 * * Redistributions in binary form must reproduce the above *
24 * copyright notice, this list of conditions and the following *
25 * disclaimer in the documentation and/or other materials provided *
26 * with the distribution. *
27 * *
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
30 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS *
31 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE *
32 * COPYRIGHT OWNERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, *
33 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, *
34 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
35 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
36 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR *
37 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF *
38 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH *
39 * DAMAGE. *
40 *************************************************************************/
41
42
43
44 // There are quite a lot of bignumber packages out there on the web,
45 // but none of them seemed to be such that I could readily use them
46 // for arithmetic within a Lisp at all easily, for instance because of
// the storage management arrangements used.
48 //
49 // This code is for use as a header-only library, so just "#include" it
50 // to be able to use it. All the use of the word "inline" is so that if
51 // you #include this from several source files there will not be trouble when
52 // you link them together: you should still only end up with one copy of
53 // each function in memory. Note that until a few years ago the expectation
54 // was that tagging a function as "inline" was advice to the compiler to
55 // merge its definition into the body of functions that called it. When
56 // that is to be done one wants to put the function definition in a header
57 // file so that all compilation units gain access to it. If it was made
// "static" there would risk being a compiled copy included along with
// every compilation unit, while if it was "extern" it could seem to be
60 // multiply defined. Use of "inline" resolves that, leaving just one
61 // copy in the eventual linked binary. From C++17 onwards data as well as
62 // code can be tagged "inline" in this sense. Until I can be confident that
63 // everybody will use a compiler that supports C++17 I need to go through
64 // quite odd-looking steps so that when this header is included from several
65 // compilation units I do not end up with multiply-defined entities.
66 //
67 // This code uses 64-bit digits and a 2s complement representation for
68 // negative numbers. This means it will work best on 64-bit platforms
69 // (which by now are by far the most important), and it provides bitwise
70 // logical operations (logand and logor) as well as arithmetic. It will work
71 // best where the C++ compiler supports a 128-bit integral type, but does not
72 // need that. It should work on 32-bit systems as well, although one should
73 // expect performance there to be lower.
74 //
75 // The code here tried to arrange that any operations that might overflow are
76 // done using unsigned types, because in C++ overflow in signed arithmetic
77 // yields undefined results - ie on some machines the values delivered could
78 // be quite unrelated to the desired ones. This means that I do plenty of
79 // arithmetic rather as
80 // int c = (int)((unsigned)a + (unsigned)b);
81 // and I rely on the result being as would be seen with natural 2s complement
82 // working. From C++20 onwards this is likely to be guaranteed by the standard,
// but at present it is not, so although this code will work on almost all
// known machines, when judged against the standard it is at best relying on
// implementation defined behaviour and avoiding undefined behaviour.
86 //
87 // If "softfloat_h" is defined I will suppose that there is a type "float128"
88 // available and I will support conversions to and from that. In part because
89 // this type is not supported in any standard way I will be assuming that the
90 // version I use is as provided by the SoftFloat IEEE Floating-Point Arithmetic
91 // package released by John R. Hauser. That does everything in software so
92 // absolute speed may be modest, but it is nicely portable. For more
93 // information see http://www.jhauser.us/arithmetic/SoftFloat.html.
94 //
95 // I will provide two levels of access and abstraction. At the low level
// a big number is represented as an array of uint64_t digits along with
97 // a size_t value that indicates the number of digits in use. The most
98 // significant digit in any number lives in memory with type uint64_t but
99 // is treated as signed (ie int64_t) in the arithmetic. For the purposes
100 // of the bitwise operations (and, or, xor and not) negative values are
101 // processed as if they had an infinite number of 1 bits above their
102 // most significant stored digit.
103 // If a positive value has a top digit whose unsigned value has its top bit
104 // set then an additional zero digit is used ahead of that, and equivalently
105 // for negative values.
106 //
107 // Vectors to represent numbers are allocated using a function reserve()
108 // which takes an argument indicating how long the number might be. It will
109 // often be necessary to reserve memory in a conservative manner, ie to
110 // allocate more memory than will eventually prove to be needed.
111 // At the end of an arithmetic operation a reserved block of memory can be
112 // passed to abandon() when it is no longer required, or there can be a
113 // call to confirm_size() to establish the exact size that is to be retained.
114 // A variant call confirm_size_x() is used when the vector whose size is being
115 // confirmed is not the one that was most recently allocated: the intent
116 // there was that under some schemes discarding or shortening the most
117 // recently allocated item might be especially easy to implement well.
118 // confirm_size() returns a handle for the vector, not the vector itself.
119 // Depending on build options it is also possible that small integers will be
120 // represented almost directly: such cases will be referred to as fixnums.
121 // So for the benefit of higher levels of abstraction every number is stored
// using a "handle", where the handle can be tested to see if it holds the
123 // value of the number within itself as a fixnum or whether it is a pointer
124 // to a vector of digits. While the code here does not absolutely mandate it,
125 // the expectation is that all vectors will be allocated at addresses that are
126 // a multiple of sizeof(uint64_t) and that means that some low bits in a
127 // handle are available to tag fixnums.
128 //
129 // In addition to numbers I will generate strings (I have code to make a
130 // string representation of a number, with hex, decimal, octal and binary
131 // variants). Space for the string will be allocated using reserve_string()
132 // and finalized using confirm_size_string(), with both of those indicating
133 // sizes in bytes. Note that when you reserve or confirm string size the
134 // length quoted is the number of characters that will be present excluding
135 // any header or termination markers - reserve_string() will allow for the
136 // needs of suchlike.
137 //
138 // A higher level packaging represents numbers using a class Bignum. This
139 // has just one field which will hold a potentially encoded version of a
140 // pointer to a vector that is the number (ie a handle). When the handle
141 // identified a vector the first item in the vector will be a header word
142 // containing length information. That is followed by the uint64_t digits
143 // representing the numeric value. The representation of the header and the
144 // encoding of handles can be configured in one of several ways, these being
145 // intended to provide models of the implementation intended for different
146 // use cases.
147 //
148 // Overall the code has conditional compilation providing for 3 prototype
149 // arrangements. These are MALLOC, NEW and LISP. It is envisaged that some
150 // users of this code will need to modify it to allow it to interface with the
151 // rest of their software, and these three schemes give at least sketches of
// various possibilities. The short explanation is that MALLOC uses malloc()
153 // and free() for memory management and does not use fixnums, so that all
154 // numbers (however small) are stored as vectors. This is perhaps the simplest
155 // scheme, if not the highest performance. NEW exploits many more C++ features.
156 // Storage management uses "new" and "delete" at the lowest level, but the
157 // code keeps its own lists of previously used memory blocks in a manner that
158 // greatly reduces the call on C++ memory management work. This version stores
159 // handles that refer to vectors as even numbers and ones that are fixnums
160 // with their bottom bit set, so fixnums are 63 bits wide. The C++ class
161 // Bignum and a range of operator overloads lead to this being a simple
162 // version to use for casual C++ code, and it is the default version built.
163 // LISP is the version that originally motivated me to implement this. It has
164 // a subsidiary configuration option that allows for systems where garbage
165 // collection is or is not conservative. This could be a good starting point
166 // for a bignum system to be used as part of the run-time system for any
167 // language, not just Lisp. However the interface code here is liable to need
168 // detailed review and revision since it mediates between the data structures
169 // used here and whatever is present in the Lisp (or whatever!) that will
170 // use it. I initially developed and tested this using a Lisp called "vsl"
171 // and intend to migrate it for use in "csl". Both of these use low-bit
172 // tagging of data and the precise values for tag bits and their layout
173 // within header words has to be adhered to here, as has the Lisp's ideas
174 // about the way that header words are stored.
175 //
176 // Here is some more information about each scheme:
177 //
178 // MALLOC:
179 // A bignum with n digits is held in a vector of length n+1, and the
180 // "encoded pointer" to it is a native pointer to the second element.
181 // If this pointer is p then the number of words (n) is kept at p[-1]
182 // and the least significant digit of the number is at p[0]. reserve()
183 // uses malloc() to obtain space. confirm_size() uses realloc() to trim
184 // the allocated space, and abandon() maps onto use of free(). This
185 // uses C rather than C++ memory management because it wants to use realloc
186 // which is not supported in the tidy C++ world. Performance of the code
187 // as a whole will be sensitive to the malloc/realloc/free implementation
188 // on the platform that is in use. To allow for a user who wished to
189 // customize allocation, all calls to the basic memory allocation primitives
190 // are made indirectly so that alternative equivalents can be plugged in.
// Strings are allocated using malloc() and returned as simple nul-terminated
192 // C strings. They must be released using free() after use.
193 //
194 // NEW:
195 // A bignum with n digits will be stored in a vector whose size is the
196 // next power of 2 strictly larger than n. As with the MALLOC case
197 // the numeric data starts one word into this block and the initial word
198 // of the block contains a header of length information. Here the header
199 // is split into two 32-bit parts. One contains the length of the number
200 // as before (but note that in general that will not fill up the entire
201 // memory block), the other contains log2(block_size), ie it is a compact
202 // indication of the size of the block. There will be free-chains for
203 // blocks of size 2,4,8,... so that abandon() just pushes the released
204 // memory onto one and reserve() can often merely retrieve a previously
205 // used block. In most cases confirm_size just needs to write the actual
206 // length of a number into the header word. When two large numbers are
207 // subtracted the number of digits in the result may be much smaller than
208 // the size of block that had to have been reserved. To allow for that sort
209 // of situation confirm_size() reserves the right to notice cases where used
210 // size in a block is much smaller than the capacity, and in such cases
211 // allocate a fresh smaller block and copy information into it, allowing it
212 // to abandon the overlarge chunk of memory.
213 // The reference to the vector of digits is held using type intptr_t and
214 // can be cast to obtain the address of the least significant digit of the
215 // value. But so that this scheme as a whole provides better performance
216 // for general users, small integer values will be handled specially. If
// the "encoded pointer" has its bottom bit set then it represents a 63-bit
218 // signed value. The intent here is that the class Bignum, by containing
219 // just one integer-sized field, can be stored and passed around really
220 // efficiently, and if in its use most arithmetic remains on values that
221 // fit within 63 bits it will not do much storage allocation at all. If this
222 // works well it should represent a reasonably convenient and tolerably
223 // efficient C++ facility for general use.
// Strings live in store allocated using "new char[nnn]" and are returned
// as C style strings that must be disposed of using "delete[]". I use
// C rather than C++ style strings because I hope that makes storage
// management interaction clearer.
228 // In this case there is an extra option DEBUG_OVERRUN which enables some
229 // simple checks for memory block overflow. reserve() always arranges that
230 // there will be a "spare" word just beyond the top used word in a vector,
231 // and it initializes this to a slightly unlikely value. When confirm_size
232 // or abandon() are used it can then verify that this guard word has not
// been corrupted. This may not be 100% foolproof but it nevertheless helps
234 // while developing or maintaining the library!
235 //
236 // LISP:
237 // The arrangements here are based on the arrangements I have in my VSL
238 // and CSL Lisp implementations. I still hope that between the above options
239 // and this one the code can be adapted reasonably easily. As before the
240 // basic representation of a number with n digits is a vector of length
241 // n+1, with the initial word containing a header. In VSL/CSL a header word
// contains some tag bits identifying it as a header, then some type
// bits that here will indicate that it is a header of a big number. Finally
244 // it contains a length field. The exact bit-patterns and packing here will
245 // be specific to the particular Lisp (obviously!). A reference to a big
246 // number will be the address of the header word of this vector plus some
// tag bits in the bottom 3 bits. This "low tagging" relies on all blocks
248 // of memory being aligned to 8-byte boundaries (even on 32-bit platforms).
249 // On a 32-bit system the header will only occupy the first 32-bits of the
250 // initial 64-bit memory unit, and the second 32-bit region is spare and
251 // would be best set to zero.
252 // There are two expectations about memory management. The first is that
253 // garbage collection has left a large block of contiguous memory within
254 // which new material can be allocated linearly. Under this supposition the
255 // most recently allocated block of memory can be shrunk or discarded by
256 // simply resetting a heap-fringe pointer. The second is that it will
257 // at least occasionally be desirable to perform linear scans of all memory.
258 // To support that when a block that is not the most recently allocated one
259 // is shrunk or discarded a header word is placed in the released space
260 // leaving a valid but dummy Lisp item there.
// Those issues motivate the distinction between confirm_size and
262 // confirm_size_x. [Note that the implementation may not (yet) do all that
263 // I intended in that respect!]
264 // Usually calls to memory allocation primitives are made without any special
265 // concern for garbage collector safety of other pointers, and so in its
266 // current form this code insists on running in a context where the garbage
267 // collector is conservative, so that for instance the untagged pointers to
268 // raw vectors of digits are safe. This aspect of the code may well limit
// its direct usefulness. So to allow for a system that uses a precise
270 // garbage collector I allow for a "#define PRECISE_GC 1" option and in
271 // that case whenever calls to memory allocation are performed within the
272 // low-level code I will use functions "push()" and "pop()" and
273 // expect that they save values somewhere that the garbage collector can
274 // find. Note that this scheme does not automate keeping large bignum
275 // calculations expressed via operator overloading safe! It is mostly aimed
276 // at making the low level code usable. A typical case where push() and
277 // pop() are needed will be the code to multiply two big numbers. That can
278 // work out how large its result will be and then needs to call reserve()
279 // to get space for it. Across the call to reserve it will need to push
280 // its two arguments, because a copying garbage collector might relocate
281 // them.
282 // Further though: having "push and pop" suggests that a potentially large
283 // number of items might need to be saved. I suspect that the most I can
284 // ever encounter here will be perhaps 3. So having a fixed number of
// dedicated list bases to stash things in might be nicer.
286 // In Lisp mode it is anticipated that as well as a tagged representation
// of small integers that the generic arithmetic will need to support
288 // floating point numbers (and possibly multiple widths of floating point
289 // values, plus ratios and complex numbers) and so the dispatch on tagged
// numbers needs to live at a higher level within the Lisp than just this
291 // code. Thus while the big-number functions here are set up so they can
292 // return fixnum results and while there are entrypoints for performing
293 // arithmetic between bignums and fixnums (ie between uint64_t * and
294 // int64_t values) it is the responsibility of somebody else to decide which
295 // functions to call when.
296 // Strings are allocated using reserve_string() and finalized using
297 // confirm_size_string. For Lisp purposes they will need to have a header
298 // word containing the string length.
299 //
300 // It might be helpful if I provide my own thoughts about when you might decide
301 // to use this code and when you will probably not. Wikipedia lists rather
302 // a large number of arbitrary precision arithmetic packages on the web page
303 // en.wikipedia.org/wiki/List_of_arbitrary-precision_arithmetic_software.
// As well as free-standing libraries a range of programming languages feature
305 // big-number arithmetic as a standard feature. It may be fair to suggest
306 // that for use from C++ the most visible option is GMP with some users liking
307 // to use it via Boost. Given a view that GMP is the market leader I will
308 // set out some points comparing arithlib with it.
309 // First GMP is well established, it aims for top performance, it has fast
310 // algorithms for huge arithmetic as well as for sane-sized numbers. In
311 // contrast arithlib is new and neither well established nor truly heavily
312 // tested. It does not even try to provide algorithms that will only become
313 // useful for arithmetic on numbers that are many many thousands of digits
314 // (eg FFT-style multiplication). It can thus be expected to be generally
315 // slower than GMP.
316 // However potential advantages of arithlib are
317 // (1) It is subject to a permissive BSD license, while GMP is dual licensed
318 // under LGPL3 or GPL2. For some users or some projects this may matter!
319 // (2) Rather than having assembly code versions for a wide but finite range
320 // of particular hosts, arithlib follows the "Trust your Compiler" policy
321 // and expects that a sufficiently modern C++ compiler will manage to
322 // generate code close to the performance of all the hand-optimised
323 // assembly code that GMP uses. This reduces the total size of the
324 // package substantially and makes building/installing/using it especially
325 // easy even when a packaged version is not instantly available for
326 // your machine.
327 // (3) By being a header-only library, arithlib imposes a cost at program
328 // build time as it all has to be processed by the compiler - but these
329 // days compile-times are pretty short anyway. And by having all of
// its source code available when code that uses it is built there are
331 // potential whole-program optimisations that can be made.
332 // (4) arithlib is coded in C++ not C, and this allows it to leverage features
333 // of C++11. For instance it can rely on the random number generation
334 // facilities that C++ provides rather than needing to implement its
335 // own. There are places within it where template code leads to a neater
336 // implementation, and the operator overloading scheme that various other
337 // C++ arithmetic packages provide fits in especially naturally.
338 // (5) My initial motivation for creating arithlib was a need for a big
339 // arithmetic package to form part of the run-time system of a language
340 // implementation. arithlib was built with a view to keeping much of the
341 // memory allocation and management somewhere else, probably supported
342 // by garbage collection. I found it much harder to see how to arrange
343 // that the garbage collector in the rest of my run-time system could
344 // track the memory usage within GMP.
345 // (6) While arithlib is not a totally tiny body of code it is smaller and
346 // simpler than GMP. When its capabilities cover what is needed and when
347 // its speed is sufficient I would suggest that "small and tidy is good".
348 //
349 // A key use-case that arithlib is NOT set up to support is arithmetic on
350 // long but fixed precision numbers - this is liable to mean that it will
// not be the technology of choice for a range of cryptography applications!
352 // The code here has been tested and runs on both 32 and 64-bit machines,
// however its internal workings are almost all expressed in terms of the
354 // type "uint64_t". This may result in there being significant scope for
355 // better specialization for code to run on 32-bit targets.
356 //
// What about thread safety? Well the code should be thread-safe. With
// the C++ "NEW" option I provide several options and you need to
359 // configure one (at compile time, and by editing this file or adding
360 // overriding predefined symbols), choosing the one you like or that
361 // will run fastest on your platform. Search for "CONCURRENCY SUPPORT" to
362 // find the commentary.
363 //
364 // I have run some VERY SIMPLISTIC benchmark comparisons between this code
365 // and gmp. All that has been tested is the low-level code for multiplying
// a pair of N word unsigned integers to get a 2N word result.
367 // On x86_64 and compiling with "g++ -O3" I believe that this test suggests
368 // that up to around 100 words (ie 6400 bits, 2000 decimal digits) the speed
369 // ratio between gmp and arithlib is in the range 1 to 2 (on Ubuntu Linux or
370 // Windows 10 (64-bit)).
371 // Beyond that the use of "TOOM3" by gmp leads to it gradually picking
372 // up advantages, reaching about a factor somewhat over 3 at around 1500
373 // words (100000 bits, 30000 decimal digits).
374 // This benchmark only tests multiplication of equally sized numbers and
// its results will vary noticeably across platforms, and so it is not liable
376 // to be representative of overall results including mixes of all the
377 // operations on mixed-size numbers, but at least it shows something!
378 // For large enough inputs I believe I see multiplication being about 4 times
379 // as costly on a 32-bit platform as on a 64-bit one, a result that is perhaps
380 // no cause for great astonishment!
381
382 // Those concerned with programming style may be minded to complain about the
383 // fairly large numbers of casts in this code. So here is an attempt to
384 // explain some major causes.
385 // First, in C++ signed arithmetic overflow is undefined behaviour. However I
386 // want to be able to tell when it would arise, and I want to be able to do
387 // a great deal of my big-number implementation using all 64-bits of a wide
388 // integer. Thus both overflow tests and a great deal of the inner arithmetic
// are done using unsigned arithmetic, but both when dealing with negative
390 // bignums and when interacting with the user I need to have signed integers.
// I frequently cast between signed and unsigned 64-bit values in these
392 // contexts. The code ASSUMES that arithmetic is twos complement. Note that
393 // from C++20 onwards this ceases to be an assumption and is guaranteed by
394 // the standard!
395 // Secondly in many cases I will represent a big number as an object whose
// sole data member is of type std::uintptr_t. This will be interpreted as
397 // a value with its low bit or few bits as tag information and upper bits
398 // as either a pointer (to a vector of digits) or an immediate integer value.
399 // I am ASSUMING that one or more low bits in the representation of a pointer
400 // to an aligned item will be zero in the natural pointer, and that I can
401 // force information into them for tagging. Even though this is not entirely
402 // proper I feel that eg std::align() would not make sense if this was not
403 // going to work! Whatever the risks, using this sort of representation leads
// to many casts between std::uint64_t* and std::uintptr_t, and the
405 // unpacking of a signed value from the immediate integer case leads to
406 // further casts with signed integers, ie std::intptr_t.
// Finally I need at times to generate a bit-field using code such as
408 // (1<<n). If the literal "1" is not introduced using a wide enough type
409 // this can overflow. I might try (1LLU<<n) but I have no absolute guarantee
410 // that "LL" makes a 64-bit integer. I could use UINT64_C(1) and that is not
411 // too bad, but in many places I will in fact write the rather wordy but very
412 // explicit (static_cast<std::uint64_t>(1)<<n).
413
414
415 // I provide a default configuration, but by predefining one of the
416 // symbols allow other versions to be built.
417
#if !defined MALLOC && !defined NEW && !defined LISP
// If no explicit options have been set I will build using memory
// allocation via C++ "new".

#define NEW 1
#define DEBUG_OVERRUN 1 // Overhead is not huge: watching for error is good.

#endif
426
#ifdef __cpp_inline_variables

// For versions of C++ up to C++17 I will put constant values in header
// files using something along the line of "static const int VAR = VAL;".
// This should give the compiler a chance to replace the name with its value
// throughout the compilation unit, and if the compiler is clever enough it
// will avoid leaving a word of memory with the value stored if all uses
// have been dealt with more directly. However it will tend to lead to a
// lot of "static variable defined but not used" warnings.
// From C++17 onwards (and C++ mandates the __cpp_inline_variables macro to
// indicate if the feature is in place) I will use
// inline const int VAR = VAL;
// and now if memory is allocated for the variable it will only be allocated
// once, and I hope that compilers will not feel entitled to moan about
// cases where there are no references. I will only use this idiom for things
// that are at least "almost" constant so that in the case that the variables
// end up static and there are different copies in each compilation unit
// it should not cause confusion.

#define INLINE_VAR inline
#define HAVE_INLINE_VARS 1
#else
#define INLINE_VAR static
#endif
451
452
// Another useful C++17 feature.... with a fallback to a GNU-specific
// way of achieving the same through use of C++11 annotations. And a final
// fall back to just not worrying.
//
// NB: the portable way to probe for an attribute is the function-like
// macro __has_cpp_attribute(maybe_unused), itself guarded by a check that
// __has_cpp_attribute exists. Testing a plain identifier such as
// __has_cpp_attribute_maybe_unused never succeeds, since no compiler
// defines a macro of that name, and would leave the C++17 branch dead.

#ifndef MAYBE_UNUSED
#ifdef __has_cpp_attribute
#if __has_cpp_attribute(maybe_unused)
#define MAYBE_UNUSED [[maybe_unused]]
#endif
#endif
#ifndef MAYBE_UNUSED
#if defined __GNUC__
#define MAYBE_UNUSED [[gnu::unused]]
#else
#define MAYBE_UNUSED
#endif
#endif
#endif
466
467 #include <cstdio>
468 #include <cstring>
469 #include <cstdint>
470 #include <cmath>
471 #include <cfloat>
472 #include <cctype>
473 #include <cinttypes>
474 #include <cstdlib>
475 #include <cstdarg>
476 #include <cassert>
477 #include <random>
478 #include <iostream>
479 #include <iomanip>
480 #include <thread>
481 #include <ctime>
482 #include <chrono>
483 #include <utility>
484 #include <string>
485 #include <chrono>
486 #include <mutex>
487 #include <atomic>
488 #include <vector>
489 #include <type_traits>
490
491 namespace arithlib_implementation
492 {
493
#ifdef CSL
// For use within CSL I will provide a single thread-local pointer that
// can be accessed fast even on Windows. All values that are to be
// thread_local within this library can (eventually) be migrated to live in
// a chunk of memory referenced by this.
//
// The data involved will be
// (1) Information related to the worker threads for Karatsuba multiplication.
// (2) Information about the modulus used with modular arithmetic.
// Note that the allocation of memory when that is done within this library
// rather than outside it may also need to be thread local if the library is
// used from a threaded application, but that does not use the special
// treatment indicated just here.

// While this pointer is defined here it is not at present used! It is for
// future work.

#define TL_arithlibData 48
DEFINE_INLINE_THREAD_LOCAL(void *, arithlibData);
#endif // CSL
514
515 // A scheme "arithlib_assert" lets me write in my own code to print the
516 // diagnostics. To use this version you must include arithlib.hpp after
// any other header files that define assert or abort macros.
518
519 // I implement things here using #define because I want to exploit
520 // __FILE__ and __LINE__ to report where issues arose.
521
// Standard two-level stringification: STRINGIFY1 does the raw "#" paste,
// and the extra level of expansion in STRINGIFY means that a macro
// argument (eg __LINE__) is replaced by its value before being turned
// into a string literal.
#define STRINGIFY1(x) #x
#define STRINGIFY(x) STRINGIFY1(x)
524
525 // arithlib_abort() mainly exists so I can set a breakpoint on it! Setting one
526 // on the system abort() function sometimes does not give me as much help
527 // as I might have hoped on at least some platforms, while a break-point
528 // on abort() does what I expect. But also the version I have here explains
529 // where it was called from nicely. The location gets passed in as a string
530 // and in a multi-file multi-thread world that could be "interesting", and
531 // so I will have mutex protection for the failure message (getting two
532 // aborts at once would be horrid, and I will be happy if ONE of the reports
// comes out neatly!). The function diagnostic_mutex returns a reference
// to a mutex that can be used to protect things, and there will be just
// one of it. It also sets its argument to a reference to a const char *
536 // pointer that will be used for transmitting the location information.
537
538 #ifdef HAVE_INLINE_VARS
539 inline std::mutex my_diagnostic_mutex;
540 inline static const char *my_diagnostic_location;
541 #endif // HAVE_INLINE_VARS
542
// Return (a reference to) the single mutex that serializes diagnostic
// output, and deposit in *where the address of the shared const char *
// slot used to pass location information to abort1().
inline std::mutex &diagnostic_mutex(const char ***where)
{
#ifndef HAVE_INLINE_VARS
// Without C++17 inline variables a function-local static is used instead:
// C++11 guarantees a single instance across all compilation units that
// include this header, constructed before control first flows past the
// definition. That guarantee can cost a synchronized initialization check
// on each call, which is why the inline-variable form (declared above) is
// preferred when available.
    static std::mutex my_diagnostic_mutex;
    static const char *my_diagnostic_location;
#endif // !HAVE_INLINE_VARS
// Hand back the address of the location slot together with the mutex.
    *where = &my_diagnostic_location;
    return my_diagnostic_mutex;
}
562
abort1(const char * msg)563 [[noreturn]] inline void abort1(const char *msg)
564 { const char **where;
565 // The call to diagnostic_mutex here is just to retrieve the location of the
566 // const char * variable that holds the location. I am already within
567 // the scope of a mutex.
568 static_cast<void>(diagnostic_mutex(&where));
569 std::cout << std::endl << "About to abort at " << *where << ": "
570 << msg << std::endl;
571 std::abort();
572 }
573
abort1()574 [[noreturn]] inline void abort1()
575 { const char **where;
576 static_cast<void>(diagnostic_mutex(&where));
577 std::cout << std::endl << "About to abort at " << *where << std::endl;
578 std::abort();
579 }
580
// This variation on an abort() macro establishes a lock_guard so that only
582 // one part of the code can be aborting at any one time, and sets up
583 // information about the file and line where trouble arose. It could cope
584 // with arbitrary overloads of abort1() and the fact that the location
585 // information is not passed as an extra argument to abort1() is because
586 // of limitations in __VA_ARGS__ in portable code until C++2a.
587
// because arithlib_abort is a macro it does not live in any particular
589 // namespace
590
591 #define arithlib_abort(...) \
592 { const char **where; \
593 std::lock_guard<std::mutex> lock( \
594 arithlib_implementation::diagnostic_mutex(&where)); \
595 *where = __FILE__ " line " STRINGIFY(__LINE__); \
596 arithlib_implementation::abort1(__VA_ARGS__); \
597 }
598
// The following variable (well constant) enables "assert" checking. The
600 // effect might be a (probably rather modest) slowdown. However the predefined
601 // macro __OPTIMIZE__ will be set up if any g++ optimizations are in force, so
602 // here I only activate assertions in the case of compilation subject to
603 // "-O0" which will often be associated with "-g".
604 //
605 // However the user can override this by predefining FORCE_DEBUG_ARITH to
// encourage debugging or FORCE_NO_DEBUG_ARITH to discourage it.
607
608 // Note I make the flags static not inline in case compilation flags for
609 // different files in a multi-file project differ.
610
611 #if (defined __OPTIMIZE__ && !defined FORCE_DEBUG_ARITH) || \
612 defined FORCE_NO_DEBUG_ARITH
613 static const bool debug_arith = false;
614 #else // __OPTIMIZE__
615 static const bool debug_arith = true;
616 #endif // __OPTIMIZE__
617
618 template <typename F>
assert1(bool ok,const char * why,F && action,const char * location)619 inline void assert1(bool ok, const char *why,
620 F&& action, const char *location)
621 {
622 // Use this as in:
623 // arithlib_assert(predicate, [&]{...});
624 // where the "..." is an arbitrary sequence of actions to be taken
625 // if the assertion fails. The action will typically be to display
626 // extra information about what went wrong.
627 if (debug_arith && !ok)
628 { action();
629 const char **where;
630 std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
631 *where = location;
632 abort1();
633 }
634 }
635
assert1(bool ok,const char * why,const char * location)636 inline void assert1(bool ok, const char *why, const char *location)
637 {
638 // For simple use where a customised message is not required:
639 // arithlib_assert(predicate);
640 if (debug_arith && !ok)
641 { const char **where;
642 std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
643 *where = location;
644 abort1(why);
645 }
646 }
647
648 #define arithlib_assert(...) \
649 arithlib_implementation::assert1(__VA_ARGS__, \
650 "arithlib_assert(" #__VA_ARGS__ ")", \
651 __FILE__ " line " STRINGIFY(__LINE__))
652
653
654 // At times during development it is useful to be able to send messages
655 // to a log file.... This should not be used in final production code
656 // but still may help while debugging.
657
658 // Making this "inline" avoids warning messages if it is not
659 // used. So even though this may somewhat waste space when it is used,
660 // I like this option.
661
logprintf(const char * fmt,...)662 inline void logprintf(const char *fmt, ...)
663 {
664 // I use a fixed name for the log file. This is another respect in which
665 // this has to be seen as code only suitable for temporary use. I use static
666 // variables that are local to this function so that even if the header is
667 // included from multiple source files there will not be multiple-definition
668 // clashes, and since this is just for diagnostics I am not worried about
669 // any costs that this imposes.
670 static std::FILE *logfile = NULL;
671 const char **where;
672 std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
673 if (logfile == NULL) logfile = std::fopen("arith.log", "w");
674 std::va_list args;
675 va_start(args, fmt);
676 std::vfprintf(logfile, fmt, args);
677 va_end(args);
678 std::fflush(logfile);
679 }
680
traceprintf(const char * fmt,...)681 inline void traceprintf(const char *fmt, ...)
682 { const char **where;
683 std::lock_guard<std::mutex> lock(diagnostic_mutex(&where));
684 std::va_list args;
685 va_start(args, fmt);
686 std::vprintf(fmt, args);
687 va_end(args);
688 std::fflush(stdout);
689 }
690
691 // The C++ rules about shifts are not always very comfortable, in particular
692 // right shifts on signed values may or may not propagate the sign bit
693 // and a left shift on signed values might cause trouble in overflow cases.
694 // So here are versions that should behave consistently across all
695 // architectures. Well it is probable that from C++20 onwards the language
696 // specification will arrange that right shifts on arithmetic types
697 // propagate the sign in the way that I want, and it is also liable to be
698 // the case that g++ and clang on all the computers that interest me do things
699 // the "obvious way" already, so the code here is something of an exercise
700 // in pedantry.
701
// Arithmetic (sign-propagating) right shift for 32-bit values, behaving
// consistently across platforms: the shift is done on the unsigned
// representation and the sign bit is smeared back in by hand. A shift
// amount outside [0,31] is treated as zero.
inline std::int32_t ASR(std::int32_t a, std::int64_t n)
{ constexpr std::int64_t bits = 8*static_cast<int>(sizeof(std::int32_t));
  if (n<0 || n>=bits) n = 0;
  const std::uint32_t u = static_cast<std::uint32_t>(a);
  std::uint32_t r = u >> n;
  if (n != 0 && a < 0)                 // replicate the sign into the top n bits
      r |= ~(~static_cast<std::uint32_t>(0) >> n);
  return static_cast<std::int32_t>(r);
}
710
// Arithmetic right shift for 64-bit signed values; same contract as the
// 32-bit version above. A shift amount outside [0,63] is treated as zero.
inline std::int64_t ASR(std::int64_t a, std::int64_t n)
{ constexpr std::int64_t bits = 8*static_cast<int>(sizeof(std::int64_t));
  if (n<0 || n>=bits) n = 0;
  const std::uint64_t u = static_cast<std::uint64_t>(a);
  std::uint64_t r = u >> n;
  if (n != 0 && a < 0)                 // replicate the sign into the top n bits
      r |= ~(~static_cast<std::uint64_t>(0) >> n);
  return static_cast<std::int64_t>(r);
}
719
// Unsigned 64-bit flavour: reinterpret the bits as signed, shift with
// sign propagation, and hand the bit pattern back as unsigned.
inline std::uint64_t ASR(std::uint64_t a, std::int64_t n)
{ return static_cast<std::uint64_t>(ASR(static_cast<std::int64_t>(a), n));
}
723
724 // The behaviour of left shifts on negative (signed) values seems to be
725 // labelled as undefined in C/C++, so any time I am going to do a left shift
726 // I need to work in an unsigned type. Well at some stage in the future it
727 // may be that C++ will insist that signed integers are handled in 2s
728 // complement form and shifts on them behave "as naively expected" but at
729 // present that can not be relied upon.
730
// In the comparisons I cast the sizeof value to an int so that I can compare
// it against the signed amount n.
733
// Left shift for 32-bit signed values with defined behaviour even for
// negative inputs: the shift happens in the unsigned domain. Shift
// amounts outside [0,31] are treated as zero.
inline std::int32_t ASL(std::int32_t a, std::int64_t n)
{ if (n < 0 || n >= 8*static_cast<int>(sizeof(std::uint32_t))) n = 0;
  const std::uint32_t u = static_cast<std::uint32_t>(a);
  return static_cast<std::int32_t>(u << n);
}
738
// Left shift for 64-bit signed values; same contract as the 32-bit
// version. Shift amounts outside [0,63] are treated as zero.
inline std::int64_t ASL(std::int64_t a, std::int64_t n)
{ if (n < 0 || n >= 8*static_cast<int>(sizeof(std::uint64_t))) n = 0;
  const std::uint64_t u = static_cast<std::uint64_t>(a);
  return static_cast<std::int64_t>(u << n);
}
743
744 //=========================================================================
745 //=========================================================================
746 // I want to make C++ output using the "<<" operator on an ostream cope
747 // with big numbers. Doing so makes their use much smoother. The particular
748 // aspect of this addresses here is the provision of an IO manipulator
// called "std::bin" that sets for binary display of bignums (but not of
750 // other integer types).
751 //=========================================================================
752 //=========================================================================
753
struct radix
{
public:
// Binary display can not be requested through the standard std::ios_base
// radix flags, so a per-stream word (via xalloc/iword) records the
// request. Setting binary also clears dec/oct/hex, so binary output is
// produced only while none of the standard radix flags is set; selecting
// any of those later re-enables the normal behaviour. A stream that has
// never been through set_binary_output keeps iword's default of zero and
// so never displays in binary.
    static void set_binary_output(std::ios_base &s)
    { flag(s) = 1;
      s.unsetf(std::ios_base::dec |
               std::ios_base::oct |
               std::ios_base::hex);
    }
    static bool is_binary_output(std::ios_base &s)
    { return flag(s) != 0;
    }
private:
// One slot index shared by every stream, allocated on first use.
    static long & flag(std::ios_base &s)
    { static int slot = std::ios_base::xalloc();
      return s.iword(slot);
    }
};
778
779 } // temporary end of namespace arithlib
780
781 // I want a new io manipulator "std::bin" to select binary mode output.
782 // This will be used much as std::oct, std::dec and std::hex.
783
784 namespace std
bin(std::ostream & os)785 { inline std::ostream &bin(std::ostream &os)
786 { arithlib_implementation::radix::set_binary_output(os);
787 return os;
788 }
789 }
790
791
792
793 namespace arithlib_implementation
794 {
795
796 // When I get to big-integer multiplication I will use two worker threads
797 // so that elapsed times for really large multiplications are reduced
798 // somewhat. Well ideally by a factor approaching 3. I set up a framework
799 // of support for the threads here. Each main program thread will want
800 // its own pair of worker threads here. Each worker thread gets passed a
801 // nice object called "worker_data" that encapsulates the way it receives
802 // data from the caller and passes results back.
803
804 // Each worker thread needs some data that it shares with the main thread.
805 // this structure encapsulates that.
806
807 // Probably the most "official" way to coordinate threads would be to use
808 // condition variables, but doing so involves several synchronization
809 // primitives for each step of the transaction. For the simple level of
// coordination I need here it would be more costly than necessary. I can
// manage here with a scheme that when thread A wants to allow thread B to
812 // proceed it unlocks a mutex that thread B was waiting on. There is some
813 // mess associated with ensuring that the main thread waits for results and
814 // that there are no race situations where all threads compete for a single
815 // mutex.
816 //
817 // There are 4 mutexes for each worker thread, but each synchronization step
818 // just involves a single mutex, transferring ownership between main and worker
// thread. Here is the pattern of transfer and subsequent ownership, with "?"
// marking a mutex that has been waiting and the ">n" or "<n" in the middle
// also showing which mutex has just been activated:
822 // X X . . ? . X X Idle state. Worker waiting on mutex 0
823 // To start a transaction the main thread sets up data and unlocks mutex 0.
824 // That allows the worker to proceed and pick up the data.
825 // . X . . >0 ?X . X X first transaction
826 // The main thread must not alter data until the worker is finished. It waits
827 // on mutex 1 until the worker tells it that a result is available.
828 // . X ?X . <2 X . . X
// The main thread is now entitled to start using the results of the activity
// just completed and setting up data for the next one. It can not release
// mutex 0 to restart the worker because the worker already owns that. And even
// though it owns mutex 2 it had better not release that, because for that
// to make sense the worker would need to be waiting on it, and that would mean
// the worker had just done m3.unlock(); m3.lock() in quick succession, which
835 // might have led it to grab m3 rather than the main program managing to. So
836 // use the third mutex, which the worker must be waiting on.
837 // . . X . >1 X ?X . X second transaction
838 // When it has finished its task the worker now unlocks mutex 3. This leaves
839 // a situation symmetric with the initial one
840 // . . X ?X <3 X X . .
841 // . . . X >2 X X ?X . third transaction
842 // ?X . . X <0 . X X .
843 // X . . . >3 . X X ?X fourth transaction
844 // X ?X . . <1 . . X X back in initial configuration!
845 //
846 // The pattern is thus that the master goes
847 // [initially lock 0 and 1]
848 // unlock 0 wait 2
849 // unlock 1 wait 3
850 // unlock 2 wait 0
851 // unlock 3 wait 1
852 // while the worker goes
853 // [initially lock 2 and 3]
854 // wait 0 unlock 2
855 // wait 1 unlock 3
856 // wait 2 unlock 0
857 // wait 3 unlock 1
858 // Observe that I can use (n^2) to map between the mutex number in the first
859 // and second columns here. That counting is what send_count and
860 // receive_count are doing.
861
862 // In a nice world I would use just the C++ std::mutex scheme for
863 // synchronization, however here I am performance critical and to save
864 // a bit when performing medium-sized multiplications I will use the
865 // Microsoft version of mutexes directly on that platform.
866
867 #if defined __CYGWIN__ || defined __MINGW32__
868 #define USE_MICROSOFT_MUTEX 1
869
870 extern "C"
871 { struct SecApp
872 { std::uintptr_t nLength;
873 void *lpSecurityDescriptor;
874 int bINheritHandle;
875 };
876
877 extern __declspec(dllimport) void *
878 CreateMutexA(SecApp *, std::uintptr_t, const char *);
879 extern __declspec(dllimport) int CloseHandle(void *h);
880 extern __declspec(dllimport) int ReleaseMutex(void *m);
881 extern __declspec(dllimport) void *
882 WaitForSingleObject(void *, std::uintptr_t);
883 };
884
885 #endif // __CYGWIN__ or __MINGW32__
886
class Worker_data
{
public:
// Set true by the worker thread once it has started; DriverData's
// constructor busy-waits on this flag before allowing any work to be
// posted.
    std::atomic<bool> ready;
#ifdef USE_MICROSOFT_MUTEX
// Native Windows mutex handles (see the extern "C" declarations above);
// created in the constructor and closed in the destructor.
    void *mutex[4];
#else
    std::mutex mutex[4];
#endif
// Set by the main thread (see ~DriverData) to ask the worker to stop.
    bool quit_flag;
// Task data exchanged with the worker. From the names these look like two
// input digit vectors (a of lena words, b of lenb words), an output
// vector c and workspace w -- the code that fills them in is elsewhere in
// this file, so treat that reading as provisional.
    const std::uint64_t *a;
    std::size_t lena;
    const std::uint64_t *b;
    std::size_t lenb;
    std::uint64_t *c;
    std::uint64_t *w;

// When I construct an instance of Worker data I set its quit_flag to
// false and lock two of the mutexes. That sets things up so that when
// it is passed to a new worker thread that thread behaves in the way I
// want it to.
    Worker_data()
    { ready = false;
      quit_flag = false;
#ifdef USE_MICROSOFT_MUTEX
// Second argument 1 creates the mutex already owned by this (the main)
// thread; 0 leaves it free for the worker to claim.
      mutex[0] = CreateMutexA(NULL, 1, NULL);
      mutex[1] = CreateMutexA(NULL, 1, NULL);
      mutex[2] = CreateMutexA(NULL, 0, NULL);
      mutex[3] = CreateMutexA(NULL, 0, NULL);
#else
// The next two must be locked by the main thread.
      mutex[0].lock();
      mutex[1].lock();
#endif
    }
#ifdef USE_MICROSOFT_MUTEX
// Windows mutex handles must be released explicitly; std::mutex members
// (the other branch) clean themselves up.
    ~Worker_data()
    { CloseHandle(mutex[0]);
      CloseHandle(mutex[1]);
      CloseHandle(mutex[2]);
      CloseHandle(mutex[3]);
    }
#endif
};
931
932 inline void worker_thread(Worker_data *w);
933
// Then each main thread will have a structure that encapsulates the
// two worker threads that it ends up with and the data it sets up for
// them and that they then access. When this structure is created it will
937 // cause the worker threads and the data block they need to be constructed.
938
939 class DriverData
940 {
941 public:
942 int send_count = 0;
943 Worker_data wd_0,
944 wd_1;
945 // When an instance of DriverData is created the two sets of Worker_data
946 // get constructed with two of their mutexes locked. This will mean that when
947 // worker threads are created and start running they will politely wait for
948 // work.
949
950 std::thread w_0, w_1;
DriverData()951 DriverData()
952 { w_0 = std::thread(worker_thread, &wd_0),
953 w_1 = std::thread(worker_thread, &wd_1);
954 // I busy-wait until the two threads have both claimed the mutexes that they
955 // need to own at the start! Without this the main thread may post a
956 // multiplication, so its part of the work and try to check that the worker
957 // has finished (by claiming one of these mutexes) before the worker thread
958 // has got started up and has claimed them. This feels clumsy, but it only
959 // happens at system-startup.
960 while (!wd_0.ready.load() && !wd_1.ready.load())
961 std::this_thread::sleep_for(std::chrono::microseconds(1));
962 }
963
964 // When the DriverData object is to be destroyed it must arrange to
965 // stop and then join the two threads that it set up. This code that sends
966 // a "quit" message to each thread will be executed before the thread object
967 // is deleted, and the destructor of the thread object should be activated
968 // before that of the Worker_data and the mutexes within that.
969
~DriverData()970 ~DriverData()
971 { wd_0.quit_flag = wd_1.quit_flag = true;
972 release_workers();
973 w_0.join();
974 w_1.join();
975 }
976
977 // Using the worker threads is then rather easy: one sets up data in
978 // the Worker_data structures and then call release_workers(). Then
979 // you can do your own thing in parallel with the two workers picking up
980 // the tasks that they had been given. When you are ready you call
981 // wait_for_workers() which does what one might imagine, and the workers
982 // are expected to have left their results in the Worker_data object so
983 // you can find it.
984
release_workers()985 void release_workers()
986 {
987 #ifdef USE_MICROSOFT_MUTEX
988 ReleaseMutex(wd_0.mutex[send_count]);
989 ReleaseMutex(wd_1.mutex[send_count]);
990 #else
991 wd_0.mutex[send_count].unlock();
992 wd_1.mutex[send_count].unlock();
993 #endif
994 }
995
wait_for_workers()996 void wait_for_workers()
997 {
998 #ifdef USE_MICROSOFT_MUTEX
999 WaitForSingleObject(wd_0.mutex[send_count^2], 0xffffffff);
1000 WaitForSingleObject(wd_1.mutex[send_count^2], 0xffffffff);
1001 #else
1002 wd_0.mutex[send_count^2].lock();
1003 wd_1.mutex[send_count^2].lock();
1004 #endif
1005 send_count = (send_count+1)&3;
1006 }
1007
1008 };
1009
1010 // I encapsulate the driver data in this function, which will ensure that
1011 // exactly one copy gets made for each top-level thread that calls this,
1012 // ie that uses a huge multiplication.
1013
getDriverData()1014 inline DriverData *getDriverData()
1015 { static thread_local DriverData dd;
1016 return ⅆ
1017 }
1018
1019 // Declare a number of functions that might usefully be used elsewhere. If
1020 // I declare them "inline" then it will be OK even if I include this header
1021 // from multiple source files because only one copy should end up in the
1022 // eventually-linked executable.
1023
1024 inline std::uint64_t *reserve(std::size_t n);
1025 inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
1026 std::size_t final);
1027 inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
1028 std::size_t final);
1029 inline void abandon(std::uint64_t *p);
1030 inline void abandon(std::intptr_t h);
1031
1032 #if defined LISP && !defined ZAPPA
1033 typedef std::intptr_t string_handle;
1034 #else
1035 typedef char *string_handle;
1036 #endif
1037
1038 inline string_handle confirm_size_string(char *p, std::size_t n,
1039 std::size_t final);
1040 inline void abandon_string(string_handle);
1041
1042 inline std::intptr_t vector_to_handle(std::uint64_t *p);
1043 inline std::uint64_t *vector_of_handle(std::intptr_t n);
1044 inline std::size_t number_size(std::uint64_t *p);
1045
1046 inline bool fits_into_fixnum(std::int64_t n);
1047 inline std::intptr_t int_to_handle(std::int64_t n);
1048 constexpr inline std::int64_t int_of_handle(std::intptr_t n);
1049
1050 inline std::intptr_t string_to_bignum(const char *s);
1051 inline std::intptr_t int_to_bignum(std::int64_t n);
1052 inline std::intptr_t unsigned_int_to_bignum(std::uint64_t n);
1053 inline std::intptr_t round_double_to_int(double d);
1054 inline std::intptr_t trunc_double_to_int(double d);
1055 inline std::intptr_t floor_double_to_int(double d);
1056 inline std::intptr_t ceiling_double_to_int(double d);
1057 #ifdef softfloat_h
1058 inline std::intptr_t round_float128_to_int(float128_t d);
1059 inline std::intptr_t trunc_float128_to_int(float128_t d);
1060 inline std::intptr_t floor_float128_to_int(float128_t d);
1061 inline std::intptr_t ceiling_float128_to_int(float128_t d);
1062 #endif // softfloat_h
1063 inline std::intptr_t uniform_positive(std::size_t n);
1064 inline std::intptr_t uniform_signed(std::size_t n);
1065 inline std::intptr_t uniform_upto(std::intptr_t a);
1066 inline std::intptr_t random_upto_bits(std::size_t bits);
1067 inline std::intptr_t fudge_distribution(std::intptr_t, int);
1068
1069 #ifdef PRECISE_GC
1070 inline void push(std::intptr_t w);
1071 inline void pop(std::intptr_t& w);
1072
1073 // In many cases where I want to save things I will have a reference to
1074 // an array of uint64_t objects, so when I push it I must convert it back to
1075 // a Lisp-friendly form.
// Convert the raw digit pointer to its handle form before pushing so the
// (precise) garbage collector sees a properly tagged value.
inline void push(std::uint64_t *p)
{ push(vector_to_handle(p));
}
// Recover the handle and turn it back into a digit pointer.
inline void pop(std::uint64_t *&p)
{ std::intptr_t w;
  pop(w);
  p = vector_of_handle(w);
}
#else
// In cases where these are not required I will just define them as
// empty procedures and hope that the C++ compiler will inline them and
// hence lead to them not adding any overhead at all.
inline void push(std::intptr_t p)
{}
inline void pop(std::intptr_t& p)
{}

inline void push(std::uint64_t *p)
{}
inline void pop(std::uint64_t *&p)
{}

inline void push(const std::uint64_t *p)
{}
inline void pop(const std::uint64_t *&p)
{}
1102
1103 #endif
1104
1105 #if defined MALLOC
1106
1107 //=========================================================================
1108 //=========================================================================
1109 // The MALLOC option is perhaps the simplest! It uses C style
1110 // malloc/realloc and free functions and the vector if turned into a
1111 // handle by just casting it to an intptr_t value.
1112 //=========================================================================
1113 //=========================================================================
1114
1115
1116 // The following are provided so that a user can update malloc_function,
1117 // realloc_function and free_function to refer to their own choice of
1118 // allocation technology. Well it is a bit uglier than that! If you compile
1119 // using C++17 then the function pointers here are per_thread across every
1120 // compilation unit and things behave "as you might expect". If you have an
// earlier C++ standard in play then each compilation unit gets its own static
1122 // set of variables that can be used to divert memory allocation, and hence
1123 // each compilation unit that includes this header is liable to need to reset
1124 // them all.
1125 // I view the ability to replace the allocation functions as somewhat
1126 // specialist and to be used by people who will have the skill to modify this
1127 // code as necessary, so this slight oddity does not worry me.
1128
1129 typedef void *malloc_t(size_t n);
1130 typedef void *realloc_t(void *, std::size_t);
1131 typedef void free_t(void *);
1132
1133 INLINE_VAR malloc_t *malloc_function = std::malloc;
1134 INLINE_VAR realloc_t *realloc_function = std::realloc;
1135 INLINE_VAR free_t *free_function = std::free;
1136
reserve(std::size_t n)1137 inline std::uint64_t *reserve(std::size_t n)
1138 { arithlib_assert(n>0);
1139 std::uint64_t *r = reinterpret_cast<std::uint64_t *>
1140 (*malloc_function)((
1141 n+1)*sizeof(std::uint64_t));
1142 arithlib_assert(r != NULL);
1143 return &r[1];
1144 }
1145
confirm_size(std::uint64_t * p,std::size_t n,std::size_t final)1146 inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
1147 std::size_t final)
1148 { artithlib_assert(final>0 && n>=final);
1149 p = reinterpret_cast<std::uint64_t *>()
1150 (*realloc_function)((void *)&p[-1],
1151 (final_n+1)*sizeof(std::uint64_t));
1152 arithlib_assert(p != NULL);
1153 p[0] = final_n;
1154 return vector_to_handle(&p[1]);
1155 }
1156
1157 // In this model confirm_size_x() is just the same as confirm_size().
1158
confirm_size_x(std::uint64_t * p,std::size_t n,std::size_t final)1159 inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
1160 std::size_t final)
1161 { arithlib_assert(final>0 && n>=final);
1162 confirm_size(p, n, final);
1163 }
1164
abandon(std::uint64_t * p)1165 inline void abandon(std::uint64_t *p)
1166 { (*free_function)((void *)&p[-1]);
1167 }
1168
1169 // Note that I allocate space for the string data plus a NUL terminating byte.
1170
reserve_string(std::size_t n)1171 inline char *reserve_string(std::size_t n)
1172 { char *r = reinterpret_cast<char *>(*malloc_function)(n+1);
1173 arithlib_assert(r != NULL);
1174 return r;
1175 }
1176
1177 // When I confirm the size of a string all I do is to write a NUL byte
1178 // to terminate it. I expect the code to have reserved an amount of memory
1179 // that is not much longer than the amount that I will need, so using
1180 // realloc to shrink things to the exact size that is filled would not
1181 // be a good bargain.
1182
confirm_size_string(char * p,std::size_t n,std::size_t final)1183 inline char *confirm_size_string(char *p, std::size_t n,
1184 std::size_t final)
1185 { arithlib_assert(final>0 && (n+9)>final);
1186 r[final] = 0;
1187 return r;
1188 }
1189
abandon_string(char * s)1190 inline void abandon_string(char *s)
1191 { (*free_function)(s);
1192 }
1193
1194 // In the C/malloc model I will represent a number by the intptr_t style
1195 // integer that is obtained from a pointer to the first digit of the bignum.
1196
// In the C/malloc model a handle is simply the bit-pattern of the
// pointer to the first digit, viewed as an intptr_t.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{ return reinterpret_cast<std::intptr_t>(p);
}
1200
// Inverse of vector_to_handle(): reinterpret the handle's bit-pattern
// as a pointer to the bignum's first digit.
inline std::uint64_t *vector_of_handle(std::intptr_t n)
{ return reinterpret_cast<std::uint64_t *>(n);
}
1204
number_size(std::uint64_t * p)1205 inline std::size_t number_size(std::uint64_t *p)
1206 { arithlib_assert(p[-1]!=0);
1207 return p[-1];
1208 }
1209
1210 // When I use Bignums that are allocated using malloc() and operated on
1211 // via C++ overloaded operators I often need to copy the data. However when
1212 // memory management uses garbage collection I can allow multiple references
1213 // to the same object and so copying is not needed as much. I have two
1214 // copying functions. One only copies if the system is using MALLOC or
1215 // NEW (but leaves data sharable on systems with garbage collection) while
1216 // the other unconditionally copies.
1217
always_copy_bignum(std::uint64_t * p)1218 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
1219 { std::size_t n = p[-1];
1220 push(p);
1221 std::uint64_t *r = reserve(n);
1222 pop(p);
1223 std::memcpy(r, p, n*sizeof(std::uint64_t));
1224 return confirm_size(r, n, n);
1225 }
1226
copy_if_no_garbage_collector(std::uint64_t * p)1227 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
1228 { std::size_t n = p[-1];
1229 push(p);
1230 std::uint64_t *r = reserve(n);
1231 pop(p);
1232 std::memcpy(r, p, n*sizeof(std::uint64_t));
1233 return confirm_size(r, n, n);
1234 }
1235
copy_if_no_garbage_collector(std::intptr_t pp)1236 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
1237 { if (stored_as_fixnum(pp)) return pp;
1238 std::uint64_t *p = vector_of_handle(pp);
1239 std::size_t n = number_size(p);
1240 push(p);
1241 std::uint64_t *r = reserve(n);
1242 pop(p);
1243 std::memcpy(r, p, n*sizeof(std::uint64_t));
1244 return confirm_size(r, n, n);
1245 }
1246
1247 #elif defined NEW
1248
1249 //=========================================================================
1250 //=========================================================================
1251 // The NEW code is intended to be a reasonably sensible implementation for
1252 // use of this code free-standing within C++. Memory is allocated in units
1253 // whose size is a power of 2, and I keep track of memory that I have used
1254 // and discarded so that I do not need to go back to the system-provided
1255 // allocator too often.
1256 //=========================================================================
1257 //=========================================================================
1258
1259
1260 inline unsigned int log_next_power_of_2(std::size_t n);
1261
1262 // There is a serious issue here as regards thread safety. And a subsidiary
1263 // one about C++17.
1264 //
1265 // As things stand if you use C++ memory allocation the local allocation
1266 // and reallocation here is not thread safe. The result could be a disaster
1267 // if multiple threads used big-numbers via the C++ type Bignum. Note that
1268 // if you do not have "NEW" defined but instead use MALLOC or LISP you are
1269 // safe.
1270
1271 // CONCURRENCY SUPPORT:
1272 // I provide three options, and the selection as to which is used
1273 // can be made by predefining a symbol.
1274 //
1275 // ARITHLIB_MUTEX. Use a mutex to protect memory allocation. One hope
1276 // is that in situations where there is low contention the
1277 // overhead will be small, and so this is the default.
1278 // ARITHLIB_THREAD_LOCAL. Have a separate memory pool for use by each thread.
1279 // This can use more memory than a scheme that uses shared
1280 // allocation.
1281 // ARITHLIB_NO_THREADS. Assume that no concurrent use of arithlib will
1282 // arise and so no extra complication or overhead is needed.
1283 // I do not make this the default because I can imagine
1284 // people extending a program to use threads and then not
1285 // looking here!
1286 //
1287 // I looked into having an ARITHLIB_LOCK_FREE to use compare-and-swap
1288 // operations to maintain the freestore pool, but support via gcc on x86_64
1289 // is uncertain (in part because not all variants on x86_64 CPUs implement
1290 // the CMPXCHG16B instruction (though soon the ones that do not will count as
1291 // museum pieces), and also because reports on the performance impact
// do not seem quite clear-cut. I also see various unspecific comments about
1293 // patents, but I have not followed them up.
1294 // I could fairly happily imagine replacing the discard() code with a lock
1295 // free "push" operation, which is simple to implement and does not suffer
1296 // from the "ABA" issue. allocate() seems to be harder to implement and
1297 // especially harder to implement in a way that will give reliably high
// performance across all platforms while avoiding machine dependent
1299 // components and especially in-line assembly code.
1300
1301 // If no options has been pre-defined I will default to ARITHLIB_MUTEX
1302 #if !defined ARITHLIB_MUTEX && \
1303 !defined ARITHLIB_THREAD_LOCAL && \
1304 !defined ARITHLIB_NO_THREADS
1305 #define ARITHLIB_MUTEX 1
1306 #endif
1307
// Attempts to select more than one option at once get detected and moaned
1309 // about.
1310
1311 #if (defined ARITHLIB_MUTEX && defined ARITHLIB_THREAD_LOCAL) || \
1312 (defined ARITHLIB_THREAD_LOCAL && defined ARITHLIB_NO_THREADS) || \
1313 (defined ARITHLIB_NO_THREADS && defined ARITHLIB_MUTEX)
1314 #error Only one thread-support policy can be selected.
1315 #endif
1316
1317 #ifdef ARITHLIB_MUTEX
// Return a reference to the single mutex guarding freechain updates.
// A function-local static is used so the mutex is constructed
// (thread-safely) on first use, avoiding static-initialization-order
// problems for a header-only library.
inline std::mutex& freechain_mutex()
{
    static std::mutex freechain_guard;
    return freechain_guard;
}
1322 #endif
1323
// Each of the 64 chains holds discarded blocks of one power-of-2 size:
// entry i chains blocks of 2^i words, linked through their second word
// (see Freechains::abandon below).
typedef std::uint64_t *freechain_table_type[64];

#ifdef ARITHLIB_THREAD_LOCAL
// One table per thread, so no locking is ever needed.
// NOTE(review): this declares a *variable* named freechain_table, yet
// the code below always accesses the table as freechain_table::get() -
// verify that the ARITHLIB_THREAD_LOCAL configuration actually compiles.
inline thread_local freechain_table_type freechain_table;
#else // ARITHLIB_THREAD_LOCAL
#ifdef __cpp_inline_variables
// C++17 and later: an inline static data member yields exactly one
// shared table across every translation unit including this header.
class freechain_table
{   static inline freechain_table_type freechain_table_Var;
public:
    static freechain_table_type& get()
    {   return freechain_table_Var;
    }
};
#else // __cpp_inline_variables
// Pre-C++17 fallback: a function-local static provides the single
// shared table without needing inline-variable support.
class freechain_table
{
public:
    static freechain_table_type& get()
    {   static freechain_table_type freechain_table_Var;
        return freechain_table_Var;
    }
};
#endif // __cpp_inline_variables
#endif // ARITHLIB_THREAD_LOCAL
1348
1349 class Freechains
1350 {
1351 private:
1352 // The following obscure line ensures that when I make an instance
1353 // of Freechains it forces the standard streams into existence. Having
1354 // that initilization happening early helps a LITTLE bit to reassure me
1355 // that the standard streams should still be alive during the destructor
1356 // for this class.
1357 std::ios_base::Init force_iostreams_initialization;
1358
1359 public:
Freechains()1360 Freechains()
1361 { for (int i=0; i<64; i++) (freechain_table::get())[i] = NULL;
1362 }
1363
~Freechains()1364 ~Freechains()
1365 { for (std::size_t i=0; i<64; i++)
1366 { std::uint64_t *f = (freechain_table::get())[i];
1367 if (debug_arith)
1368 // Report how many entries there are in the freechain.
1369 { std::size_t n = 0;
1370 // To arrange that double-free mistakes are detected I arrange to put -1
1371 // in the initial word of any deleted block, so that all blocks on the
1372 // freechains here should show that. I set and test for that in the other
1373 // bits of code that allocate or release memory.
1374 for (std::uint64_t *b=f; b!=NULL; b = (std::uint64_t *)b[1])
1375 { arithlib_assert(b[0] == -static_cast<std::uint64_t>(1));
1376 n++;
1377 }
1378 if (n != 0)
1379 std::cout << "Freechain " << i << " length: "
1380 << n << " " << ((static_cast<std::size_t>(1))<<i)
1381 << std::endl;
1382 }
1383 while (f != NULL)
1384 { std::uint64_t w = f[1];
1385 delete [] f;
1386 f = reinterpret_cast<std::uint64_t *>(w);
1387 }
1388 (freechain_table::get())[i] = NULL;
1389 }
1390 }
1391
allocate(std::size_t n)1392 static std::uint64_t *allocate(std::size_t n)
1393 {
1394 #ifdef DEBUG_OVERRUN
1395 // If I am debugging I can allocate an extra word that will lie
1396 // just beyond what would have been the end of my block, and I will
1397 // fill everything with a pattern that might let me spot some cases where
1398 // I write beyond the proper size of data. This option is only supported
1399 // when storage allocation is using NEW.
1400 int bits = log_next_power_of_2(n+1);
1401 #else
1402 // Finding the number of bits in n is achieved by counting the leading
1403 // zeros in the representation of n, and on many platforms an intrinsic
1404 // function will compile this into a single machine code instruction.
1405 int bits = log_next_power_of_2(n);
1406 #endif
1407 arithlib_assert(n<=((static_cast<std::size_t>(1))<<bits) && n>0);
1408 std::uint64_t *r;
1409 #if defined ARITHLIB_THREAD_LOCAL || ARITHLIB_NO_THREADS
1410 r = (freechain_table::get())[bits];
1411 if (r != NULL)
1412 (freechain_table::get())[bits] = reinterpret_cast<std::uint64_t *>
1413 (r)[1];
1414 #elif defined ARITHLIB_MUTEX
1415 { std::lock_guard<std::mutex> lock(freechain_mutex());
1416 r = (freechain_table::get())[bits];
1417 if (r != NULL)
1418 (freechain_table::get())[bits] =
1419 reinterpret_cast<std::uint64_t *>(r[1]);
1420 }
1421 #else
1422 #error Internal inconsistency in arithlib.hpp: memory allocation strategy.
1423 #endif
1424 // If no memory had been found on the freechain I need to allocate some
1425 // more.
1426 if (r == NULL)
1427 { r = new std::uint64_t[(static_cast<std::size_t>(1))<<bits];
1428 if (r == NULL) arithlib_abort("Run out of memory");
1429 }
1430 #ifdef DEBUG_OVERRUN
1431 // When debugging I tend to initialize all memory to a known pattern
1432 // when it is allocated and check that the final word has not been
1433 // overwritten when I release it. This is not foolproof but it helps me
1434 // by being able to detect many possible cases of overrun.
1435 if (debug_arith)
1436 { std::memset(r, 0xaa,
1437 (static_cast<std::size_t>(1)<<bits)*sizeof(std::uint64_t));
1438 r[0] = 0;
1439 }
1440 #endif
1441 // Just the first 32-bits of the header word record the block capacity.
1442 // The casts here look (and indeed are) ugly, but when I store data into
1443 // memory as a 32-bit value that is how I will read it later on, and the
1444 // messy notation here does not correspond to complicated computation.
1445 reinterpret_cast<std::uint32_t *>(r)[0] = bits;
1446 return r;
1447 }
1448
1449 // When I abandon a memory block I will push it onto a relevant free chain.
1450
abandon(std::uint64_t * p)1451 static void abandon(std::uint64_t *p)
1452 { arithlib_assert(p[0] != -static_cast<std::uint64_t>(1));
1453 int bits = reinterpret_cast<std::uint32_t *>(p)[0];
1454 arithlib_assert(bits>0 && bits<48);
1455 // Here I assume that sizeof(uint64_t) >= sizeof(intptr_t) so I am not
1456 // going to lose information here on any platform I can consider.
1457 if (debug_arith) p[0] = -static_cast<std::uint64_t>(1);
1458 #ifdef ARITHLIB_ATOMIC
1459 lockfree_push(p, freechain_table::get(), bits);
1460 #else
1461 #ifdef ARITHLIB_MUTEX
1462 std::lock_guard<std::mutex> lock(freechain_mutex());
1463 #endif
1464 p[1] = reinterpret_cast<std::uint64_t>(freechain_table::get()[bits]);
1465 (freechain_table::get())[bits] = p;
1466 #endif
1467 }
1468
1469 };
1470
#ifdef ARITHLIB_THREAD_LOCAL
// A per-thread Freechains object; its destructor returns that thread's
// pooled memory when the thread ends.
// NOTE(review): the name here is "freechain" while the code below
// refers to freechains::get() - verify that this configuration is
// actually exercised and compiles.
inline thread_local Freechains freechain;
#else // ARITHLIB_THREAD_LOCAL
#ifdef __cpp_inline_variables
// C++17: one shared Freechains instance as an inline static member.
class freechains
{   static inline Freechains freechains_Var;
public:
    static Freechains& get()
    {   return freechains_Var;
    }
};
#else // __cpp_inline_variables
// Pre-C++17: Meyers-singleton style access to the one shared instance.
class freechains
{
public:
    static Freechains& get()
    {   static Freechains freechains_Var;
        return freechains_Var;
    }
};
#endif // __cpp_inline_variables
#endif // ARITHLIB_THREAD_LOCAL
1493
reserve(std::size_t n)1494 inline std::uint64_t *reserve(std::size_t n)
1495 { arithlib_assert(n>0);
1496 return &(freechains::get().allocate(n+1))[1];
1497 }
1498
// Finish off an allocation made with reserve(): p addresses a digit
// vector that had room for n digits of which only final are used.
// Returns a handle for the value - either an immediate fixnum (when
// the result is one digit and small enough) or a tagged pointer.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{   arithlib_assert(final>0 && n>=final);
// Verify that the word just beyond where anything should have been
// stored has not been clobbered.
#ifdef DEBUG_OVERRUN
    if (debug_arith) arithlib_assert(p[n] == 0xaaaaaaaaaaaaaaaaU);
#endif
// A single-digit value in fixnum range becomes an immediate handle and
// the vector is recycled at once.
    if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
    {   std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
        freechains::get().abandon(&p[-1]);
        return r;
    }
// I compare the final size with the capacity and if it is a LOT smaller
// I allocate a new smaller block and copy the data across.
// That situation can most plausibly arise when two similar-valued big
// numbers are subtracted.
    int bits = reinterpret_cast<std::uint32_t *>(&p[-1])[0];
    std::size_t capacity = (static_cast<std::size_t>(1))<<bits;
    if (capacity > 4*final)
    {   std::uint64_t *w =
            freechains::get().allocate(
                (static_cast<std::size_t>(1))<<log_next_power_of_2(final+1));
        std::memcpy(&w[1], p, final*sizeof(std::uint64_t));
        freechains::get().abandon(&p[-1]);
        p = &w[1];
    }
// The second 32-bit half of the header word records the number of
// digits actually in use (number_size() reads it back).
    reinterpret_cast<std::uint32_t *>(&p[-1])[1] = final;
    return vector_to_handle(p);
}
1529
// Variant of confirm_size() - presumably for use when the vector being
// confirmed is not the most recently allocated one (TODO confirm
// against the other memory models). With this allocator no distinction
// is needed, so simply delegate.
inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
                                    std::size_t final)
{   arithlib_assert(final>0 && n>=final);
    return confirm_size(p, n, final);
}
1535
number_size(std::uint64_t * p)1536 inline std::size_t number_size(std::uint64_t *p)
1537 { std::size_t r = reinterpret_cast<std::uint32_t *>(&p[-1])[1];
1538 arithlib_assert(r>0);
1539 return reinterpret_cast<std::uint32_t *>(&p[-1])[1];
1540 }
1541
// Convert between a pointer to bignum digits and the intptr_t "handle"
// used to pass numbers around. With the NEW allocator the handle is
// simply the pointer viewed as an integer; blocks come from
// new std::uint64_t[] so the low bit is clear, which is what
// distinguishes a vector handle from a fixnum handle.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{
    std::intptr_t handle = reinterpret_cast<std::intptr_t>(p);
    return handle;
}

inline std::uint64_t *vector_of_handle(std::intptr_t n)
{
    std::uint64_t *digits = reinterpret_cast<std::uint64_t *>(n);
    return digits;
}
1549
// A handle whose bottom bit is set encodes a small ("fixnum") integer
// directly; otherwise it is a pointer to a digit vector.
inline bool stored_as_fixnum(std::intptr_t a)
{
    return (a & 1) == 1;
}

// Decode a fixnum handle: mask away the tag bit then halve. Division
// (rather than a shift) is used so behaviour on negative values is
// fully defined.
constexpr inline std::int64_t int_of_handle(std::intptr_t a)
{
    return (static_cast<std::int64_t>(a) &
            ~static_cast<std::int64_t>(1)) / 2;
}

// This function should only ever be called in situations where the
// arithmetic indicated will not overflow.
inline std::intptr_t int_to_handle(std::int64_t n)
{
    return static_cast<std::intptr_t>(2*n + 1);
}
1565
// The following two lines are slightly delicate in that INTPTR_MIN and _MAX
1567 // may not have the low tag bits to be proper fixnums. But if I implement
1568 // int_of_handle so that it ignores tag bits that will be OK.
1569
// The extreme values representable as fixnum handles, computed by
// decoding the extreme intptr_t values. INTPTR_MIN/MAX may not carry
// proper tag bits, but int_of_handle ignores the tag so this is safe.
INLINE_VAR const std::int64_t MIN_FIXNUM = int_of_handle(INTPTR_MIN);
INLINE_VAR const std::int64_t MAX_FIXNUM = int_of_handle(INTPTR_MAX);

// Test whether a 64-bit value is small enough to be stored immediately
// in a tagged fixnum handle.
inline bool fits_into_fixnum(std::int64_t a)
{   return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
}
1576
abandon(std::uint64_t * p)1577 inline void abandon(std::uint64_t *p)
1578 { freechains::get().abandon(&p[-1]);
1579 }
1580
abandon(std::intptr_t p)1581 inline void abandon(std::intptr_t p)
1582 { if (!stored_as_fixnum(p) && p!=0)
1583 { std::uint64_t *pp = vector_of_handle(p);
1584 freechains::get().abandon(&pp[-1]);
1585 }
1586 }
1587
reserve_string(std::size_t n)1588 inline char *reserve_string(std::size_t n)
1589 { return new char[n+1];
1590 }
1591
confirm_size_string(char * p,std::size_t n,std::size_t final)1592 inline char *confirm_size_string(char *p, std::size_t n,
1593 std::size_t final)
1594 { arithlib_assert(final>0 && (n+9)>final);
1595 p[final] = 0;
1596 return p;
1597 }
1598
abandon_string(char * s)1599 inline void abandon_string(char *s)
1600 { delete [] s;
1601 }
1602
1603 // In the NEW case I will want to make all operations cope with both
1604 // shorter integers (up to 63 bits) stored as the "handle", or genuine
1605 // big numbers where the handle yields a pointer to a vector of 64-bit
1606 // words. I do this by having a dispatch scheme that can activate code
1607 // for each of the mixtures of representations.
1608 //
1609
1610 // Dispatch as between mixed bignum and fixnum representations using some
1611 // template stuff and classes.
1612
1613 template <class OP,class RES>
op_dispatch1(std::intptr_t a1)1614 inline RES op_dispatch1(std::intptr_t a1)
1615 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
1616 else return OP::op(vector_of_handle(a1));
1617 }
1618
1619 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::int64_t n)1620 inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
1621 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1622 else return OP::op(vector_of_handle(a1), n);
1623 }
1624
1625 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::uint64_t * n)1626 inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
1627 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1628 else return OP::op(vector_of_handle(a1), n);
1629 }
1630
1631 template <class OP,class RES>
op_dispatch2(std::intptr_t a1,std::intptr_t a2)1632 inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
1633 { if (stored_as_fixnum(a1))
1634 { if (stored_as_fixnum(a2))
1635 return OP::op(int_of_handle(a1), int_of_handle(a2));
1636 else return OP::op(int_of_handle(a1), vector_of_handle(a2));
1637 }
1638 else
1639 { if (stored_as_fixnum(a2))
1640 return OP::op(vector_of_handle(a1), int_of_handle(a2));
1641 else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
1642 }
1643 }
1644
always_copy_bignum(std::uint64_t * p)1645 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
1646 { std::size_t n = reinterpret_cast<std::uint32_t *>(&p[-1])[1];
1647 push(p);
1648 std::uint64_t *r = reserve(n);
1649 pop(p);
1650 std::memcpy(r, p, n*sizeof(std::uint64_t));
1651 return confirm_size(r, n, n);
1652 }
1653
copy_if_no_garbage_collector(std::uint64_t * p)1654 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
1655 { std::size_t n = number_size(p);
1656 push(p);
1657 std::uint64_t *r = reserve(n);
1658 pop(p);
1659 std::memcpy(r, p, n*sizeof(std::uint64_t));
1660 return confirm_size(r, n, n);
1661 }
1662
copy_if_no_garbage_collector(std::intptr_t pp)1663 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
1664 { if (stored_as_fixnum(pp)) return pp;
1665 std::uint64_t *p = vector_of_handle(pp);
1666 std::size_t n = number_size(p);
1667 push(p);
1668 std::uint64_t *r = reserve(n);
1669 pop(p);
1670 std::memcpy(r, p, n*sizeof(std::uint64_t));
1671 return confirm_size(r, n, n);
1672 }
1673
1674 #elif defined LISP
1675
1676 //=========================================================================
1677 //=========================================================================
1678 // The LISP code is for incorporation in VSL or CSL or Zappa
1679 //=========================================================================
1680 //=========================================================================
1681
1682 // The code in this region needs to be adapted to work with whatever
1683 // language implementation you are going to use the arithmetic library from.
1684 // In my case this will be one of two fairly closely related Lisp systems,
1685 // where VSL is a "small" one and CSL is larger (and more complicated).
1686 // The code here is left visible and available since it may provide some
1687 // sort of model for anybody wanting to use this code in their own
1688 // project.
1689
1690 #if defined CSL
1691
1692 // The code here can only make sense in the context of the CSL sources,
1693 // and it is assumed that all the relevant CSL header files have already
1694 // been #included.
1695
// Allocate a CSL vector big enough for n 64-bit digits plus an 8-byte
// object header, returning a pointer to the first digit position.
inline std::uint64_t *reserve(std::size_t n)
{   arithlib_assert(n>0);
// During a transition period I will use TYPE_NEW_BIGNUM rather than
// TYPE_BIGNUM.
    LispObject a = get_basic_vector(TAG_NUMBERS, TYPE_NEW_BIGNUM,
                                    n*sizeof(std::uint64_t)+8);
// Strip the TAG_NUMBERS tag and skip the header to reach the digits.
    return reinterpret_cast<std::uint64_t *>(a + 8 - TAG_NUMBERS);
}
1704
// Finish off an allocation: of the n digits reserved only final are
// used. Returns either an immediate fixnum handle (single small digit)
// or a tagged pointer to the vector, whose header is rewritten to
// reflect the final length.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{   arithlib_assert(final>0 && n>=final);
// A one-digit value in fixnum range becomes an immediate handle; the
// vector is simply left for the garbage collector to reclaim.
    if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
    {   std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
        return r;
    }
// Note that pack_hdrlength() takes its argument measured in units of
// 32-bit words. That is because the way the length field is packed into
// an object header supported just that resolution (and special treatment is
// given to halfword, byte and bit-vectors to allow for their finer grain).
// The length also includes the size of a header-word, and on 32-bit platforms
// it has to allow for padding the data to allow the array of 64-bit digits
// to be properly aligned in memory.
    ((LispObject *)&p[-1])[0] =
        TAG_HDR_IMMED + TYPE_NEW_BIGNUM +
        pack_hdrlength((final+1)*sizeof(std::uint64_t)/sizeof(std::int32_t));
// If I am on a 32-bit system the data for a bignum is 8-byte aligned and
// that leaves a 4-byte gap after the header. In such a case I will write
// in a zero just to keep memory tidy.
    if (sizeof(LispObject) == 4)
        ((LispObject *)&p[-1])[1] = 0;
// Here I should reset fringe down by (final-n) perhaps. Think about that
// later!
    return vector_to_handle(p);
}
1731
// Variant of confirm_size() - presumably for vectors that are not the
// most recent allocation (TODO confirm); here it just delegates.
inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
                                    std::size_t final)
{   arithlib_assert(final>0 && n>=final);
// Here I might need to write a nice dummy object into the gap left by
// shrinking the object.
    return confirm_size(p, n, final);
}

// Convert between a raw pointer to the digit vector and a CSL tagged
// handle: digits start 8 bytes beyond the object header and the handle
// carries TAG_NUMBERS.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{   return reinterpret_cast<std::intptr_t>(p) - 8 + TAG_NUMBERS;
}

inline std::uint64_t *vector_of_handle(std::intptr_t n)
{   return reinterpret_cast<std::uint64_t *>(n + 8 - TAG_NUMBERS);
}
1747
// Return the number of 64-bit digits of the bignum whose digit vector
// is p, decoded from the CSL object header that precedes the digits.
inline std::size_t number_size(std::uint64_t *p)
{
// The odd looking cast here is because in arithlib I am passing around
// arrays of explicitly 64-bit values, however in the underlying Lisp
// I expect to be modelling memory as made up of intptr-sized items
// that I arrange to have aligned on 8-byte boundaries. So to show some
// thought about strict aliasing and the like I will access memory as
// an array of LispObject things when I access the header of an item.
    Header h = (Header)*(LispObject *)&p[-1];
    std::size_t r = length_of_header(h);
// On 32-bit systems a bignum will have a wasted 32-bit word after the
// header and before the digits, so that the digits are properly aligned
// in memory. The result will be that the bignum is laid out as follows
//      | hdr64 | digit64 | digit64 | ... |        (64-bit world)
//      | hdr32 | gap32 | digit64 | digit64 | ... | (32-bit world)
// The length value packed into the header is the length of the vector
// including its header.
// So subtract the 8 header bytes and convert to a digit count.
    r = (r-8)/sizeof(std::uint64_t);
    arithlib_assert(r>0);
    return r;
}
1769
// In the CSL world fixnum handling is delegated to the host system's
// immediate-integer primitives.
inline bool stored_as_fixnum(std::intptr_t a)
{   return is_fixnum(a);
}

// Extract the signed value from a CSL fixnum handle.
constexpr inline std::int64_t int_of_handle(std::intptr_t a)
{   return int_of_fixnum(a);
}

// Wrap a small integer as a CSL fixnum handle. Callers must ensure the
// value is within fixnum range.
inline std::intptr_t int_to_handle(std::int64_t n)
{   return fixnum_of_int(n);
}

// Extremes of the fixnum range, derived by decoding the extreme
// intptr_t values (tag bits are ignored by int_of_handle, so this is
// well defined).
INLINE_VAR const std::int64_t MIN_FIXNUM = int_of_handle(INTPTR_MIN);
INLINE_VAR const std::int64_t MAX_FIXNUM = int_of_handle(INTPTR_MAX);

inline bool fits_into_fixnum(std::int64_t a)
{   return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
}
1788
abandon(std::uint64_t * p)1789 inline void abandon(std::uint64_t *p)
1790 { // No need to do anything! But MIGHT reset fringe pointer?
1791 }
1792
abandon(std::intptr_t p)1793 inline void abandon(std::intptr_t p)
1794 { if (!stored_as_fixnum(p) && p!=0)
1795 { std::uint64_t *pp = vector_of_handle(p);
1796 abandon(pp);
1797 }
1798 }
1799
// Allocate a CSL string object with room for n characters (plus the
// header cell), returning a pointer to the character data.
inline char *reserve_string(std::size_t n)
{   LispObject a = get_basic_vector(TAG_VECTOR, TYPE_STRING_4,
                                    CELL+n);
    return reinterpret_cast<char *>(a - TAG_VECTOR + sizeof(LispObject));
}

// Fix the final length of a string by rewriting its header word, and
// return the tagged CSL handle. The length is shifted by (Tw+5) as the
// CSL header format requires.
inline LispObject confirm_size_string(char *p, std::size_t n,
                                      std::size_t final)
{   LispObject *a = (LispObject *)(p - sizeof(LispObject));
    *a = TAG_HDR_IMMED + TYPE_STRING_4 + (final<<(Tw+5));
    return (LispObject)a + TAG_VECTOR;
}

inline void abandon_string(string_handle s)
{   // Do nothing - the garbage collector owns string storage here.
}
1816
1817 template <class OP,class RES>
op_dispatch1(std::intptr_t a1)1818 inline RES op_dispatch1(std::intptr_t a1)
1819 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
1820 else return OP::op(vector_of_handle(a1));
1821 }
1822
1823 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::int64_t n)1824 inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
1825 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1826 else return OP::op(vector_of_handle(a1), n);
1827 }
1828
1829 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::uint64_t * n)1830 inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
1831 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1832 else return OP::op(vector_of_handle(a1), n);
1833 }
1834
1835 template <class OP,class RES>
op_dispatch2(std::intptr_t a1,std::intptr_t a2)1836 inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
1837 { if (stored_as_fixnum(a1))
1838 { if (stored_as_fixnum(a2))
1839 return OP::op(int_of_handle(a1), int_of_handle(a2));
1840 else return OP::op(int_of_handle(a1), vector_of_handle(a2));
1841 }
1842 else
1843 { if (stored_as_fixnum(a2))
1844 return OP::op(vector_of_handle(a1), int_of_handle(a2));
1845 else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
1846 }
1847 }
1848
always_copy_bignum(std::uint64_t * p)1849 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
1850 { std::size_t n = number_size(p);
1851 push(p);
1852 std::uint64_t *r = reserve(n);
1853 pop(p);
1854 std::memcpy(r, p, n*sizeof(std::uint64_t));
1855 return confirm_size(r, n, n);
1856 }
1857
copy_if_no_garbage_collector(std::uint64_t * p)1858 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
1859 { std::size_t n = number_size(p);
1860 push(p);
1861 std::uint64_t *r = reserve(n);
1862 pop(p);
1863 std::memcpy(r, p, n*sizeof(std::uint64_t));
1864 return confirm_size(r, n, n);
1865 }
1866
copy_if_no_garbage_collector(std::intptr_t pp)1867 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
1868 { if (stored_as_fixnum(pp)) return pp;
1869 std::uint64_t *p = vector_of_handle(pp);
1870 std::size_t n = number_size(p);
1871 push(p);
1872 std::uint64_t *r = reserve(n);
1873 pop(p);
1874 std::memcpy(r, p, n*sizeof(std::uint64_t));
1875 return confirm_size(r, n, n);
1876 }
1877
1878
1879 #elif defined ZAPPA // and end of CSL
1880
1881 // The code here can only make sense in the context of the Zappa sources,
1882 // and it is assumed that all the relevant Zappa header files have already
1883 // been #included.
1884
reserve(std::size_t n)1885 inline std::uint64_t *reserve(std::size_t n)
1886 { arithlib_assert(n>0);
1887 std::uint64_t* a = binaryAllocate(n+1);
1888 *a = n; // Record length of object in first word.
1889 return a+1;
1890 }
1891
// Finish off an allocation: of the n digits reserved only final are
// used. A one-word value in fixnum range becomes an immediate handle,
// otherwise the header is rewritten with the final length.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{   arithlib_assert(final>0 && n>=final);
    if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
    {   std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
        return r;
    }
// The length of a bignum is stored as 2 times the number of words
// (not including the header word). This leaves valid headers even, and so
// odd values can be used for forwarding addresses.
    p[-1] = 2*final;
    return vector_to_handle(p);
}

// Variant used where the vector may not be the latest allocation
// (presumably - TODO confirm); here it just delegates.
inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
                                    std::size_t final)
{   arithlib_assert(final>0 && n>=final);
    return confirm_size(p, n, final);
}
1911
// A handle tags the address of the header word with BigInteger; the
// digits start 8 bytes beyond that header.
inline std::intptr_t vector_to_handle(std::uint64_t *p)
{   return reinterpret_cast<std::intptr_t>(p) - 8 + BigInteger;
}

inline std::uint64_t *vector_of_handle(std::intptr_t n)
{   return reinterpret_cast<std::uint64_t *>(n + 8 - BigInteger);
}

// The header stores twice the digit count (see confirm_size above), so
// halve it to recover the number of 64-bit words in use.
inline std::size_t number_size(std::uint64_t *p)
{   std::uint64_t h = p[-1]/2;
    arithlib_assert(h>0);
    return h;
}
1925
// A handle is an immediate fixnum if its tag bits match SmallInteger.
inline bool stored_as_fixnum(std::intptr_t a)
{   return (a & tagXMask) == SmallInteger;
}

// Mask away the tag bits and divide out the value multiplier to
// recover the signed integer from a fixnum handle.
constexpr inline std::int64_t int_of_handle(std::intptr_t a)
{   return static_cast<std::int64_t>(
        (a & ~static_cast<std::int64_t>(tagXMask)) /
        static_cast<std::int64_t>(valXMult));
}

// Scale the value and add the SmallInteger tag. Only to be called when
// n is known to be within fixnum range.
inline std::intptr_t int_to_handle(std::int64_t n)
{   return static_cast<std::intptr_t>(n*valXMult + SmallInteger);
}

// Extremes of the fixnum range, derived from the extreme intptr_t
// values (int_of_handle discards the tag bits, so this is safe).
INLINE_VAR const std::int64_t MIN_FIXNUM = int_of_handle(INTPTR_MIN);
INLINE_VAR const std::int64_t MAX_FIXNUM = int_of_handle(INTPTR_MAX);

inline bool fits_into_fixnum(std::int64_t a)
{   return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
}
1946
// Zappa is garbage collected, so abandoning a number needs no action.
inline void abandon(std::uint64_t *p)
{   // No need to do anything!
}

inline void abandon(std::intptr_t p)
{   // Also do not do anything!
}

// Allocate space for an n-character string: an 8-byte header plus the
// characters, rounded up to whole 64-bit words.
// NOTE(review): (8+n+7)/8 words leaves room for a NUL terminator only
// for lengths somewhat below n; confirm against the units that
// binaryAllocate expects and the p[final] write below.
inline char *reserve_string(std::size_t n)
{   std::uint64_t* a = binaryAllocate((8+n+7)/8);
    return reinterpret_cast<char *>(a) + 8;
}

// A string size is measured in bytes not words.
inline char* confirm_size_string(char *p, std::size_t n,
                                 std::size_t final)
{   p[final] = 0;
// Store twice the byte length, matching the "valid headers are even"
// convention used for bignum headers here.
    reinterpret_cast<std::uint64_t*>(p)[-1] = 2*final;
    return p;
}

inline void abandon_string(string_handle s)
{   // Do nothing - the garbage collector reclaims strings.
}
1971
1972 template <class OP,class RES>
op_dispatch1(std::intptr_t a1)1973 inline RES op_dispatch1(std::intptr_t a1)
1974 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
1975 else return OP::op(vector_of_handle(a1));
1976 }
1977
1978 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::int64_t n)1979 inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
1980 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1981 else return OP::op(vector_of_handle(a1), n);
1982 }
1983
1984 template <class OP,class RES>
op_dispatch1(std::intptr_t a1,std::uint64_t * n)1985 inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
1986 { if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
1987 else return OP::op(vector_of_handle(a1), n);
1988 }
1989
1990 template <class OP,class RES>
op_dispatch2(std::intptr_t a1,std::intptr_t a2)1991 inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
1992 { if (stored_as_fixnum(a1))
1993 { if (stored_as_fixnum(a2))
1994 return OP::op(int_of_handle(a1), int_of_handle(a2));
1995 else return OP::op(int_of_handle(a1), vector_of_handle(a2));
1996 }
1997 else
1998 { if (stored_as_fixnum(a2))
1999 return OP::op(vector_of_handle(a1), int_of_handle(a2));
2000 else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
2001 }
2002 }
2003
always_copy_bignum(std::uint64_t * p)2004 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
2005 { std::size_t n = number_size(p);
2006 push(p);
2007 std::uint64_t *r = reserve(n);
2008 pop(p);
2009 std::memcpy(r, p, n*sizeof(std::uint64_t));
2010 return confirm_size(r, n, n);
2011 }
2012
copy_if_no_garbage_collector(std::uint64_t * p)2013 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
2014 { std::size_t n = number_size(p);
2015 push(p);
2016 std::uint64_t *r = reserve(n);
2017 pop(p);
2018 std::memcpy(r, p, n*sizeof(std::uint64_t));
2019 return confirm_size(r, n, n);
2020 }
2021
copy_if_no_garbage_collector(std::intptr_t pp)2022 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
2023 { if (stored_as_fixnum(pp)) return pp;
2024 std::uint64_t *p = vector_of_handle(pp);
2025 std::size_t n = number_size(p);
2026 push(p);
2027 std::uint64_t *r = reserve(n);
2028 pop(p);
2029 std::memcpy(r, p, n*sizeof(std::uint64_t));
2030 return confirm_size(r, n, n);
2031 }
2032
2033
2034
2035
2036 #else // ZAPPA
2037
// Allocate a VSL atom big enough for n 64-bit digits, returning a
// pointer to the first digit position (8 bytes past the header).
inline std::uint64_t *reserve(std::size_t n)
{   arithlib_assert(n>0);
// I must allow for alignment padding on 32-bit platforms.
    if (sizeof(LispObject)==4) n = n*sizeof(std::uint64_t) + 4;
    else n = n*sizeof(std::uint64_t);
    LispObject a = allocateatom(n);
    return reinterpret_cast<std::uint64_t *>(a + 8 - tagATOM);
}
2046
// Finalise a bignum allocated with reserve(): n digits were reserved,
// final digits are actually used. A one-digit result that fits in the
// fixnum range is returned as an immediate handle; otherwise the Lisp
// header word is filled in and a tagged handle to the vector returned.
inline std::intptr_t confirm_size(std::uint64_t *p, std::size_t n,
                                  std::size_t final)
{ arithlib_assert(final>0 && n>=final);
  if (final == 1 && fits_into_fixnum(static_cast<std::int64_t>(p[0])))
  { std::intptr_t r = int_to_handle(static_cast<std::int64_t>(p[0]));
    return r;
  }
// Write the header word that lives immediately before the digits.
  ((LispObject *)&p[-1])[0] =
      tagHDR + typeBIGNUM + packlength(final*sizeof(std::uint64_t) +
                                       (sizeof(LispObject)==4 ? 4 : 0));
// If I am on a 32-bit system the data for a bignum is 8 bit aligned and
// that leaves a 4-byte gap after the header. In such a case I will write
// in a zero just to keep memory tidy.
  if (sizeof(LispObject) == 4)
      ((LispObject *)&p[-1])[1] = 0;
// Here I should reset fringe down by (final-n) perhaps. Think about that
// later!
  return vector_to_handle(p);
}
2066
// As confirm_size(), but for the case where the result did not arise at
// the active allocation fringe (the "_x" variant used when two numbers
// were reserved and the earlier one is being confirmed).
inline std::intptr_t confirm_size_x(std::uint64_t *p, std::size_t n,
                                    std::size_t final)
{ arithlib_assert(final>0 && n>=final);
// Here I might need to write a nice dummy object into the gap left by
// shrinking the object.
  return confirm_size(p, n, final);
}
2074
vector_to_handle(std::uint64_t * p)2075 inline std::intptr_t vector_to_handle(std::uint64_t *p)
2076 { return reinterpret_cast<std::intptr_t>(p) - 8 + tagATOM;
2077 }
2078
vector_of_handle(std::intptr_t n)2079 inline std::uint64_t *vector_of_handle(std::intptr_t n)
2080 { return reinterpret_cast<std::uint64_t *>(n + 8 - tagATOM);
2081 }
2082
number_size(std::uint64_t * p)2083 inline std::size_t number_size(std::uint64_t *p)
2084 {
2085 // The odd looking cast here is because in arithlib I am passing around
2086 // arrays of explicitly 64-bit values, however in the underlying Lisp
2087 // I expect to be modelling memory as made up of intptr-sized items
2088 // that I arrange to have aligned on 8-byte boundaries. So to show some
2089 // though about strict aliasing and the like I will access memory as
2090 // an array of LispObject things when I access the header of an item.
2091 std::uintptr_t h = reinterpret_cast<std::uintptr_t>()*
2092 (LispObject *)&p[-1];
2093 std::size_t r = veclength(h);
2094 // On 32-bit systems a bignum will have a wasted 32-bit word after the
2095 // header and before the digits, so that the digits are properly aligned
2096 // in memory. The result will be that the bignum is laid out as follows
2097 // | hdr64 | digit64 | digit64 | ... | (64-bit world)
2098 // | hdr32 | gap32 | digit64 | digit64 | ... | (32-bit world)
2099 // The length value packed into the header is the length of the vector
2100 // excluding its header.
2101 // if (sizeof(LispObject) == 4) r -= 4; { the remaindering does all I need! }
2102 r = r/sizeof(std::uint64_t);
2103 arithlib_assert(r>0);
2104 return r;
2105 }
2106
// Test whether a handle denotes an immediate fixnum rather than a
// pointer to a heap-allocated digit vector.
inline bool stored_as_fixnum(std::intptr_t a)
{ return isFIXNUM(a);
}

// Extract the signed integer value packed into a fixnum handle.
constexpr inline std::int64_t int_of_handle(std::intptr_t a)
{ return qfixnum(a);
}

// Pack a small integer as an immediate fixnum handle. Callers must have
// checked fits_into_fixnum() first.
inline std::intptr_t int_to_handle(std::int64_t n)
{ return packfixnum(n);
}

// Report whether a 64-bit value lies within the fixnum range of the
// host Lisp system.
inline bool fits_into_fixnum(std::int64_t a)
{ return a>=MIN_FIXNUM && a<=MAX_FIXNUM;
}

// Discard a digit vector. Storage is under the control of the Lisp
// garbage collector here, so nothing needs doing.
inline void abandon(std::uint64_t *p)
{ // No need to do anything! But MIGHT reset fringe pointer?
}
2126
abandon(std::intptr_t p)2127 inline void abandon(std::intptr_t p)
2128 { if (!stored_as_fixnum(p) && p!=0)
2129 { std::uint64_t *pp = vector_of_handle(p);
2130 abandon(pp);
2131 }
2132 }
2133
// Allocate space for a string of n characters, returning a pointer to
// the character data, which lives just beyond the Lisp header word.
inline char *reserve_string(std::size_t n)
{ LispObject a = allocateatom(n);
  return reinterpret_cast<char *>(a - tagATOM + sizeof(LispObject));
}

// Finalise a string allocated with reserve_string(): n characters were
// reserved, final are actually used. Returns the tagged Lisp object.
inline LispObject confirm_size_string(char *p, std::size_t n,
                                      std::size_t final)
{ LispObject *a = (LispObject *)(p - sizeof(LispObject));
// On 32-bit platforms I do not have a padder word before string data
// so things are simpler here than when confirming the size of a number.
  *a = tagHDR + typeSTRING + packlength(final);
  return (LispObject)a +tagATOM;
}

// Strings are reclaimed by the garbage collector, so this is a no-op.
inline void abandon_string(string_handle s)
{ // Do nothing.
}
2151
// Dispatch a one-argument operation: unpack the handle a1 and invoke
// the fixnum or bignum overload of OP::op as appropriate.
template <class OP,class RES>
inline RES op_dispatch1(std::intptr_t a1)
{ if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1));
  else return OP::op(vector_of_handle(a1));
}

// As above but with an extra plain 64-bit integer argument (used for
// instance by the shift operations).
template <class OP,class RES>
inline RES op_dispatch1(std::intptr_t a1, std::int64_t n)
{ if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
  else return OP::op(vector_of_handle(a1), n);
}

// As above but the extra argument is a raw digit vector.
template <class OP,class RES>
inline RES op_dispatch1(std::intptr_t a1, std::uint64_t *n)
{ if (stored_as_fixnum(a1)) return OP::op(int_of_handle(a1), n);
  else return OP::op(vector_of_handle(a1), n);
}
2169
2170 template <class OP,class RES>
op_dispatch2(std::intptr_t a1,std::intptr_t a2)2171 inline RES op_dispatch2(std::intptr_t a1, std::intptr_t a2)
2172 { if (stored_as_fixnum(a1))
2173 { if (stored_as_fixnum(a2))
2174 return OP::op(int_of_handle(a1), int_of_handle(a2));
2175 else return OP::op(int_of_handle(a1), vector_of_handle(a2));
2176 }
2177 else
2178 { if (stored_as_fixnum(a2))
2179 return OP::op(vector_of_handle(a1), int_of_handle(a2));
2180 else return OP::op(vector_of_handle(a1), vector_of_handle(a2));
2181 }
2182 }
2183
always_copy_bignum(std::uint64_t * p)2184 inline std::intptr_t always_copy_bignum(std::uint64_t *p)
2185 { std::size_t n = number_size(p);
2186 push(p);
2187 std::uint64_t *r = reserve(n);
2188 pop(p);
2189 std::memcpy(r, p, n*sizeof(std::uint64_t));
2190 return confirm_size(r, n, n);
2191 }
2192
copy_if_no_garbage_collector(std::uint64_t * p)2193 inline std::intptr_t copy_if_no_garbage_collector(std::uint64_t *p)
2194 { std::size_t n = number_size(p);
2195 push(p);
2196 std::uint64_t *r = reserve(n);
2197 pop(p);
2198 std::memcpy(r, p, n*sizeof(std::uint64_t));
2199 return confirm_size(r, n, n);
2200 }
2201
copy_if_no_garbage_collector(std::intptr_t pp)2202 inline std::intptr_t copy_if_no_garbage_collector(std::intptr_t pp)
2203 { if (stored_as_fixnum(pp)) return pp;
2204 std::uint64_t *p = vector_of_handle(pp);
2205 std::size_t n = number_size(p);
2206 push(p);
2207 std::uint64_t *r = reserve(n);
2208 pop(p);
2209 std::memcpy(r, p, n*sizeof(std::uint64_t));
2210 return confirm_size(r, n, n);
2211 }
2212
2213 #endif // !CSL
2214
2215 #else // none if MALLOC, LISP or NEW specified.
2216 #error Unspecified memory model
2217 #endif
2218
2219 // The main arithmetic operations are supported by code that can work on
2220 // Bignums stored as vectors of digits or on Fixnums represented as (tagged)
2221 // immediate data, or as mixtures. For each operation there is a class, and
2222 // methods called "op" within it deal with the various cases via overloading.
2223
// Addition. Results are returned as a handle: a fixnum when the value
// fits, otherwise a freshly allocated bignum.
class Plus
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Add a small (64-bit) integer to a number given by its handle.
inline std::intptr_t bigplus_small(std::intptr_t, std::int64_t);
2234
// Subtraction: first argument minus second.
class Difference
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Subtraction with the operands taken in the reverse order.
class RevDifference
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Multiplication.
class Times
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};
2261
// Integer quotient.
class Quotient
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Integer remainder.
class Remainder
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Division. The three-argument overloads return the quotient and pass
// the remainder back through the final reference parameter.
class Divide
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
    static std::intptr_t op(std::int64_t, std::int64_t, std::intptr_t &);
    static std::intptr_t op(std::int64_t, std::uint64_t *,
                            std::intptr_t &);
    static std::intptr_t op(std::uint64_t *, std::int64_t,
                            std::intptr_t &);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *,
                            std::intptr_t &);
};
2295
// Greatest common divisor.
class Gcd
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Least common multiple.
class Lcm
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise AND (on the twos-complement representation).
class Logand
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise inclusive OR.
class Logor
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise exclusive OR.
class Logxor
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Bitwise equivalence (the complement of XOR).
class Logeqv
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};
2349
// Simple predicates on a number. The pointer overloads previously used
// unqualified uint64_t, inconsistent with the rest of the file; only
// the std:: names are guaranteed by <cstdint>, so they are qualified
// here (same type, so definitions elsewhere still match).

// Test for zero.
class Zerop
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Test for one.
class Onep
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Test for a negative value.
class Minusp
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Test for an even value.
class Evenp
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};

// Test for an odd value.
class Oddp
{
public:
    static bool op(std::int64_t w);
    static bool op(std::uint64_t *w);
};
2384
// Numeric equality, including mixed integer/floating comparisons.
class Eqn
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
// Even comparing a floating point number with an integer for equality
// turns out to be messier than one might have hoped!
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Numeric inequality (the logical negation of Eqn).
class Neqn
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
// Even comparing a floating point number with an integer for equality
// turns out to be messier than one might have hoped!
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};
2434
// Ordered comparison: greater-than-or-equal.
class Geq
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
// Comparing a bignum against a floating point value has multiple cases
// to consider, but needs special implementation so that neither rounding
// nor overflow nor Infinities/NaNs lead to incorrect results.
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Ordered comparison: strictly greater-than.
class Greaterp
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Ordered comparison: less-than-or-equal.
class Leq
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};

// Ordered comparison: strictly less-than.
class Lessp
{
public:
    static bool op(std::int64_t, std::int64_t);
    static bool op(std::int64_t, std::uint64_t *);
    static bool op(std::uint64_t *, std::int64_t);
    static bool op(std::uint64_t *, std::uint64_t *);
    static bool op(std::int64_t, float);
    static bool op(std::uint64_t *, float);
    static bool op(float, std::int64_t);
    static bool op(float, std::uint64_t *);
    static bool op(std::int64_t, double);
    static bool op(std::uint64_t *, double);
    static bool op(double, std::int64_t);
    static bool op(double, std::uint64_t *);
#ifdef softfloat_h
    static bool op(std::int64_t, float128_t);
    static bool op(std::uint64_t *, float128_t);
    static bool op(float128_t, std::int64_t);
    static bool op(float128_t, std::uint64_t *);
#endif // softfloat_h
};
2529
// One-argument operations. The pointer overloads previously used
// unqualified uint64_t; only the std:: names are guaranteed by
// <cstdint>, so they are qualified here for consistency (same type,
// so out-of-line definitions still match).

// Increment by one.
class Add1
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Decrement by one.
class Sub1
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Negation.
class Minus
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Absolute value.
class Abs
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Squaring.
class Square
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Integer square root.
class Isqrt
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Bitwise complement.
class Lognot
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};
2578
// Exponentiation. The double overloads support raising to a floating
// point power.
class Pow
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
    static double op(std::int64_t, double);
    static double op(std::uint64_t *, double);
};

// Arithmetic left shift.
class LeftShift
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Arithmetic right shift.
class RightShift
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};
2607
// Queries and narrowing conversions. The pointer overloads previously
// used unqualified uint64_t; only the std:: names are guaranteed by
// <cstdint>, so they are qualified here for consistency (same type,
// so out-of-line definitions still match).

// Number of significant bits in the value.
class IntegerLength
{
public:
    static std::size_t op(std::int64_t w);
    static std::size_t op(std::uint64_t *w);
};

// Position of the lowest set bit.
class Low_bit
{
public:
    static std::size_t op(std::int64_t w);
    static std::size_t op(std::uint64_t *w);
};

// Test an individual bit of the (twos-complement) value.
class Logbitp
{
public:
    static bool op(std::int64_t, std::size_t);
    static bool op(std::uint64_t *, std::size_t);
};

// Count of set bits.
class Logcount
{
public:
    static std::size_t op(std::int64_t w);
    static std::size_t op(std::uint64_t *w);
};

// Narrow to a native signed 64-bit integer.
class Int64_t
{
public:
    static std::int64_t op(std::int64_t w);
    static std::int64_t op(std::uint64_t *w);
};

// Narrow to a native unsigned 64-bit integer.
class Uint64_t
{
public:
    static std::uint64_t op(std::int64_t w);
    static std::uint64_t op(std::uint64_t *w);
};
2649
// Conversions to floating point formats. The pointer overloads
// previously used unqualified uint64_t; only the std:: names are
// guaranteed by <cstdint>, so they are qualified here for consistency
// (same type, so out-of-line definitions still match).

// Convert to single precision.
class Float
{
public:
    static float op(std::int64_t w);
    static float op(std::uint64_t *w);
};

// Convert to double precision.
class Double
{
public:
    static double op(std::int64_t w);
    static double op(std::uint64_t *w);
};

// As for the C library frexp(): return a fraction and pass the binary
// exponent back through the reference parameter.
class Frexp
{
public:
    static double op(std::int64_t, std::int64_t &x);
    static double op(std::uint64_t *, std::int64_t &x);
};

#ifdef softfloat_h

// Convert to IEEE 128-bit format (via the softfloat library).
class Float128
{
public:
    static float128_t op(std::int64_t w);
    static float128_t op(std::uint64_t *w);
};

// frexp() for the 128-bit format.
class Frexp128
{
public:
    static float128_t op(std::int64_t, std::int64_t &x);
    static float128_t op(std::uint64_t *, std::int64_t &x);
};

#endif // softfloat_h
2688
#ifdef CSL

// Modular arithmetic with respect to a (CSL-managed) current modulus.
// The pointer overloads previously used unqualified uint64_t; only the
// std:: names are guaranteed by <cstdint>, so they are qualified here
// for consistency (same type, so out-of-line definitions still match).

class ModularPlus
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

class ModularDifference
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

class ModularTimes
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

class ModularExpt
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

class ModularQuotient
{
public:
    static std::intptr_t op(std::int64_t, std::int64_t);
    static std::intptr_t op(std::int64_t, std::uint64_t *);
    static std::intptr_t op(std::uint64_t *, std::int64_t);
    static std::intptr_t op(std::uint64_t *, std::uint64_t *);
};

// Modular negation.
class ModularMinus
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Modular multiplicative inverse.
class ModularReciprocal
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Reduce a number into the range of the current modulus.
class ModularNumber
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

// Establish the current modulus.
class SetModulus
{
public:
    static std::intptr_t op(std::int64_t w);
    static std::intptr_t op(std::uint64_t *w);
};

#endif // CSL
2765
// Render a number (given by handle) as a string in decimal, hex, octal
// or binary.
inline string_handle bignum_to_string(std::intptr_t aa);
inline string_handle bignum_to_string_hex(std::intptr_t aa);
inline string_handle bignum_to_string_octal(std::intptr_t aa);
inline string_handle bignum_to_string_binary(std::intptr_t aa);

class Bignum;

// Debug helpers: print a labelled view of a raw digit vector, a handle,
// or a Bignum object.
inline void display(const char *label, const std::uint64_t *a,
                    std::size_t lena);
inline void display(const char *label, std::intptr_t a);
inline void display(const char *label, const Bignum &a);
2777
2778
2779 //=========================================================================
2780 //=========================================================================
2781 // I have a class Bignum that wraps up the representation of a number
2782 // and then allows me to overload most operators so that big numbers can be
// used in C++ code almost as if they were a natural proper type. The main
2784 // big oddity will be that to denote a Bignum literal it will be necessary
2785 // to use a constructor, with obvious constructors accepting integers of up
2786 // to 64-bits and a perhaps less obvious one taking a string that is the
2787 // decimal denotation of the integer concerned.
2788 //=========================================================================
2789 //=========================================================================
2790
2791 class Bignum
2792 {
2793 public:
    // a Bignum only has one data field, and that is simple plain data.
2795 std::intptr_t val;
2796
2797
    // A default constructor builds a Bignum with no stored data.
    Bignum()
    { val = 0;
    }
// In the next constructor the boolean argument is not used at run time but
// serves to indicate which constructor is wanted.
    Bignum(bool set_val, std::intptr_t v)
    { val = v;      // adopt an existing handle verbatim, no copy made
    }
    ~Bignum()
    { abandon(val); // release any heap storage the handle refers to
      val = 0;
    }
    // Wrap an existing raw digit vector as a Bignum.
    Bignum(std::uint64_t *p)
    { val = vector_to_handle(p);
    }
2814 // The code here is more complicated than I would have liked. What I want is
2815 // that ANY sort of C++ integer can be converted to a Bignum. My first
2816 // attempts arranges that int32_t and int64_t could be, however there is
2817 // no guarantee that just coping with all the width-specified cases will
2818 // then cover mere "int" and "long". So what I now use is a template
2819 // definition filtered with magic that constrains it to only matching the
2820 // template parameter against some sort of integer. In some cases (such as
2821 // here) I provide one version to deal with all the signed integer cases
2822 // and another all the unsigned ones.
2823 // In general I will receive integers this way and cast them to 64-bit
2824 // values. This means that if the platform happens to provide intmax_t
2825 // that is wider than that then it will not be handled well!
    // Construct from any signed integral type: the value is widened to
    // 64 bits and converted (see the commentary above).
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
              typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
    Bignum(T n)
    { val = int_to_bignum(static_cast<std::int64_t>(n));
    }
    // ... and from any unsigned integral type.
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
              typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
    Bignum(T n)
    { val = unsigned_int_to_bignum(static_cast<std::uint64_t>(n));
    }
    // Floating point values are converted via round_double_to_int.
    Bignum(float d)
    { val = round_double_to_int(static_cast<double>(d));
    }
    Bignum(double d)
    { val = round_double_to_int(d);
    }
#ifdef softfloat_h
    Bignum(float128_t d)
    { val = round_float128_to_int(d);
    }
#endif // softfloat_h
    // Construct from the decimal denotation of an integer.
    Bignum(const char *s)
    { val = string_to_bignum(s);
    }
    // Copy constructor: with no garbage collector present a genuine
    // copy of any heap data must be made.
    Bignum(const Bignum &a)
    { val = copy_if_no_garbage_collector(a.val);
    }
    // Conversion back to any signed integral type, via the Int64_t
    // operation and a narrowing cast.
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
              typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
    operator T()
    { return static_cast<T>(op_dispatch1<Int64_t, std::int64_t>(val));
    }
    // ... and to any unsigned integral type via Uint64_t.
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
              typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
    operator T()
    { return static_cast<T>(op_dispatch1<Uint64_t, std::uint64_t>(val));
    }
    // Conversion to double precision floating point.
    operator double()
    { return op_dispatch1<Double, double>(val);
    }
    // Raw access to the digit vector (only meaningful when val is not
    // an immediate fixnum).
    std::uint64_t *vec() const
    { return vector_of_handle(val);
    }
2873
2874 // In a way that is BAD I make the result of an assignment void rather than
2875 // the value that is assigned. This is so I do not make gratuitous extra
2876 // copies of it in the common case where the value is not used, but it could
2877 // catch out the unwary.
    // Assignment from another Bignum. Note the void result (see the
    // comment above): the previous value is abandoned first.
    inline void operator = (const Bignum &x)
    { if (this == &x) return; // assign to self - a silly case!
      abandon(val);
// See comment in the copy constructor.
      val = copy_if_no_garbage_collector(x.val);
    }

    // Assignment from any signed integral type.
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
              typename std::enable_if<std::is_signed<T>::value>::type* = nullptr>
    inline void operator = (const T x)
    { abandon(val);
      val = int_to_bignum(static_cast<std::int64_t>(x));
    }

    // Assignment from any unsigned integral type.
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr,
              typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
    inline void operator = (const T x)
    { abandon(val);
      val = unsigned_int_to_bignum(static_cast<std::uint64_t>(x));
    }

    // Assignment from a decimal string.
    inline void operator = (const char *x)
    { abandon(val);
      val = string_to_bignum(x);
    }
2905
    // Arithmetic operators. Each delegates to the corresponding
    // operation class via op_dispatch2; the T overloads accept any
    // integral operand by first wrapping it as a (temporary) Bignum.
    inline Bignum operator +(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Plus,std::intptr_t>(val, x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator +(const T x) const
    { return Bignum(true, op_dispatch2<Plus,std::intptr_t>(val,
                    Bignum(x).val));
    }

    // Subtraction.
    inline Bignum operator -(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Difference,std::intptr_t>(val,
                    x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator -(const T x) const
    { return Bignum(true, op_dispatch2<Difference,std::intptr_t>(val,
                    Bignum(x).val));
    }

    // Multiplication.
    inline Bignum operator *(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Times,std::intptr_t>(val,
                    x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator *(const T x) const
    { return Bignum(true, op_dispatch2<Times,std::intptr_t>(val,
                    Bignum(x).val));
    }

    // Integer quotient.
    inline Bignum operator /(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Quotient,std::intptr_t>(val,
                    x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator /(const T x) const
    { return Bignum(true, op_dispatch2<Quotient,std::intptr_t>(val,
                    Bignum(x).val));
    }

    // Remainder.
    inline Bignum operator %(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Remainder,std::intptr_t>(val,
                    x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator %(const T x) const
    { return Bignum(true, op_dispatch2<Remainder,std::intptr_t>(val,
                    Bignum(x).val));
    }
2964
    // Unary negation.
    inline Bignum operator -() const
    { return Bignum(true, op_dispatch1<Minus,std::intptr_t>(val));
    }

    // Bitwise AND.
    inline Bignum operator &(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Logand,std::intptr_t>(val,
                    x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator &(const T x) const
    { return Bignum(true, op_dispatch2<Logand,std::intptr_t>(val,
                    Bignum(x).val));
    }

    // Bitwise inclusive OR.
    inline Bignum operator |(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Logor,std::intptr_t>(val,
                    x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator |(const T x) const
    { return Bignum(true, op_dispatch2<Logor,std::intptr_t>(val,
                    Bignum(x).val));
    }

    // Bitwise exclusive OR.
    inline Bignum operator ^(const Bignum &x) const
    { return Bignum(true, op_dispatch2<Logxor,std::intptr_t>(val,
                    x.val));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator ^(const T x) const
    { return Bignum(true, op_dispatch2<Logxor,std::intptr_t>(val,
                    Bignum(x).val));
    }
3004
    // ----- shifts and bit-wise complement -----
    // The shift count may be any native integral type, or a Bignum
    // (narrowed through its std::int64_t conversion).
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator <<(T n) const
    {   return Bignum(true, op_dispatch1<LeftShift,std::intptr_t>(val,
                          static_cast<std::int64_t>(n)));
    }

    // NOTE(review): this relies on Bignum being explicitly convertible
    // to std::int64_t; a count that does not fit 64 bits would be
    // narrowed by that conversion - confirm against the conversion
    // operator defined elsewhere in the class.
    inline Bignum operator <<(Bignum n) const
    {   return Bignum(true, op_dispatch1<LeftShift,std::intptr_t>(val,
                          static_cast<std::int64_t>(n)));
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline Bignum operator >>(T n) const
    {   return Bignum(true, op_dispatch1<RightShift,std::intptr_t>(val,
                          static_cast<std::int64_t>(n)));
    }

    inline Bignum operator >>(Bignum n) const
    {   return Bignum(true, op_dispatch1<RightShift,std::intptr_t>(val,
                          static_cast<std::int64_t>(n)));
    }

    // Bit-wise NOT of a 2s complement value, so ~x == -x-1.
    inline Bignum operator ~() const
    {   return Bignum(true, op_dispatch1<Lognot,std::intptr_t>(val));
    }
3032
    // ----- comparisons -----
    // op_dispatch2<Op,bool> yields the predicate value directly; the
    // primitives are Eqn, Greaterp, Geq, Lessp and Leq, with !=
    // expressed as the negation of Eqn. Each again has a variant that
    // accepts any native integral right-hand operand.
    inline bool operator ==(const Bignum &x) const
    {   return op_dispatch2<Eqn,bool>(val, x.val);
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline bool operator ==(const T x) const
    {   return op_dispatch2<Eqn,bool>(val, Bignum(x).val);
    }

    inline bool operator !=(const Bignum &x) const
    {   return !op_dispatch2<Eqn,bool>(val, x.val);
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline bool operator !=(const T x) const
    {   return !op_dispatch2<Eqn,bool>(val, Bignum(x).val);
    }

    inline bool operator >(const Bignum &x) const
    {   return op_dispatch2<Greaterp,bool>(val, x.val);
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline bool operator >(const T x) const
    {   return op_dispatch2<Greaterp,bool>(val, Bignum(x).val);
    }

    inline bool operator >=(const Bignum &x) const
    {   return op_dispatch2<Geq,bool>(val, x.val);
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline bool operator >=(const T x) const
    {   return op_dispatch2<Geq,bool>(val, Bignum(x).val);
    }

    inline bool operator <(const Bignum &x) const
    {   return op_dispatch2<Lessp,bool>(val, x.val);
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline bool operator <(const T x) const
    {   return op_dispatch2<Lessp,bool>(val, Bignum(x).val);
    }

    inline bool operator <=(const Bignum &x) const
    {   return op_dispatch2<Leq,bool>(val, x.val);
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline bool operator <=(const T x) const
    {   return op_dispatch2<Leq,bool>(val, Bignum(x).val);
    }
3086
    // ----- compound arithmetic assignment -----
    // Each form computes the replacement handle first, then releases
    // the old one with abandon() and only then stores the new value,
    // so the left operand remains valid throughout the computation.
    // NOTE(review): these return void rather than the conventional
    // Bignum&, so expressions such as a = (b += c) will not compile.

    inline void operator +=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Plus,std::intptr_t>(val, x.val);
        abandon(val);
        val = r;
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator +=(T x)
    {   std::intptr_t r = op_dispatch2<Plus,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }

    inline void operator -=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Difference,std::intptr_t>(val,
                              x.val);
        abandon(val);
        val = r;
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator -=(T x)
    {   std::intptr_t r = op_dispatch2<Difference,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }

    inline void operator *=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Times,std::intptr_t>(val, x.val);
        abandon(val);
        val = r;
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator *=(T x)
    {   std::intptr_t r = op_dispatch2<Times,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }


    inline void operator /=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Quotient,std::intptr_t>(val,
                              x.val);
        abandon(val);
        val = r;
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator /=(T x)
    {   std::intptr_t r = op_dispatch2<Quotient,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }


    inline void operator %=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Remainder,std::intptr_t>(val,
                              x.val);
        abandon(val);
        val = r;
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator %=(T x)
    {   std::intptr_t r = op_dispatch2<Remainder,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }
3164
    // ----- compound bit-wise and shift assignment -----
    // Same release-then-replace pattern (and the same void return) as
    // the arithmetic compound assignments above.

    inline void operator &=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Logand,std::intptr_t>(val, x.val);
        abandon(val);
        val = r;
    }
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator &=(T x)
    {   std::intptr_t r = op_dispatch2<Logand,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }

    inline void operator |=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Logor,std::intptr_t>(val, x.val);
        abandon(val);
        val = r;
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator |=(T x)
    {   std::intptr_t r = op_dispatch2<Logor,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }

    inline void operator ^=(const Bignum &x)
    {   std::intptr_t r = op_dispatch2<Logxor,std::intptr_t>(val, x.val);
        abandon(val);
        val = r;
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator ^=(T x)
    {   std::intptr_t r = op_dispatch2<Logxor,std::intptr_t>(val,
                              Bignum(x).val);
        abandon(val);
        val = r;
    }

    // Shift counts accept any native integral type, narrowed to int64.
    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator <<=(T n)
    {   std::intptr_t r = op_dispatch1<LeftShift,std::intptr_t>(val,
                              static_cast<std::int64_t>(n));
        abandon(val);
        val = r;
    }

    template <typename T,
              typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
    inline void operator >>=(T n)
    {   std::intptr_t r = op_dispatch1<RightShift,std::intptr_t>(val,
                              static_cast<std::int64_t>(n));
        abandon(val);
        val = r;
    }
3226
    // Pre-increment/decrement, via the specialised add-small-integer
    // primitive bigplus_small. NOTE(review): these return Bignum by
    // value (a copy of *this) rather than the conventional Bignum&.
    inline Bignum operator ++()
    {   std::intptr_t r = bigplus_small(val, 1);
        abandon(val);
        val = r;
        return *this;
    }

    // Post-increment: the previous handle is transferred into a fresh
    // Bignum shell so that no copy of the digits is needed.
    inline Bignum operator ++(int)
    {   std::intptr_t r = bigplus_small(val, 1);
// I assign explicitly to oldval.val because trying to use a constructor
// of Bignum or assigning to one would do things more complicated than I want!
        Bignum oldval;
        oldval.val = val;
        val = r;
        return oldval;
    }

    inline Bignum operator --()
    {   std::intptr_t r = bigplus_small(val, -1);
        abandon(val);
        val = r;
        return *this;
    }

    // Post-decrement, same handle-transfer trick as post-increment.
    inline Bignum operator --(int)
    {   std::intptr_t r = bigplus_small(val, -1);
        Bignum oldval;
        oldval.val = val;
        val = r;
        return oldval;
    }
3258
    // Stream output. The base honours the stream's basefield flags
    // (hex/oct/dec) plus an extra "radix" facility for binary; the
    // fallback when no flag is set is decimal. Under the LISP build the
    // rendered text is a LispObject carrying an explicit length,
    // otherwise a plain C string; either way it is freshly allocated
    // and released with abandon_string once printed.
    friend std::ostream & operator << (std::ostream &out, const Bignum &a)
    {   std::ios_base::fmtflags fg = out.flags();
#if defined LISP && !defined ZAPPA
        LispObject s;
#else
        char *s;
#endif
        if ((static_cast<unsigned int>(fg) & std::ios_base::hex) != 0U)
            s = bignum_to_string_hex(a.val);
        else if ((static_cast<unsigned int>(fg) & std::ios_base::oct) != 0U)
            s = bignum_to_string_octal(a.val);
        else if ((static_cast<unsigned int>(fg) & std::ios_base::dec) != 0U)
            s = bignum_to_string(a.val);
        else if (radix::is_binary_output(out))
            s = bignum_to_string_binary(a.val);
        else s = bignum_to_string(a.val);
#if defined LISP && !defined ZAPPA
        std::string ss(s, length_of_byteheader(qheader(s)));
        out << ss;
#else
        out << s;
#endif
        abandon_string(s);   // the printed form was freshly allocated
        return out;
    }
    // Stream input. At present only a native signed 64-bit value is
    // parsed, so genuinely big numbers can not yet be read back in.
    friend std::istream & operator >> (std::istream &in, Bignum &a)
    {   std::int64_t n;
// What I really want to do is to read in a string of digits and then
// use string_to_bignum().
        in >> n;
        abandon(a.val);          // release whatever a previously held
        a.val = int_to_bignum(n);
        return in;
    }
3293 };
3294
// I use a suffix "_Z" for bignums, with Z chosen to remind me that this
// gives me an Integer, the "Z" (typically written in a blackboard font)
// standing for the ring of integers.

// User-defined literal, so that e.g. 123456789012345678901234567890_Z
// denotes a Bignum parsed from the digit string.
inline Bignum operator "" _Z(const char *s)
{   return Bignum(s);
}
3301
// Decimal rendering of a Bignum. The result is a freshly allocated
// string handle. NOTE(review): the stream-output operator releases such
// strings with abandon_string - presumably callers of to_string bear
// the same responsibility; confirm.
inline const string_handle to_string(Bignum x)
{   return bignum_to_string(x.val);
}
3305
// Wrappers returning Bignum results from the lower-level random number
// generators. The exact distributions are those of the underlying
// functions (uniform_positive etc., defined elsewhere in this file);
// n is a size in bits.

inline Bignum uniform_positive_bignum(std::size_t n)
{   return Bignum(true, uniform_positive(n));
}

inline Bignum uniform_signed_bignum(std::size_t n)
{   return Bignum(true, uniform_signed(n));
}

// Uniform value below the given bound a.
inline Bignum uniform_upto_bignum(Bignum a)
{   return Bignum(true, uniform_upto(a.val));
}

// Perturb a value for test-case generation (see fudge_distribution).
inline Bignum fudge_distribution_bignum(Bignum a, int n)
{   return Bignum(true, fudge_distribution(a.val, n));
}

inline Bignum random_upto_bits_bignum(std::size_t n)
{   return Bignum(true, random_upto_bits(n));
}
3325
// Unary mathematical helpers and predicates on Bignum values, each a
// thin wrapper over the corresponding dispatched operation.

// x*x via the dedicated squaring operation.
inline Bignum square(const Bignum &x)
{   return Bignum(true, op_dispatch1<Square,std::intptr_t>(x.val));
}

// Integer square root.
inline Bignum isqrt(const Bignum &x)
{   return Bignum(true, op_dispatch1<Isqrt,std::intptr_t>(x.val));
}

// Absolute value.
inline Bignum abs(const Bignum &x)
{   return Bignum(true, op_dispatch1<Abs,std::intptr_t>(x.val));
}

// Predicates: zero, one, negative, even, odd.
inline bool zerop(const Bignum &x)
{   return op_dispatch1<Zerop,bool>(x.val);
}

inline bool onep(const Bignum &x)
{   return op_dispatch1<Onep,bool>(x.val);
}

inline bool minusp(const Bignum &x)
{   return op_dispatch1<Minusp,bool>(x.val);
}

inline bool evenp(const Bignum &x)
{   return op_dispatch1<Evenp,bool>(x.val);
}

inline bool oddp(const Bignum &x)
{   return op_dispatch1<Oddp,bool>(x.val);
}
// Raise a Bignum to a native integral power. The exponents 0, 1 and 2
// are special-cased (constant one, a copy of x, and the squaring
// primitive respectively); anything else goes through the dispatched
// Pow operation. NOTE(review): a negative n falls through to the
// general case - its meaning depends on the underlying Pow code.
template <typename T,
          typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
inline Bignum pow(const Bignum &x, T n)
{   if (n == 0) return Bignum(true, int_to_bignum(1));
    else if (n == 1) return Bignum(true,
                                       copy_if_no_garbage_collector(x.val));
    else if (n == 2) return square(x);
    else return Bignum(true,
                           op_dispatch1<Pow,std::intptr_t>(x.val, static_cast<std::int64_t>(n)));
}
3367
// Forward declaration - double_bignum is defined a little further on.
inline double double_bignum(const Bignum &x);

// x to a floating point power, computed entirely in double precision.
inline double pow(const Bignum &x, double n)
{   return std::pow(double_bignum(x), n);
}
3373
// Greatest common divisor.
inline Bignum gcd(const Bignum &x, const Bignum &y)
{   return Bignum(true, op_dispatch2<Gcd,std::intptr_t>(x.val,
                      y.val));
}

// Least common multiple.
inline Bignum lcm(const Bignum &x, const Bignum &y)
{   return Bignum(true, op_dispatch2<Lcm,std::intptr_t>(x.val,
                      y.val));
}
3383
// Conversions from native floating point to Bignum with the usual
// rounding variants. Note that fix_bignum and trunc_bignum are
// synonyms: both call trunc_double_to_int, i.e. truncate towards zero.

inline Bignum fix_bignum(double d)
{   return Bignum(true, trunc_double_to_int(d));
}

// Round to nearest integer.
inline Bignum round_bignum(double d)
{   return Bignum(true, round_double_to_int(d));
}

inline Bignum trunc_bignum(double d)
{   return Bignum(true, trunc_double_to_int(d));
}

inline Bignum floor_bignum(double d)
{   return Bignum(true, floor_double_to_int(d));
}

inline Bignum ceil_bignum(double d)
{   return Bignum(true, ceiling_double_to_int(d));
}

// The float versions simply widen to double and reuse the above.

inline Bignum fix_bignum(float d)
{   return fix_bignum(static_cast<double>(d));
}

inline Bignum round_bignum(float d)
{   return round_bignum(static_cast<double>(d));
}

inline Bignum trunc_bignum(float d)
{   return trunc_bignum(static_cast<double>(d));
}

inline Bignum floor_bignum(float d)
{   return floor_bignum(static_cast<double>(d));
}

inline Bignum ceil_bignum(float d)
{   return ceil_bignum(static_cast<double>(d));
}
3423
// Convert a Bignum to native floating point. NOTE(review): float_bignum
// performs the conversion at float precision (op_dispatch1<Float,float>)
// but then returns the value widened to double; if a genuine float
// return was intended the return type should be float - confirm.
inline double float_bignum(const Bignum &x)
{   return op_dispatch1<Float,float>(x.val);
}

inline double double_bignum(const Bignum &x)
{   return op_dispatch1<Double,double>(x.val);
}
3431
// This will return a normalized double and an integer exponent.
// It can be better than using frexp(double_bignum(x), ..) because it
// avoids overflow for values too large to represent as a double.

inline double frexp_bignum(const Bignum &x, std::int64_t &xx)
{   return op_dispatch1<Frexp,double>(x.val, xx);   // exponent comes back in xx
}
3439
#ifdef softfloat_h

// 128-bit (SoftFloat float128_t) analogues of frexp and the float
// conversion, only compiled when the SoftFloat header has been seen.

inline float128_t frexp128_bignum(const Bignum &x, std::int64_t &xx)
{   return op_dispatch1<Frexp128,float128_t>(x.val, xx);
}

inline float128_t float128_bignum(const Bignum &x)
{   return op_dispatch1<Float128,float128_t>(x.val);
}

#endif // softfloat_h
3451
3452 //=========================================================================
3453 // display() will show the internal representation of a bignum as a
3454 // sequence of hex values. This is obviously useful while debugging!
3455 //=========================================================================
3456
// Show the raw digits of a bignum, most significant first, as a row of
// zero-padded 64-bit hex values: "label [len] 0x... 0x...". A new line
// is started after every third digit. The stream's fill character and
// base are restored to their defaults afterwards.
inline void display(const char *label, const std::uint64_t *a,
                    std::size_t lena)
{   std::cout << label << " [" << static_cast<int>(lena) << "]";
    std::size_t printed = 0;
    for (std::size_t j=lena; j-- != 0;)   // walk from the top digit down
    {   if (printed != 0 && printed%3 == 0) std::cout << std::endl << " ";
        std::cout << " "
                  << std::hex << std::setfill('0')
                  << "0x" << std::setw(16) << a[j]
                  << std::dec << std::setfill(' ');
        printed++;
    }
    std::cout << std::endl;
}
3469
3470 // "rdisplay" is for generating trace output for use with Reduce.
3471 // The format is name := 0xDDDDDDD$
3472 // which will be easy to copy and paste into Reduce.
3473
// Emit a bignum's digits in the form   label := 0xDDDD...$
// so the line can be pasted straight into Reduce. Digits appear most
// significant first, each as 16 zero-padded hex characters.
inline void rdisplay(const char *label, const std::uint64_t *a,
                     std::size_t lena)
{   std::cout << label << " := 0x";
    std::size_t j = lena;
    while (j != 0)
    {   j--;
        std::cout << std::hex << std::setfill('0')
                  << std::setw(16) << a[j]
                  << std::dec << std::setfill(' ');
    }
    std::cout << "$" << std::endl;
}
3484
3485 // I provide a function that accesses (b<<shift)[n]. Note that the
// valid index values n will run from 0 up to and including lenb.
3487
// Return digit n of the value (b<<shift), where b has lenb digits,
// 0 <= n <= lenb and 0 <= shift <= 63.
inline std::uint64_t shifted_digit(std::uint64_t *b, std::size_t lenb,
                                   int shift, std::size_t n)
{   // BUG FIX: a shift of zero must be handled separately. The general
    // code below evaluates b[...]>>(64-shift), and shifting a 64-bit
    // value by 64 is undefined behaviour in C++.
    if (shift == 0) return n == lenb ? 0 : b[n];
    if (n == 0) return b[0]<<shift;
    else if (n == lenb) return b[lenb-1]>>(64-shift);
    else return (b[n]<<shift) | (b[n-1]>>(64-shift));
}
3494
// As display() above, but shows the digits of (a<<shift). The loop runs
// from 0 up to lena inclusive because the shifted value can spill into
// one extra digit at the top.
inline void display(const char *label, std::uint64_t *a,
                    std::size_t lena,
                    int shift)
{   std::cout << label << " [" << static_cast<int>(lena) << "]";
    for (std::size_t i=0; i<=lena; i++)
    {   if (i!=0 && i%3==0) std::cout << std::endl << " ";
        std::cout << " "
                  << std::hex << std::setfill('0')
                  << "0x" << std::setw(16)
                  << shifted_digit(a, lena, shift, lena-i)
                  << std::dec << std::setfill(' ');
    }
    std::cout << std::endl;
}
3509
// Display a number given by its tagged handle: a fixnum is shown with
// its immediate value, a heap bignum digit by digit with the most
// significant digit first.
inline void display(const char *label, std::intptr_t a)
{   if (stored_as_fixnum(a))
    {   std::cout << label << " [fixnum] " << std::hex
                  << "0x" << a << std::dec << " = "
                  << int_of_handle(a) << std::endl;
        return;
    }
    std::uint64_t *d = vector_of_handle(a);
    std::size_t len = number_size(d);
    std::cout << label << " [" << static_cast<int>(len) << "]";
    for (std::size_t i=0; i<len; i++)
        std::cout << " "
                  << std::hex << std::setfill('0')
                  << "0x" << std::setw(16) << d[len-i-1]
                  << std::dec << std::setfill(' ');
    std::cout << std::endl;
}
3527
// Display the representation of a Bignum via its handle.
inline void display(const char *label, const Bignum &a)
{   display(label, a.val);
}
3531
3532
3533 //=========================================================================
3534 //=========================================================================
3535 // I will have a collection of low level functions that support the
3536 // fundamental operations needed for implementing big-number arithmetic:
3537 // add-with-carry, multiplication and division.
3538 //=========================================================================
3539 //=========================================================================
3540
#ifdef __GNUC__

// Note that __GNUC__ also gets defined by clang on the Macintosh, so
// this code is probably optimized there too. This must NEVER be called
// with a zero argument, since __builtin_clzll(0) is undefined.

// Count the leading zeros in a 64-bit word.

inline int nlz(std::uint64_t x)
{   return __builtin_clzll(x); // Must use the 64-bit version of clz.
}

// Count the set bits in a 64-bit word.

inline int popcount(std::uint64_t x)
{   return __builtin_popcountll(x);
}
3556
3557 #else // __GNUC__
3558
// Portable count of leading zero bits in a 64-bit word, by successive
// halving: at each step, if the value fits in the low (64-step) bits
// there are at least "step" leading zeros, so record them and shift
// the value up. Note that nlz(0) yields 63 here, unlike the GCC
// builtin variant for which a zero argument is undefined.
inline int nlz(std::uint64_t x)
{   int count = 0;
    for (int step = 32; step > 0; step >>= 1)
    {   if (x <= (~static_cast<std::uint64_t>(0) >> step))
        {   count += step;
            x <<= step;
        }
    }
    return count;
}
3586
// Portable population count (number of set bits) for a 64-bit word,
// using the classic parallel bit-summing scheme: sum adjacent 1-bit
// fields, then 2-bit fields, then 4-bit fields, and finally fold the
// byte-wise totals together.
// BUG FIX: the original computed the count but fell off the end of the
// function without returning it - undefined behaviour for a non-void
// function.
inline int popcount(std::uint64_t x)
{   x = (x & 0x5555555555555555U) + (x >> 1 & 0x5555555555555555U);
    x = (x & 0x3333333333333333U) + (x >> 2 & 0x3333333333333333U);
    x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fU;
    x = x + (x >> 8);
    x = x + (x >> 16);
    return static_cast<int>((x + (x >> 32)) & 0x7f);
}
3595
3596 #endif // __GNUC__
3597
// Round a size_t integer up to the next higher power of 2.
// I do this based on counting the number of leading zeros in the
// binary representation of n-1, so a value that is already a power of
// two is returned unchanged.
// NOTE(review): for n <= 1 this evaluates nlz(0), which is undefined
// behaviour with the __builtin_clzll version of nlz above - callers
// should presumably only pass n >= 2; confirm.

inline std::size_t next_power_of_2(std::size_t n)
{   return (static_cast<std::size_t>(1)) << (64-nlz(
                static_cast<std::uint64_t>(n-1)));
}

// The base-2 logarithm of the value next_power_of_2 would return.
inline unsigned int log_next_power_of_2(std::size_t n)
{   return (64-nlz(static_cast<std::uint64_t>(n-1)));
}
3610
3611 // I am going to represent bignums as arrays of 64-bit digits.
3612 // Overall the representation will use 2s complement, and so all but the
3613 // top digit will be treated as unsigned, while the top one is signed
3614 // and the whole number must act as if it had its sign bit propagated
3615 // indefinitely to the left. When I pass numbers to the low level
3616 // code I will pass references to the input arrays and lengths. I will
// pass an array that will certainly be large enough to hold the result
3618 // and the arithmetic functions will return the length in it that is used.
3619 // This length will be such that the overall number does not have any
3620 // extraneous leading zeros or leading 0xffffffffffffffff words, save that
3621 // the value zero will be returned as a single word value not a no-word
3622 // one. Note the word "extraneous", because the positive value 2^64-1
3623 // will be represented as a 2-word item with 0 in the higher digit and
3624 // 0xffffffffffffffff in the lower one - the leading zero is needed so
3625 // that it is clear that the value is positive. A consequence of all this
3626 // is that any bignum with length 1 can be extracted as an int64_t without
3627 // loss.
3628
3629 // I want "add-with-carry" operations, and so I provide a function here to
3630 // implement it. If the C++ compiler had a nice intrinsic I would like
3631 // to use that! Well Intel compilers have an _addcarry_u64 that passes and
3632 // returns the carry in an unsigned char and uses a pointer not a reference
3633 // argument for passing back the result.
3634
3635 // a1 and a2 are 64-bit unsigned integers. While c_in is also that type it
3636 // must only have one of the values 0 or 1. The effect will be to set r to
3637 // the low 64-bits of a1+a2+c_in and return any carry that is generated.
3638
3639 // I have an overload of add_with_carry for use where it is known that
// the input carry is zero. That case saves a small amount of work.
3641 // The code as written here seems to lead to a good compiled version using
3642 // g++ on x86_64 and -O3.
3643
// Set r to the low 64 bits of a1+a2 and return the carry out (0 or 1).
// Unsigned overflow is detected by the sum wrapping below a1.
inline std::uint64_t add_with_carry(std::uint64_t a1,
                                    std::uint64_t a2,
                                    std::uint64_t &r)
{   std::uint64_t sum = a1 + a2;
    r = sum;
    return sum < a1 ? 1 : 0;
}
3649
3650 // Now the general version with a carry-in. Note that in fact this version
3651 // will support any value in c_in, not merely a single bit. Thus the
3652 // carry_out can end up as 0, 1 or 2.
3653
// r = low 64 bits of a1+a2+a3, returning the total carry out. When a3
// is a genuine single-bit carry the result is 0 or 1, but any value is
// accepted, in which case the carry out can be 0, 1 or 2.
inline std::uint64_t add_with_carry(std::uint64_t a1,
                                    std::uint64_t a2,
                                    std::uint64_t a3, std::uint64_t &r)
{   std::uint64_t w;
    int c1 = add_with_carry(a1, a3, w);      // carry from the first addition
    return c1 + add_with_carry(w, a2, r);    // plus carry from the second
}
3661
3662 // In some places my code may be made nicer by having a version that
3663 // adds 4 values.
3664
// r = low 64 bits of a1+a2+a3+a4, returning the total carry out
// (accumulated from the three partial additions).
inline std::uint64_t add_with_carry(std::uint64_t a1,
                                    std::uint64_t a2,
                                    std::uint64_t a3, std::uint64_t a4,
                                    std::uint64_t &r)
{   std::uint64_t w1, w2;
    int c1 = add_with_carry(a1, a2, w1);
    int c2 = add_with_carry(a3, a4, w2);
    return c1 + c2 + add_with_carry(w1, w2, r);
}
3674
3675 // subtract_with_borrow does
3676 // r = a1 - a2 - b_in;
// and returns 1 if there is a borrow out.
3678
// Set r to the low 64 bits of a1-a2 and return 1 if a borrow is
// generated (i.e. a2 > a1), detected by the difference wrapping above a1.
inline std::uint64_t subtract_with_borrow(std::uint64_t a1,
                                          std::uint64_t a2,
                                          std::uint64_t &r)
{   std::uint64_t diff = a1 - a2;
    r = diff;
    return diff > a1 ? 1 : 0;
}
3685
// r = a1 - a2 - b_in, returning the borrow out, built from the
// two-operand version applied twice ((a1 - b_in) - a2).
inline std::uint64_t subtract_with_borrow(std::uint64_t a1,
                                          std::uint64_t a2,
                                          std::uint64_t b_in, std::uint64_t &r)
{   std::uint64_t w;
    int b1 = subtract_with_borrow(a1, b_in, w);
    return b1 + subtract_with_borrow(w, a2, r);
}
3693
3694 // I want code that will multiply two 64-bit values and yield a 128-bit
3695 // result. The result must be expressed as a pair of 64-bit integers.
3696 // If I have a type "__int128", as will often be the case when using gcc,
3697 // this is very easy to express. Otherwise I split the two inputs into
3698 // 32-bit halves, do 4 multiplications and some additions to construct
3699 // the result. At least I can keep the code portable, even if I can then
3700 // worry about performance a bit.
3701
3702
3703
3704
#ifdef __SIZEOF_INT128__

// Well it seems that g++ and clang have different views about how to
// ask for unsigned 128-bit integers! So I abstract that away via a
// typedef called UINT128.
// NOTE(review): clang predefines __clang__ (lower case), not __CLANG__,
// so as written the first branch is never selected and every compiler
// takes the #else path. That appears harmless because current clang
// also accepts "unsigned __int128", but the macro spelling (and the
// __uint128 type name in the first branch) should be confirmed.

#ifdef __CLANG__
typedef __int128 INT128;
typedef __uint128 UINT128;
#else // __CLANG__
typedef __int128 INT128;
typedef unsigned __int128 UINT128;
#endif // __CLANG__
3718
3719 // At least for debugging I may wish to display 128-bit integers. Here I
3720 // only do hex printing. I could do decimal and octal if I really wanted
3721 // but just for debugging that does not seem vital. If some C++ compiler
3722 // already supported printing of 128-bit ints this definition might clash
3723 // and would need commenting out.
3724
// Debug printing of a 128-bit unsigned value as two 16-digit hex
// halves separated by a space; fill and base are restored to
// spaces/decimal afterwards.
inline std::ostream & operator << (std::ostream &out, UINT128 a)
{   out << std::hex << std::setw(16) << std::setfill('0') <<
        static_cast<std::uint64_t>(a>>64)
        << " "
        // BUG FIX: setw() applies only to the next insertion, so the
        // low half used to print without zero padding (e.g. 2^64+5
        // displayed as "0000000000000001 5"); restate the width here.
        << std::setw(16)
        << static_cast<std::uint64_t>(a) << std::dec << std::setfill(' ');
    return out;
}
3732
// Assemble a 128-bit value from its high and low 64-bit halves.
inline UINT128 pack128(std::uint64_t hi, std::uint64_t lo)
{   return (static_cast<UINT128>(hi)<<64) | lo;
}
3736
// {hi,lo} = the full 128-bit product a*b, using the native 128-bit type.
inline void multiply64(std::uint64_t a, std::uint64_t b,
                       std::uint64_t &hi, std::uint64_t &lo)
{   UINT128 r = static_cast<UINT128>(a)*static_cast<UINT128>(b);
    hi = static_cast<std::uint64_t>(r >> 64);
    lo = static_cast<std::uint64_t>(r);
}
3743
3744 // Now much the same but forming a*b+c. Note that this can not overflow
3745 // the 128-bit result. Both hi and lo are only updated at the end
3746 // of this, and so they are allowed to be the same as other arguments.
3747
// {hi,lo} = a*b + c; this can not overflow 128 bits because
// (2^64-1)^2 + (2^64-1) < 2^128. hi and lo are written only at the
// end, so they may alias the input arguments.
inline void multiply64(std::uint64_t a, std::uint64_t b,
                       std::uint64_t c,
                       std::uint64_t &hi, std::uint64_t &lo)
{   UINT128 r = static_cast<UINT128>(a)*static_cast<UINT128>
                (b) + static_cast<UINT128>(c);
    hi = static_cast<std::uint64_t>(r >> 64);
    lo = static_cast<std::uint64_t>(r);
}
3756
// Signed 64x64->128 multiply: hi receives the (signed) top half and lo
// the unsigned low half of the 2s complement product.
inline void signed_multiply64(std::int64_t a, std::int64_t b,
                              std::int64_t &hi, std::uint64_t &lo)
{   INT128 r = static_cast<INT128>(a)*static_cast<INT128>(b);
    hi = static_cast<std::int64_t>(static_cast<UINT128>(r) >> 64);
    lo = static_cast<std::uint64_t>(r);
}
3763
// Signed a*b plus an unsigned c, again delivering a signed top half
// and unsigned low half. The arithmetic is performed in the unsigned
// 128-bit domain to keep the wrap-around behaviour well defined.
inline void signed_multiply64(std::int64_t a, std::int64_t b,
                              std::uint64_t c,
                              std::int64_t &hi, std::uint64_t &lo)
{   UINT128 r = static_cast<UINT128>(
                    static_cast<INT128>(a)*static_cast<INT128>(b))
                + static_cast<UINT128>(c);
    hi = static_cast<std::int64_t>(r >> 64);
    lo = static_cast<std::uint64_t>(r);
}
3773
3774 // divide {hi,lo} by divisor and generate a quotient and a remainder. The
3775 // version of the code that is able to use __int128 can serve as clean
3776 // documentation of the intent.
3777
// Divide the 128-bit value {hi,lo} by divisor, yielding quotient q and
// remainder r. The precondition hi < divisor guarantees the quotient
// fits in 64 bits (and incidentally excludes divisor == 0).
inline void divide64(std::uint64_t hi, std::uint64_t lo,
                     std::uint64_t divisor,
                     std::uint64_t &q, std::uint64_t &r)
{   arithlib_assert(divisor != 0 && hi < divisor);
    UINT128 dividend = pack128(hi, lo);
    q = dividend / divisor;
    r = dividend % divisor;
}
3786
3787 #else // __SIZEOF_INT128__
3788
// If the C++ system I am using does not support a 128-bit integer
3790 // type or if I have not detected it everything can still be done using
3791 // lots of 64-bit operations, with each 64-bit value often treated as
3792 // two 32-bit halves.
3793
multiply64(std::uint64_t a,std::uint64_t b,std::uint64_t & hi,std::uint64_t & lo)3794 inline void multiply64(std::uint64_t a, std::uint64_t b,
3795 std::uint64_t &hi, std::uint64_t &lo)
3796 { std::uint64_t a1 = a >> 32, // top half
3797 a0 = a & 0xFFFFFFFFU; // low half
3798 std::uint64_t b1 = b >> 32, // top half
3799 b0 = b & 0xFFFFFFFFU; // low half
3800 std::uint64_t u1 = a1*b1, // top of result
3801 u0 = a0*b0; // bottom of result
3802 // Now I need to add in the two "middle" bits a0*b1 and a1*b0
3803 std::uint64_t w = a0*b1;
3804 u1 += w >> 32;
3805 w <<= 32;
3806 u0 += w;
3807 if (u0 < w) u1++;
3808 // a0*b1 done
3809 w = a1*b0;
3810 u1 += w >> 32;
3811 w <<= 32;
3812 u0 += w;
3813 if (u0 < w) u1++;
3814 hi = u1;
3815 lo = u0;
3816 }
3817
3818 // Now much the same but forming a*b+c. Note that this can not overflow
3819 // the 128-bit result. Both hi and lo are only updated at the end
3820 // of this, and so they are allowed to be the same as other arguments.
3821
// As the two-argument version but computing a*b+c. The result can not
// overflow 128 bits. hi and lo are only written at the very end so they
// may alias the input arguments.
inline void multiply64(std::uint64_t a, std::uint64_t b,
                       std::uint64_t c,
                       std::uint64_t &hi, std::uint64_t &lo)
{ std::uint64_t ahi = a >> 32, alo = a & 0xFFFFFFFFU;
    std::uint64_t bhi = b >> 32, blo = b & 0xFFFFFFFFU;
    std::uint64_t top = ahi*bhi;       // high-word contribution
    std::uint64_t bottom = alo*blo;    // low-word contribution
// Fold in the two cross products, each of which straddles the word
// boundary, propagating any carry out of the low word.
    std::uint64_t cross[2] = { alo*bhi, ahi*blo };
    for (int k=0; k<2; k++)
    { top += cross[k] >> 32;
        std::uint64_t shifted = cross[k] << 32;
        bottom += shifted;
        if (bottom < shifted) top++;
    }
// Finally add in c, again watching for a carry into the high word.
    bottom += c;
    if (bottom < c) top++;
    hi = top;
    lo = bottom;
}
3848
signed_multiply64(std::int64_t a,std::int64_t b,std::int64_t & hi,std::uint64_t & lo)3849 inline void signed_multiply64(std::int64_t a, std::int64_t b,
3850 std::int64_t &hi, std::uint64_t &lo)
3851 { std::uint64_t h, l;
3852 multiply64(static_cast<std::uint64_t>(a),
3853 static_cast<std::uint64_t>(b), h,
3854 l);
3855 if (a < 0) h -= static_cast<std::uint64_t>(b);
3856 if (b < 0) h -= static_cast<std::uint64_t>(a);
3857 hi = static_cast<std::int64_t>(h);
3858 lo = l;
3859 }
3860
signed_multiply64(std::int64_t a,std::int64_t b,std::uint64_t c,std::int64_t & hi,std::uint64_t & lo)3861 inline void signed_multiply64(std::int64_t a, std::int64_t b,
3862 std::uint64_t c,
3863 std::int64_t &hi, std::uint64_t &lo)
3864 { std::uint64_t h, l;
3865 multiply64(static_cast<std::uint64_t>(a),
3866 static_cast<std::uint64_t>(b), c, h,
3867 l);
3868 if (a < 0) h -= static_cast<std::uint64_t>(b);
3869 if (b < 0) h -= static_cast<std::uint64_t>(a);
3870 hi = static_cast<std::int64_t>(h);
3871 lo = l;
3872 }
3873
// Divide the 128-bit value (hi,lo) by divisor, setting q to the quotient
// and r to the remainder. The caller must ensure that divisor is non-zero
// and that hi < divisor, which guarantees that the quotient fits in a
// single 64-bit word.
inline void divide64(std::uint64_t hi, std::uint64_t lo,
                     std::uint64_t divisor,
                     std::uint64_t &q, std::uint64_t &r)
{ arithlib_assert(divisor != 0 && hi < divisor);
    std::uint64_t u1 = hi;
    std::uint64_t u0 = lo;
    std::uint64_t c = divisor;
// See the Hacker's Delight for commentary about what follows. The associated
// web-site explains usage rights:
// "You are free to use, copy, and distribute any of the code on this web
// site (www.hackersdelight.org) , whether modified by you or not. You need
// not give attribution. This includes the algorithms (some of which appear
// in Hacker's Delight), the Hacker's Assistant, and any code submitted by
// readers. Submitters implicitly agree to this." and then "The author has
// taken care in the preparation of this material, but makes no expressed
// or implied warranty of any kind and assumes no responsibility for errors
// or omissions. No liability is assumed for incidental or consequential
// damages in connection with or arising out of the use of the information
// or programs contained herein."
// I may not be obliged to give attribution, but I view it as polite to!
// Any errors that have crept in in my adaptation of the original code
// will be my fault, but you see in the BSD license at the top of this
// file that I disclaim any possible liability for consequent loss or damage.
    const std::uint64_t base = 0x100000000U; // Number base (32 bits).
    std::uint64_t un1, un0, // Norm. dividend LSD's.
             vn1, vn0, // Norm. divisor digits.
             q1, q0, // Quotient digits.
             un32, un21, un10,// Dividend digit pairs.
             rhat; // A remainder.
// I am going to shift both operands left until the divisor has its
// most significant bit set.
    int s = nlz(c); // Shift amount for norm. 0 <= s <= 63.
    c = c << s; // Normalize divisor.
// Now I split the divisor from a single 64-bit number into a pair
// of 32-bit values.
    vn1 = c >> 32; // Break divisor up into
    vn0 = c & 0xFFFFFFFFU; // two 32-bit digits.
// Shift the dividend... and split it into parts. A shift by a full 64
// bits would be undefined behaviour, hence the special case for s == 0.
    if (s == 0) un32 = u1;
    else un32 = (u1 << s) | (u0 >> (64 - s));
    un10 = u0 << s; // Shift dividend left.
    un1 = un10 >> 32; // Break right half of
    un0 = un10 & 0xFFFFFFFFU; // dividend into two digits.
// Predict a 32-bit quotient digit and then correct the estimate downwards
// (the loop runs at most twice) until it can no longer overshoot.
    q1 = un32/vn1; // Compute the first
    rhat = un32 - q1*vn1; // quotient digit, q1.
again1:
    if (q1 >= base || q1*vn0 > base*rhat + un1)
    { q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < base) goto again1;
    }
    un21 = un32*base + un1 - q1*c; // Multiply and subtract.
    q0 = un21/vn1; // Compute the second
    rhat = un21 - q0*vn1; // quotient digit, q0.
again2:
    if (q0 >= base || q0*vn0 > base*rhat + un0)
    { q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < base) goto again2;
    }
    q = (q1 << 32) | q0; // assemble and return quotient & remainder
// The remainder has to be shifted right to undo the normalization.
    r = (un21*base + un0 - q0*c) >> s;
}
3938
3939 #endif // __SIZEOF_INT128__
3940
3941 // While my arithmetic is all done in uint64_t (and that is important so
3942 // that in C++ the consequences of overflow are defined) I need to treat
3943 // some top-digits as signed: here are values and tests relating to that.
3944
// Bit patterns used when the top digit of a bignum is viewed as a signed
// 64-bit quantity.
INLINE_VAR const std::uint64_t allbits =
    ~static_cast<std::uint64_t>(0);   // every bit set, ie -1 when signed
INLINE_VAR const std::uint64_t topbit = static_cast<std::uint64_t>
                                        (1)<<63;   // just the sign bit
INLINE_VAR const std::uint64_t allbuttop = topbit - 1; // all bits below the sign bit
3950
// True when the 64-bit digit, interpreted as signed, is non-negative,
// ie when its most significant bit is clear.
inline bool positive(std::uint64_t a)
{ return (a >> 63) == 0;
}
3954
// True when the 64-bit digit, interpreted as signed, is negative, ie
// when its most significant bit is set.
inline bool negative(std::uint64_t a)
{ return (a >> 63) != 0;
}
3958
3959
3960 // This next function might be naivly written as
3961 // return ((a1==0 && positive(a2)) ||
3962 // (a1==-1 && negative(a2)));
3963 // and it is to test if a bignum can have its top digit removed.
3964
// True when the top digit a1 is redundant: all zero above a clear sign
// bit in a2, or all ones above a set sign bit in a2. Equivalent to
// (a1==0 && positive(a2)) || (a1==-1 && negative(a2)).
inline bool shrinkable(std::uint64_t a1, std::uint64_t a2)
{ std::uint64_t signOfNext = a2 >> 63;
    return a1 == static_cast<std::uint64_t>(0) - signOfNext;
}
3968
// Copy lena digits from a to b. The two regions must not overlap.
inline void internal_copy(const std::uint64_t *a, std::size_t lena,
                          std::uint64_t *b)
{ for (std::size_t k=0; k<lena; k++) b[k] = a[k];
}
3973
// This internal function sets b to be -a without altering its length.
3975 // Because length is not changed it does not need a length for the
3976 // destination passed to it.
3977
// Set b to the two's complement negation of a, keeping the same length:
// flip every bit and then propagate an increment up from the bottom digit.
inline void internal_negate(const std::uint64_t *a, std::size_t lena,
                            std::uint64_t *b)
{ std::uint64_t inc = 1;
    for (std::size_t k=0; k<lena; k++)
    { std::uint64_t d = ~a[k] + inc;
        b[k] = d;
        // The +1 only continues upwards while the flipped digit wrapped.
        inc = (d < inc) ? 1 : 0;
    }
}
3986
3987 // When printing numbers in octal it will be handy to be able treat the
3988 // data as an array of 3-bit digits, so here is an access function that
3989 // does that. There is a messy issue about the top of a number, where it
3990 // may not be a whole number of 3-bit octal digits. I pass in v, a vector
3991 // of 64-bit values, n which is the length of that vector and i which
3992 // is the index of the octal digit that I wish to extract. To help with
3993 // that I have a function virtual_digit64() which lets me read from a
3994 // bignum as if it has been usefully sign-extended.
3995
// Read digit j of the n-digit bignum v as if the number had been
// sign-extended indefinitely: within range just return the stored digit,
// beyond the top return zeros for a positive number and all ones for a
// negative one (sign taken from the top stored digit).
inline std::uint64_t virtual_digit64(const std::uint64_t *v,
                                     std::size_t n,
                                     std::size_t j)
{ if (j < n) return v[j];
    return (static_cast<std::int64_t>(v[n-1]) < 0) ?
           UINT64_C(0xffffffffffffffff) : 0;
}
4003
4004 // This function reads a 3-bit digit from a bignum, and is for use when
4005 // printing in octal.
4006
read_u3(const std::uint64_t * v,std::size_t n,std::size_t i)4007 inline int read_u3(const std::uint64_t *v, std::size_t n,
4008 std::size_t i)
4009 { std::size_t bits = 3*i;
4010 std::size_t n0 = bits/64; // word with lowest bit of the 3
4011 std::size_t s0 =
4012 bits%64; // amount to shift right to align it properly
4013 std::uint64_t w = virtual_digit64(v, n, n0) >> s0;
4014 // If I needed to shift by 62 or 63 bits then the octal digit I am interested
4015 // in needs some bits from the next word up.
4016 if (s0 >= 62) w |= (virtual_digit64(v, n, n0+1) << (64-s0));
4017 return static_cast<int>(w & 0x7);
4018 }
4019
4020 //=========================================================================
4021 //=========================================================================
4022 // Some support for two models of memory layout. If VSL is set a number
4023 // will be represented as an intptr_t value with its low 3 bits used as
4024 // a tag. When the tag is removed and the intptr_t is cast to (uint64_t *)
4025 // it points at a block of words. The first word holds a header value
4026 // that includes (in packed form) the length of the block. Beyond that
4027 // is the row of uint64_t values making up the bignum itself.
4028 //
4029 // For more direct C++ use the type is just (uint64_t *) and it refers
4030 // directly to the row of digits of the bignum. However at the address
4031 // just ahead of that (ie at v[-1]) there is a header word giving the
4032 // length of the bignum.
4033 // Sometime soon this header word will be structured as two 32-bit
4034 // parts. One will give the number of 64-bit elements of the vector that
// are actually in use. The other will be a small integer indicating
4036 // a power of two that is the size of memory block that was allocated.
4037 // Such a scheme always rounds allocated sizes up using next_power_of_2()
4038 // and then when the actual number of digits a number occupies turns out
4039 // to be less than it might have there is no need to recycle memory - the
// "actual length" field is just updated. Furthermore a modest sized
4041 // table can keep freelists of discarded blocks on each size, so allocation
4042 // is potentially speeded up.
4043 //=========================================================================
4044 //=========================================================================
4045
4046
4047 // For a free-standing bignum application (including my test code for the
4048 // stuff here, bignums are represented as blocks of memory (allocated using
4049 // malloc) where the pointer that is used points to the start of the
4050 // array of bignum digits, and the word in memory before that contains
4051 // the length (in words) of the block.
4052 // Strings are returned to the user as freshly malloced memory holding a
4053 // native-style C++ string with a terminating NUL character at the end.
4054
4055
4056 //=========================================================================
4057 //=========================================================================
4058 // Random number support
4059 //=========================================================================
4060 //=========================================================================
4061
4062 // Well this is the first place in this file where an issue arises that will
4063 // apply in several other places. I want some data that is static and
4064 // some that is thread_local. But I am making this a header-only library, so
4065 // it is liable to be included from several compilation units, and so if
4066 // I simply make variables
4067 // static int something;
4068 // thread_local int something_else;
4069 // I will suffer from multiple-definition problems. For the next few years
4070 // (the current date is May 2019) I can not use "inline int something;"
4071 // which would do the trick but which is not supported by all the slightly
4072 // older C++ implementations I want to be able to use. My resolution is
4073 // along the lines of
4074 // inline int &something() { static int x = 17; return x; }
4075 // And then I can go for instance:
4076 // int a = something();
4077 // something()++;
4078 // int &aref = something();
4079 // ... aref .. aref ... aref;
4080 // The effect is that about the only misery is that when I refer to the value
4081 // I need to write and extra pair of parentheses. And the last fragment of
4082 // example shows how if I am going to make multiple uses of something I can
4083 // do the function call just once. That minor optimization may become
4084 // more important if I use thread_local instead of static, since it can
4085 // make the access overhead of of thread local variable arise just once
4086 // rather than potentially many times.
4087 // There is (I am afraid) more to say, especially in the static value is
4088 // a more complex type than merely "int". The initialization of x will
4089 // happen just once. But about all you can guarantee is that it will happen
4090 // at latest when something() is first called. This could result in the code
4091 // for something() needing to incorporate a "has it been initialized" test.
4092 // That though may be extra encouragement to use the "aref" idiom.
4093 // Now if x is a class object we can also worry about when its destructor
4094 // will be called! Well x is static it gets destroyed at program termination
// with all the usual joys regarding sequencing. If it is thread_local its
4096 // destructor is invoked as the thread terminates. Again it will be prudent
4097 // to avoid too many assumptions about the ordering of destruction, and when
// such orderings are relied upon to include copious comments in the code
4099 // as to why everything will be OK. In particular it will be hard to have
4100 // guarantees about the order in which constructors have been called.
4101 //
4102 // Note to self: one might believe that
4103 // class Something { static int x; ... }
4104 // was going to be helpful, because that guarantees that there will only be
4105 // a single instance of x (and if it was thread_local similarly). However
4106 // what you have there is a declation of x not a definition, and without a
4107 // ?thread_local? int Something::x = 3;
4108 // sort of definition somewhere. But without it being "inline int.." This
4109 // would cause trouble with multiple-file scenarios. So I am not going to
4110 // be using static members of classes to sort myself out.
4111 //
4112 // It is useful to be able to generate random values. C++11 is simultaneously
4113 // very helpful and rather unhelpful. The class std::random_device is
4114 // expected to yield genuine unpredictable values, but it is not guaranteed
4115 // to and it fails to on some platforms, so despite the fact that when it
4116 // works it is a really good thing I can not rely solely on it. Each time I
4117 // use a random_device it gives me just 32 bits. For my real generator that
4118 // is not really enough.
4119 // So here I create 3 notionally unpredictable units and then merge in the
4120 // identity of the current thread and two measurements related to time.
4121 // To avoid thread safety issues with random_device I make calls to it
4122 // global, and then the thread identifier and time of day information stands
4123 // a prospect of arranging that each thread gets its own mersenne-twister
4124 // with its own seeding.
4125 // Note that Wikipedia explains "Multiple instances that differ only in
4126 // seed value (but not other parameters) are not generally appropriate
4127 // for Monte-Carlo simulations that require independent random number
4128 // generators" and here even the independence of my thread-specific
4129 // seed values is questionable.
4130
4131 // I perform all this setup at initialization time, but by wrapping the
4132 // same sequence of steps as a critical region I could use it to re-seed
4133 // generators whenever I felt the need to.
4134 //
4135
4136 // The code here is explicitly aware of the prospect of threading, and
4137 // should lead to different pseudo-random sequences in each thread.
4138 // If you do not have C++17 inline variables then there will be a separate
4139 // generator for each compilation unit and especially if you have an
4140 // unreliable std::random_device about the only thing liable to keep
4141 // things distinct in each will be the high resolution clock. Well to
4142 // try to improve things there I will use the address of one of these
4143 // variables as part of the seeding process, so that if they all end
// up static rather than inline that will give per-compilation-unit
4145 // variation.
4146
4147 // Note that the thread local status information for a random number
4148 // generator will be initialized in EVERY thread that is created. This
// includes the worker threads for Karatsuba multiplication and in a
4150 // broader context where I use this library it will include threads that
// are used for GUI or other I/O purposes. So there is a benefit if C++
4152 // delays initialization of any of the variables within the following
4153 // function until the function is first used!
4154
// Return a reference to the Mersenne-twister generator belonging to the
// calling thread, creating and seeding it on first use. The seed mixes:
// three samples from std::random_device (shared across threads, taken
// once), the calling thread's identity, the wall-clock time, a high
// resolution clock reading, and the address of one of the static
// variables (so that if INLINE_VAR degrades to "static" each compilation
// unit still seeds differently).
inline std::mt19937_64 &ref_mersenne_twister()
{ // NOTE(review): the random_device is constructed on every call but is
  // only consulted during the one-time initialization of the three seed
  // components below.
  std::random_device basic_randomness;
// Yes the static procedure-local variables here may carry some
// overhead as the system considers whether it wants to initialize them, but
// the overall cost here is already probably high as I accumulate entropy
// and so I am not going to worry.
    static thread_local std::uint64_t threadid =
        static_cast<std::uint64_t>(std::hash<std::thread::id>()(
                                       std::this_thread::get_id()));
// The three random_device samples are deliberately NOT thread_local: the
// device is only touched here, once, which avoids any question of its
// thread safety (see the commentary above this function).
    static std::uint64_t seed_component_1 = static_cast<std::uint64_t>
                                            (basic_randomness());
    static std::uint64_t seed_component_2 = static_cast<std::uint64_t>
                                            (basic_randomness());
    static std::uint64_t seed_component_3 = static_cast<std::uint64_t>
                                            (basic_randomness());
    static thread_local std::uint64_t time_now =
        static_cast<std::uint64_t>
        (std::time(NULL));
    static thread_local std::uint64_t chrono_now =
        static_cast<std::uint64_t>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
// In my first draft of this library I had made the random seed directly
// from uint64_t values. However when testing on a Raspberry Pi that
// triggered a messages about bugs in gcc before gcc7 (relating to the
// alignment of some values passed as arguments in obscure cases). Building
// the seed sequence using 32-bit values avoids that issue, and since this
// is only done during initialization it is not time-critical.
//
    static thread_local std::seed_seq random_seed
    {   static_cast<std::uint32_t>(threadid),
        static_cast<std::uint32_t>(seed_component_1),
        static_cast<std::uint32_t>(seed_component_2),
        static_cast<std::uint32_t>(seed_component_3),
        static_cast<std::uint32_t>(time_now),
        static_cast<std::uint32_t>(chrono_now),
        static_cast<std::uint32_t>(threadid>>32),
        static_cast<std::uint32_t>(seed_component_1>>32),
        static_cast<std::uint32_t>(seed_component_2>>32),
        static_cast<std::uint32_t>(seed_component_3>>32),
        static_cast<std::uint32_t>(time_now>>32),
        static_cast<std::uint32_t>(chrono_now>>32),
        static_cast<std::uint32_t>(
            reinterpret_cast<std::uintptr_t>(&seed_component_1)),
        static_cast<std::uint32_t>(
            static_cast<std::uint64_t>(
                reinterpret_cast<std::uintptr_t>(&seed_component_1))>>32)
    };

    static thread_local std::mt19937_64 inner_mersenne_twister(
        random_seed);
// mersenne_twister() now generates 64-bit unsigned integers.
    return inner_mersenne_twister;
}
4208
// If you are going to use very many random numbers, especially if you
// might be running under Cygwin or mingw32, it could be a good idea to
// use ref_mersenne_twister() once to collect the thread local instance
// relevant to you [note that it is a class object that provides operator(),
// not really a function, despite appearances!]. That way you only do the
// thread_local activity once (I hope).
4215
mersenne_twister()4216 MAYBE_UNUSED static std::uint64_t mersenne_twister()
4217 { return ref_mersenne_twister()();
4218 }
4219
4220 // To re-seed I can just call this. I think that when I re-seed it will be
4221 // to gain more repeatable behaviour, and so I am fairly happy about
4222 // limiting the amount of input entropy here to 64-bits. If I was keen I
4223 // could provide a reseed-method taking a bignum argument that could have
4224 // lots of data in it. Note that this will reseed the random number
4225 // generator associated with the thread it is called from. Note that there
4226 // is one generator per thread, so if you have multiple threads and you reseed
4227 // you are liable to want to reseed each of them.
4228 //
4229
reseed(std::uint64_t n)4230 MAYBE_UNUSED static void reseed(std::uint64_t n)
4231 { ref_mersenne_twister().seed(n);
4232 }
4233
4234 // Now a number of functions for setting up random bignums. These may be
4235 // useful for users, but they will also be very useful while testing this
4236 // code.
4237
4238 // Return a random integer in the range 0 ... n-1.
4239 // Given that the largest n that can be passed is UINT64_MAX the biggest
4240 // rangs that can be generated here is 1 less than the full range of 64-bit
4241 // values. To get a full 64-bit range merely call mersenne_twister()
4242 // directly.
4243
uniform_uint64(std::uint64_t n)4244 inline std::uint64_t uniform_uint64(std::uint64_t n)
4245 { if (n <= 1) return 0;
4246 // I I want the remainder operation on the last line of this function to
4247 // return a uniformly distributed result. To ensure that I want r to be
4248 // drawn uniformly from a range that is a multiple of n.
4249 std::uint64_t q = UINT64_MAX/n;
4250 std::uint64_t w = n*q;
4251 std::uint64_t r;
4252 // In the worst case here n was just over UINT64_MAX/2 and q came out
4253 // as 1. In that case on average I will need to call mersenne_twister
4254 // twice. Either larger or smaller inputs will behave better, and rather
4255 // small inputs will mean I hardly ever need to re-try.
4256 std::mt19937_64 &mt = ref_mersenne_twister();
4257 do
4258 { r = mt();
4259 }
4260 while (r >= w);
4261 return r%n;
4262 }
4263
4264 // A uniform distribution across the range [0 .. (2^bits)-1], ie
4265 // a bignum using (up to) the given number of bits. So eg uniform_positive(3)
4266 // should return 0,1,2,3,4,5,6 or 7 each with equal probability.
4267
uniform_positive(std::uint64_t * r,std::size_t & lenr,std::size_t bits)4268 inline void uniform_positive(std::uint64_t *r, std::size_t &lenr,
4269 std::size_t bits)
4270 { if (bits == 0)
4271 { r[0] = 0;
4272 lenr = 1;
4273 }
4274 lenr = (bits+63)/64;
4275 // ref_mersenne_twister returns a reference to a thread_local entity and
4276 // I hope that my cacheing its value here I reduce thread local access
4277 // overheads.
4278 std::mt19937_64 &mt = ref_mersenne_twister();
4279 for (std::size_t i=0; i<lenr; i++)
4280 r[i] = mt();
4281 if (bits%64 == 0) r[lenr-1] = 0;
4282 else r[lenr-1] &= UINT64_C(0xffffffffffffffff) >> (64-bits%64);
4283 while (lenr!=1 && shrinkable(r[lenr-1], r[lenr-2])) lenr--;
4284 }
4285
uniform_positive(std::size_t n)4286 inline std::intptr_t uniform_positive(std::size_t n)
4287 { std::size_t lenr = (n + 63)/64;
4288 if (lenr == 0) lenr = 1; // special case!
4289 std::size_t save = lenr;
4290 std::uint64_t *r = reserve(lenr);
4291 uniform_positive(r, lenr, n);
4292 return confirm_size(r, save, lenr);
4293 }
4294
// As above but returning a value that may be negative. uniform_signed(3)
// could return -8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6 or 7.
// Note that while uniform_positive(0) can only return the value 0,
// uniform_signed(0) can return -1 or 0.
4299
uniform_signed(std::uint64_t * r,std::size_t & lenr,std::size_t bits)4300 inline void uniform_signed(std::uint64_t *r, std::size_t &lenr,
4301 std::size_t bits)
4302 { lenr = 1 + bits/64;
4303 std::mt19937_64 &mt = ref_mersenne_twister();
4304 for (std::size_t i=0; i<lenr; i++)
4305 r[i] = mt();
4306 // Now if the "extra" bit is zero my number will end up positive.
4307 if ((r[lenr-1] & (UINT64_C(1) << (bits%64))) == 0)
4308 { r[lenr-1] &= UINT64_C(0xffffffffffffffff) >> (63-bits%64);
4309 while (lenr!=1 && r[lenr-1] == 0) lenr--;
4310 }
4311 // Otherwise the result will end up negative.
4312 else
4313 { r[lenr-1] |= UINT64_C(0xffffffffffffffff) << (bits%64);
4314 while (lenr!=1 && r[lenr-1] == UINT64_C(0xffffffffffffffff)) lenr--;
4315 }
4316 }
4317
uniform_signed(std::size_t n)4318 inline std::intptr_t uniform_signed(std::size_t n)
4319 { std::size_t lenr = n/64+1;
4320 std::size_t save = lenr;
4321 std::uint64_t *r = reserve(lenr);
4322 uniform_signed(r, lenr, n);
4323 return confirm_size(r, save, lenr);
4324 }
4325
4326 inline std::size_t bignum_bits(const std::uint64_t *a,
4327 std::size_t lena);
4328
// Generate a value in the range 0 .. a-1 using a uniform distribution
4330
// Set r (and lenr) to a value drawn uniformly from [0, a-1] where a is
// the lena-digit bignum given. Works by rejection sampling: generate
// values with as many bits as a until one is strictly smaller than a.
inline void uniform_upto(std::uint64_t *a, std::size_t lena,
                         std::uint64_t *r,
                         std::size_t &lenr)
{ std::size_t n = bignum_bits(a, lena);
// I will repeatedly generate numbers that have as many bits as a until
// I get one that has a value less than a has. On average that should only
// take two tries.
    for (;;)
    { push(a);   // protect a across the allocation activity inside
        uniform_positive(r, lenr, n);
        pop(a);
        if (lena > lenr) return;   // fewer digits, so certainly r < a
// Compare r against a digit by digit starting from the most significant
// end: return as soon as r is known to be smaller, retry as soon as it
// is known to be greater or equal (ie all digits compared equal).
        for (std::size_t len=lena;;)
        { len--;
            if (a[len] > r[len]) return;
            if (a[len] < r[len] || len == 0) break;
        }
    }
}
4350
uniform_upto(std::intptr_t aa)4351 inline std::intptr_t uniform_upto(std::intptr_t aa)
4352 { if (stored_as_fixnum(aa))
4353 { std::uint64_t r = uniform_uint64(static_cast<std::uint64_t>
4354 (int_of_handle(
4355 aa)));
4356 return int_to_handle(r);
4357 }
4358 std::uint64_t *a = vector_of_handle(aa);
4359 std::size_t lena = number_size(a);
4360 push(a);
4361 std::uint64_t *r = reserve(lena);
4362 pop(a);
4363 std::size_t lenr;
4364 uniform_upto(a, lena, r, lenr);
4365 return confirm_size(r, lena, lenr);
4366 }
4367
// Trim redundant leading zero digits from a non-negative bignum: a top
// digit of zero may be dropped while the digit below has a clear sign bit.
inline void truncate_positive(const std::uint64_t *r, std::size_t &n)
{ while (n > 1 && r[n-1] == 0 &&
         static_cast<std::int64_t>(r[n-2]) >= 0) n--;
}
4371
// Trim redundant leading all-ones digits from a negative bignum: a top
// digit of all ones may be dropped while the digit below has its sign
// bit set.
inline void truncate_negative(const std::uint64_t *r, std::size_t &n)
{ while (n > 1 && r[n-1] == ~static_cast<std::uint64_t>(0) &&
         static_cast<std::int64_t>(r[n-2]) < 0) n--;
}
4375
// Trim leading zero digits from an unsigned value, always keeping at
// least one digit.
inline void truncate_unsigned(const std::uint64_t *r, std::size_t &n)
{ while (n > 1 && r[n-1] == 0) n--;
}
4379
4380 // The following is a rather strange function. It looks at the 4 bit number n.
4381 // It then processes its input a in accordance with the following table, where
4382 // A is the (positive) input value and X is A rounded down to the nearest
4383 // power of 2 less than it (ie keeping just the top bit of A):
4384 //
4385 // 0 X-1 8 -(X-1)
4386 // 1 X 9 -X
4387 // 2 X+1 10 -(X+1)
4388 // 3 A 11 -A
4389 // 4 A 12 -A
4390 // 5 A 13 -A
4391 // 6 A 14 -A
4392 // 7 A 15 -A
4393
4394 // The idea behind this is that the input A will be a random value from a
4395 // reasonably smooth distribution, and n will be a random 4 bit value. The
4396 // output will still be random, but now half the time it will be negative.
4397 // And a significant proportion of the time it will be a power of 2 (or one
4398 // either side of being a power of 2). This last is something I want because
4399 // with an internal representation that is based on 2s complement values
4400 // close to powers of 2 can easily be "edge cases" that deserve extra attention
4401 // during testing.
4402
// Transform the positive lena-digit value a according to the 4-bit code
// n, writing the result to r (which must have space for lena+1 digits)
// and its length to lenr. See the table in the comment above: codes 0-2
// replace a by a power of two near a (minus one, exact, plus one), codes
// 3-7 keep a unchanged, and bit 8 of n negates the result. This is used
// to bias random test values towards 2s-complement edge cases.
inline void fudge_distribution(const std::uint64_t *a,
                               std::size_t lena,
                               std::uint64_t *r, std::size_t &lenr, int n)
{ lenr = lena;
    switch (n&7)
    { case 0:
        case 1:
        case 2:
// Build X = the highest power of 2 no larger than a, as a zero-filled
// vector with a single bit set.
            for (std::size_t i=0; i<lena+1; i++) r[i] = 0;
            if (a[lena-1] == 0)
            { // Top digit zero: the top set bit (if any) is in the digit
              // below, so use its topmost possible position.
              if (lena>1) r[lena-2] = static_cast<std::uint64_t>(1)<<63;
            }
            else r[lena-1] = static_cast<std::uint64_t>(1) << (63-nlz(a[lena-1]));
            if ((n&7) == 0) // decrement it, giving X-1
            { if (lena!=1 || a[0]!=0) // avoid decrementing zero.
                { // Ripple a borrow up through any zero digits.
                  std::uint64_t *p = r;
                    while (*p == 0) *p++ = static_cast<std::uint64_t>(-1);
                    (*p)--;
                }
            }
            else if ((n&7) == 2) // increment it, giving X+1
            { // Ripple a carry up through any all-ones digits.
              std::uint64_t *p = r;
                while (*p == static_cast<std::uint64_t>(-1)) *p++ = 0;
                (*p)++;
            }
            break;
        default:
            // Codes 3-7: pass a through unchanged.
            for (std::size_t i=0; i<lena; i++) r[i] = a[i];
            break;
    }
    if ((n&8) != 0)
    { // Bit 8 set: negate the (lena+1)-digit result in place.
      std::uint64_t carry = 1;
        for (std::size_t i=0; i<lena+1; i++)
        { carry = add_with_carry(~r[i], carry, r[i]);
        }
        truncate_negative(r, lenr);
    }
    else truncate_positive(r, lenr);
}
4442
fudge_distribution(std::intptr_t aa,int n)4443 inline std::intptr_t fudge_distribution(std::intptr_t aa, int n)
4444 { std::uint64_t *a;
4445 std::size_t lena;
4446 std::uint64_t w[2];
4447 if (stored_as_fixnum(aa))
4448 { w[1] = static_cast<std::uint64_t>(int_of_handle(aa));
4449 lena = 1;
4450 a = &w[1];
4451 }
4452 else
4453 { a = vector_of_handle(aa);
4454 lena = number_size(a);
4455 }
4456 push(a);
4457 std::uint64_t *r = reserve(lena+1);
4458 pop(a);
4459 std::size_t lenr;
4460 fudge_distribution(a, lena, r, lenr, n);
4461 return confirm_size(r, lena+1, lenr);
4462 }
4463
// Generate a value in the range 0 .. 2^bits-1 using a distribution such
// that numbers with each bit-length are equally probable. This works by
// selecting a bit-length uniformly and then creating a number uniformly
// distributed across all those with that exact bit-width. This is perhaps
// not a very nice distribution from a mathematical perspective, but it is
// nevertheless a useful one to have in some test code.
4470
random_upto_bits(std::uint64_t * r,std::size_t & lenr,std::size_t n)4471 inline void random_upto_bits(std::uint64_t *r, std::size_t &lenr,
4472 std::size_t n)
4473 { std::size_t bits = static_cast<std::size_t>(uniform_uint64(n));
4474 if (bits == 0)
4475 { r[0] = 0;
4476 lenr = 1;
4477 return;
4478 }
4479 // The number will have from 1 to 64 bits in its top digit.
4480 lenr = (bits+63)/64;
4481 std::mt19937_64 &mt = ref_mersenne_twister();
4482 for (std::size_t i=0; i<lenr; i++)
4483 r[i] = mt();
4484 if (n%64 != 0)
4485 r[lenr-1] &= UINT64_C(0xffffffffffffffff) >> (64-bits%64);
4486 r[lenr-1] |= UINT64_C(1) << ((bits-1)%64);
4487 if (bits%64 == 0) r[lenr++] = 0;
4488 arithlib_assert(!negative(r[lenr-1]));
4489 }
4490
random_upto_bits(std::size_t bits)4491 inline std::intptr_t random_upto_bits(std::size_t bits)
4492 { std::size_t m = 1+bits/64;
4493 if (m == 0) m = 1;
4494 std::uint64_t *r = reserve(m);
4495 std::size_t lenr;
4496 random_upto_bits(r, lenr, bits);
4497 return confirm_size(r, m, lenr);
4498 }
4499
4500 //=========================================================================
4501 //=========================================================================
4502 // Here I have a few tiny conversion functions followed by code for
4503 // conversion between big numbers and strings. All of these are rather
4504 // important for getting data in and out of the big number format and so
4505 // deserve to be shown early.
4506 //=========================================================================
4507 //=========================================================================
4508
4509
4510 // Convert a 64-bit integer to a bignum.
4511 // This can be useful when there is no special-purpose code to
4512 // perform arithmetic between a bignum and a native int64_t integer
4513 // directly.
4514
// Store the signed 64-bit value n as a one-digit bignum in r. The two's
// complement bit pattern is exactly the digit that is wanted.
inline void int_to_bignum(std::int64_t n, std::uint64_t *r)
{ std::uint64_t digit = static_cast<std::uint64_t>(n);
    r[0] = digit;
}
4518
int_to_bignum(std::int64_t n)4519 inline std::intptr_t int_to_bignum(std::int64_t n)
4520 { if (fits_into_fixnum(n)) return int_to_handle(n);
4521 std::uint64_t *r = reserve(1);
4522 int_to_bignum(n, r);
4523 return confirm_size(r, 1, 1);
4524 }
4525
unsigned_int_to_bignum(std::uint64_t n,std::uint64_t * r,std::size_t & lenr)4526 inline void unsigned_int_to_bignum(std::uint64_t n, std::uint64_t *r,
4527 std::size_t &lenr)
4528 { r[0] = n;
4529 if (negative(n))
4530 { r[1] = 0;
4531 lenr = 2;
4532 }
4533 else lenr = 1;
4534 }
4535
unsigned_int_to_bignum(std::uint64_t n)4536 inline std::intptr_t unsigned_int_to_bignum(std::uint64_t n)
4537 { std::size_t w = (negative(n) ? 2 : 1);
4538 std::uint64_t *r = reserve(w);
4539 std::size_t lenr;
4540 unsigned_int_to_bignum(n, r, lenr);
4541 return confirm_size(r, w, lenr);
4542 }
4543
4544 #ifdef softfloat_h
4545 // Some constants that are useful when I am dealing with float128_t.
4546
#ifdef LITTLEENDIAN
// Handy 128-bit float constants expressed as raw bit patterns. A
// float128_t is stored as a pair of 64-bit words whose order depends on
// the endianness of the host, hence the two variants of this table.
INLINE_VAR float128_t
f128_0 = {{0, INT64_C(0x0000000000000000)}},     // 0.0
f128_half = {{0, INT64_C(0x3ffe000000000000)}},  // 0.5
f128_mhalf = {{0, INT64_C(0xbffe000000000000)}}, // -0.5
f128_1 = {{0, INT64_C(0x3fff000000000000)}},     // 1.0
f128_m1 = {{0, INT64_C(0xbfff000000000000)}},    // -1.0
f128_N1 = {{0, INT64_C(0x4fff000000000000)}};    // 2^4096
#else // !LITTLEENDIAN
INLINE_VAR float128_t
f128_0 = {{INT64_C(0x0000000000000000), 0}},     // 0.0
f128_half = {{INT64_C(0x3ffe000000000000), 0}},  // 0.5
f128_mhalf = {{INT64_C(0xbffe000000000000), 0}}, // -0.5
f128_1 = {{INT64_C(0x3fff000000000000), 0}},     // 1.0
f128_m1 = {{INT64_C(0xbfff000000000000), 0}},    // -1.0
f128_N1 = {{INT64_C(0x4fff000000000000), 0}};    // 2^4096
#endif // !LITTLEENDIAN
4564
4565 // The following tests are not supported by the version of softfloat that
4566 // I am using, so I implement them myself.
4567
f128_zero(float128_t p)4568 inline bool f128_zero(float128_t p)
4569 { return (p.v[HIPART] & 0x7fffffffffffffff) == 0 &&
4570 p.v[LOPART] == 0;
4571 }
4572
f128_infinite(float128_t p)4573 inline bool f128_infinite(float128_t p)
4574 { return (p.v[HIPART] & 0x7fffffffffffffff) == 0x7fff000000000000 &&
4575 p.v[LOPART] == 0;
4576 }
4577
f128_nan(float128_t p)4578 inline bool f128_nan(float128_t p)
4579 { return (p.v[HIPART] & 0x7fff000000000000) == 0x7fff000000000000 &&
4580 ((p.v[HIPART] & 0x0000ffffffffffff) != 0 ||
4581 p.v[LOPART] != 0);
4582 }
4583
// Scale a 128-bit float by 2^x by direct manipulation of its exponent
// field, taking care over overflow, underflow and sub-normal results.
// Zeros, infinities and NaNs are returned unchanged.
inline float128_t ldexp(float128_t p, int x)
{ if (f128_zero(p) ||
        f128_infinite(p) ||
        f128_nan(p)) return p; // special cases!
// Calculate the value I expect to want to leave in the exponent field.
    x = ((p.v[HIPART] >> 48) & 0x7fff) + x;
// In case of overflow leave an infinity of the right sign. This involves
// forcing all bits of the exponent to be 1, all bits of the mantissa to be
// zero and leaving the sign bit unaltered.
    if (x >= 0x7fff)
    { p.v[HIPART] |= INT64_C(0x7fff000000000000);
        p.v[HIPART] &= INT64_C(0xffff000000000000);
        p.v[LOPART] = 0;
        return p;
    }
// Using ldexp() to decrease an exponent can lead to underflow. The value
// 0 in x here would be the exponent one below that of the smallest
// normal number, so a value < -114 corresponds to a number so much smaller
// that it would not even qualify as a sub-norm. But even in that case
// I need to preserve the sign bit.
    else if (x < -114)
    { p.v[HIPART] &= INT64_C(
                         0x8000000000000000); // preserve sign of input
        p.v[LOPART] = 0;
        return p;
    }
// In the case that ldexp underflows I have to be especially careful
// because of the joys of sub-normal numbers and gradual underflow.
// I deal with this by first forcing the exponent to be one that will
// not lead to a sub-norm and then using a multiply to scale it down.
    if (x <= 0)
    { p.v[HIPART] = (p.v[HIPART] & INT64_C(0x8000ffffffffffff)) |
                    (static_cast<std::uint64_t>(x+4096) << 48);
// Dividing by f128_N1 (= 2^4096) undoes the bias just applied, letting
// the softfloat division deliver a correctly rounded sub-norm.
        p = f128_div(p, f128_N1);
    }
    else p.v[HIPART] = (p.v[HIPART] & INT64_C(0x8000ffffffffffff)) |
                           (static_cast<std::uint64_t>(x) << 48);
    return p;
}
4623
// Decompose a 128-bit float into a fraction in the range [0.5,1.0) and
// an integer exponent, along the lines of std::frexp. Zeros, infinities
// and NaNs are returned unchanged with the exponent set to 0.
inline float128_t frexp(float128_t p, int &x)
{ if (f128_zero(p) ||
        f128_infinite(p) ||
        f128_nan(p))
    { x = 0;
        return p;
    }
    int px = ((p.v[HIPART] >> 48) & 0x7fff);   // raw biased exponent field
// If I had a sub-normal number I will multiply it by 2^4096 before
// extracting its exponent. Doing that will have turned any non-zero
// sub-norm into a legitimate normalized number while not getting large
// enough to risk overflow...
    if (px == 0)
    { p = f128_mul(p, f128_N1);
        px = ((p.v[HIPART] >> 48) & 0x7fff) - 4096;
    }
// Now I can set the exponent field such that the resulting number is in
// the range 0.5 <= p < 1.0.
    p.v[HIPART] = (p.v[HIPART] & INT64_C(0x8000ffffffffffff)) |
                  (static_cast<std::uint64_t>(0x3ffe) << 48);
// .. and adjust the exponent value that I will return so it is as if the
// scaled mantissa is now exactly the same as the input.
    x = px - 0x3ffe;
    return p;
}
4649
// Return the fractional part and set i to the integer part. Since this is
// in C++ I can use a reference argument for i rather than a pointer, and I
// can overload the vanilla name "modf" in the style of C++11.
4653
inline float128_t modf(float128_t d, float128_t &i)
{ i = d;
// Extract the exponent: effectively the count of integer bits present.
    int x = ((d.v[HIPART] >> 48) & 0x7fff) - 0x3ffe;
// If |d| < 1.0 then the integer part is zero.
    if (x <= 0) i = f128_0;
// Next look at cases where the integer part will lie entirely within
// the high word.
    else if (x <= 49) // 49 not 48 because of hidden bit.
    { i.v[HIPART] &=
            ASR(static_cast<std::int64_t>(0xffff000000000000), x-1);
        i.v[LOPART] = 0;
    }
    else if (x <= 112)
    { i.v[LOPART] &= (-static_cast<std::uint64_t>(1)) << (113-x);
    }
// If the number is large enough then it is its own integer part, and
// the fractional part will be zero.
    else return f128_0;
// The subtraction here is exact and leaves just the fractional part.
    return f128_sub(d, i);
}
4675
4676 #endif // softfloat_h
4677
4678 // When doubles (and float128_t values where available) are to be
4679 // compared against a bignum to get proper results the double should
4680 // (at least in effect) be converted to a bignum. If one does the comparison
4681 // by converting both inputs to floating point (which may feel easier) there
4682 // are multiple problems. First the bignum might have a value outside
4683 // the range of floats, so you get overflow. Then it might differ from
// a float in a bit position several hundred below its most significant
4685 // one, and converting to a float would lose that information.
4686
4687 // double_to_bits() turns a floating point value into an integer plus
4688 // an exponent. It sets mantissa*2^exponent = d. This function will not
4689 // give sensible output if passed an infinity or a NaN and so they should be
4690 // filtered out before it is called.
4691
// Split a double into an integer mantissa and a binary exponent so that
// mantissa*2^exponent == d, with |mantissa| either zero or in the range
// [2^52, 2^53). This gives nonsense for an infinity or a NaN, so those
// must be filtered out by the caller.
inline void double_to_bits(double d, std::int64_t &mantissa,
                           int &exponent)
{ if (d == 0.0)
    { mantissa = 0;
        exponent = 0;
        return;
    }
    int e;
    double f = std::frexp(d, &e);  // |f| in [0.5,1.0) with d = f*2^e
// Scaling by 2^53 turns f into an integral value with no rounding, so
// the conversion to an integer below is always exact.
    f = std::ldexp(f, 53);
    mantissa = static_cast<std::int64_t>(f);
    exponent = e - 53;
}
4709
4710 // There are places where I need to shift a 128 or 192-bit number that is
4711 // represented using several int64 values...
4712
shiftleft(std::int64_t & hi,std::uint64_t & lo,int n)4713 inline void shiftleft(std::int64_t &hi, std::uint64_t &lo, int n)
4714 { if (n == 0) return;
4715 else if (n < 64)
4716 { hi = ASL(hi, n) | (lo >> (64-n));
4717 lo = lo << n;
4718 }
4719 else if (n == 64)
4720 { hi = lo;
4721 lo = 0;
4722 }
4723 else
4724 { hi = lo << (n-64);
4725 lo = 0;
4726 }
4727 }
4728
shiftleft(std::int64_t & hi,std::uint64_t & mid,std::uint64_t & lo,int n)4729 inline void shiftleft(std::int64_t &hi, std::uint64_t &mid,
4730 std::uint64_t &lo,
4731 int n)
4732 { if (n == 0) return;
4733 else if (n < 64)
4734 { hi = ASL(hi, n) | (mid >> (64-n));
4735 mid = (mid << n) | (lo >> (64-n));
4736 lo = lo << n;
4737 }
4738 else if (n == 64)
4739 { hi = mid;
4740 mid = lo;
4741 lo = 0;
4742 }
4743 else if (n < 128)
4744 { hi = (mid << (n-64)) | (lo >> (128-n));
4745 mid = lo << (n-64);
4746 lo = 0;
4747 }
4748 else if (n == 128)
4749 { hi = lo;
4750 mid = lo = 0;
4751 }
4752 else
4753 { hi = lo << (n-128);
4754 mid = lo = 0;
4755 }
4756 }
4757
shiftright(std::int64_t & hi,std::uint64_t & lo,int n)4758 inline void shiftright(std::int64_t &hi, std::uint64_t &lo, int n)
4759 { if (n == 0) return;
4760 else if (n < 64)
4761 { lo = (lo >> n) | ASL(hi, 64-n);
4762 hi = ASR(hi, n);
4763 }
4764 else if (n == 64)
4765 { lo = hi;
4766 hi = hi<0 ? -1 : 0;
4767 }
4768 else
4769 { lo = ASR(hi, n-64);
4770 hi = hi<0 ? -1 : 0;
4771 }
4772 }
4773
4774 // This next sets top and next to the two top 64-bit digits for a bignum,
4775 // and len to the length (measured in words) of that bignum. For values
4776 // |d| < 2^63 next will in fact be a signed value, len==1 and top will
4777 // in fact be irrelevant. This should be seen as a special degenerate case.
4778 // Whenever len>1 on output the number should be such that to make a bignum
4779 // with value to match the float you append len-2 zero words. Note that
4780 // for inputs in 2^63 <= d < 2^64 the result will have top==0 and next
4781 // the integer value of d and len==2, with something similar for the
4782 // equivalent negative range. The leading 0 or -1 is required in those
// cases. The result will be any fractional part left over when d is
// converted to an integer, and this can only be nonzero in cases where
4785 // |d| < 2^53.
4786 //
4787 // In the case that the floating point input is small its value may lie
4788 // between two integers, and in that case I might want to adjust it in the
4789 // sense of ROUND, TRUNC, FLOOR or CEILING. I will pass an extra argument
4790 // to explain which I require.
4791
4792 enum RoundingMode {ROUND, TRUNC, FLOOR, CEILING};
4793
// Convert a double into the shape a bignum would take without actually
// allocating one: top and next receive the two most significant 64-bit
// digits and len the total digit count (a result with len>1 is completed
// by appending len-2 zero digits below next - see the commentary above).
// Any fractional part of d is disposed of per the requested rounding mode.
inline void double_to_virtual_bignum(double d,
                                     std::int64_t &top, std::uint64_t &next, std::size_t &len,
                                     RoundingMode mode)
{ if (d == 0.0)
    { top = next = 0;
        len = 1;
        return;
    }
// NaN leads to a returned value with zero length. Having a zero length
// for a bignum is invalid, so this marks the situation well.
    else if (std::isnan(d))
    { top = next = 0;
        len = 0;
        return;
    }
// Infinities turn into values with maximum length and a top digit that
// captures the sign of the input.
    if (std::isinf(d))
    { top = d < 0.0 ? -1 : 1;
        next = 0;
        len = SIZE_MAX;
        return;
    }
// From here down I do not need to worry about zero, infinity or NaNs. But
// I may need to think about rounding!
    double intpart;
    double fracpart = std::modf(d, &intpart);
    std::int64_t mantissa;
    int exponent;
    double_to_bits(intpart, mantissa, exponent);
// Now I know intpart(d) = mantissa*2^exponent and mantissa is an integer.
// Isolate the lowest set bit of the mantissa...
    std::uint64_t lowbit = mantissa & -static_cast<std::uint64_t>
                           (mantissa);
    int lz = 63 - nlz(lowbit); // low zero bits
// ...and normalize so the mantissa is odd, compensating in the exponent.
    mantissa = ASR(mantissa, lz);
    exponent += lz;
// Now mantissa has its least significant bit a "1". At this stage the
// input 1.0 (eg) should have turned into mantissa=1, exponent==0. And 1.5
// should have become mantissa=1, exponent=0 and fracpart = 0.5. fracpart has
// the same sign as the original input.
// So now I can apply my rounding mode...
    switch (mode)
    { case ROUND:
            if (fracpart >= 0.5) mantissa++;
            else if (fracpart <= -0.5) mantissa--;
            break;
        case TRUNC: // the effect of modf is this already.
            break;
        case FLOOR:
            if (fracpart != 0.0 && d < 0.0) mantissa--;
            break;
        case CEILING:
            if (fracpart != 0.0 && d > 0.0) mantissa++;
            break;
    }
    next = static_cast<std::uint64_t>(mantissa);
// A sign-extension digit is needed above the mantissa for negative values.
    top = d<0.0 && mantissa!=0 ? -1 : 0;
    if (exponent < 0)
    { top = 0;
        len = 1;
    }
    else
    { len = 2 + exponent/64;
        exponent = exponent%64;
// Now shift left by exponent, which is less than 64 here.
        shiftleft(top, next, exponent);
// In some cases this has still left all the bits I care about in next,
// with top not needed, so in such cases I will shrink by a word.
        if (shrinkable(top, next))
        { top = next;
            next = 0;
            len--;
        }
    }
}
4869
4870 #ifdef softfloat_h
4871
// For float128_t the mantissa needs to be returned as a 128-bit integer, and
4873 // I do that as a pair of 64-bit integers here. Infinities and NaNs would
4874 // lead to nonsense output. Subnormal numbers are got wrong at present!
4875
float128_to_bits(float128_t d,std::int64_t & mhi,std::uint64_t & mlo,int & exponent)4876 inline void float128_to_bits(float128_t d,
4877 std::int64_t &mhi, std::uint64_t &mlo, int &exponent)
4878 { if (f128_nan(d) || f128_zero(d))
4879 { mhi = mlo = 0;
4880 exponent = 0;
4881 return;
4882 }
4883 else if (f128_infinite(d))
4884 { if (f128_lt(d, f128_0)) mhi = mlo = -1;
4885 else mhi = mlo = 0;
4886 exponent = INT_MAX;
4887 return;
4888 }
4889 // With float128_t the easier way to go is to access the bit-patterns.
4890 exponent = ((d.v[HIPART] >> 48) & 0x7fff);
4891 if (exponent == 0) // subnormal number
4892 { d = f128_mul(d, f128_N1);
4893 exponent -= 4096;
4894 }
4895 exponent -= 0x3ffe;
4896 mhi = (d.v[HIPART] & 0xffffffffffff) | 0x0001000000000000;;
4897 mlo = d.v[LOPART];
4898 if (static_cast<std::int64_t>(d.v[HIPART]) < 0)
4899 { mlo = -mlo;
4900 if (mlo == 0) mhi = -mhi;
4901 else mhi = ~mhi;
4902 }
4903 exponent -= 113;
4904 }
4905
// Add 1 to the 128-bit value {hi,lo}, propagating any carry.
inline void inc128(std::int64_t &hi, std::uint64_t &lo)
{ lo++;
    if (lo == 0) hi++;   // the low word wrapped, so carry into the top word
}
4909
// Subtract 1 from the 128-bit value {hi,lo}, propagating any borrow.
inline void dec128(std::int64_t &hi, std::uint64_t &lo)
{ if (lo == 0) hi--;   // the low word will wrap, so borrow from the top word
    lo--;
}
4913
4914 // For a float128_t value I need to generate (up to) 3 64-bit digits for
4915 // the way it would end up as a bignum.
4916
// Convert a 128-bit float into the shape a bignum would take without
// allocating one: up to three significant 64-bit digits {top,mid,next}
// plus the total digit count len. As with the double version, a NaN
// yields the (invalid) len==0, an infinity yields len==SIZE_MAX, and any
// fractional part is disposed of using the requested rounding mode.
inline void float128_to_virtual_bignum(float128_t d,
                                       std::int64_t &top, std::uint64_t &mid, std::uint64_t &next,
                                       std::size_t &len,
                                       RoundingMode mode)
{ if (f128_zero(d))
    { top = mid = next = 0;
        len = 1;
        return;
    }
// NaN gives a deliberately invalid zero length.
    else if (f128_nan(d))
    { top = mid = next = 0;
        len = 0;
        return;
    }
// Infinities give maximal length with the digits recording the sign.
    else if (f128_infinite(d))
    { if (f128_lt(d, f128_0)) top = mid = next = -1;
        else top = mid = next = 0;
        len = SIZE_MAX;
        return;
    }
// Split into integer and fractional parts, then the integer part into a
// 128-bit mantissa and a binary exponent.
    float128_t intpart;
    float128_t fracpart = modf(d, intpart);
    std::int64_t mhi;
    std::uint64_t mlo;
    int exponent;
    float128_to_bits(intpart, mhi, mlo, exponent);
// Now I know intpart(d) = mantissa*2^exponent and mantissa is an integer.
// Count the trailing zero bits of the 128-bit mantissa...
    int lz;
    if (mlo != 0)
    { std::uint64_t lowbit = mlo & (-mlo);
        lz = 63 - nlz(lowbit); // low zero bits
    }
    else
    { std::uint64_t lowbit = mhi & (-static_cast<std::uint64_t>(mhi));
        lz = 64 + 63 - nlz(lowbit); // low zero bits
    }
// ...and normalize so the mantissa is odd, compensating in the exponent.
    shiftright(mhi, mlo, lz);
    exponent += lz;
// Now mantissa has its least significant bit a "1". At this stage the
// input 1.0 (eg) should have turned into mantissa=1, exponent==0. And 1.5
// should have become mantissa=1, exponent=0 and fracpart = 0.5. fracpart has
// the same sign as the original input.
// So now I can apply my rounding mode...
    switch (mode)
    { case ROUND:
            if (!f128_lt(fracpart, f128_half)) inc128(mhi, mlo);
            else if (f128_le(fracpart, f128_mhalf)) dec128(mhi, mlo);
            break;
        case TRUNC: // the effect of modf is this already.
            break;
        case FLOOR:
            if (!f128_zero(fracpart) && f128_lt(d, f128_0)) dec128(mhi, mlo);
            break;
        case CEILING:
            if (!f128_zero(fracpart) && !f128_lt(d, f128_0)) inc128(mhi, mlo);
            break;
    }
// Now I need to shift things left so that the number of trailing zeros
// to the right of my value is a multiple of 64. That may cause the
// mantissa to spread into parts of 3 words: (top, mid, next).

    next = mlo;
    mid = mhi;
    top = mhi<0 ? -1 : 0;   // sign-extension digit for negative values
    if (exponent < 0)
    { top = 0;
        len = 1;
    }
    else
    { len = 3 + exponent/64;
        exponent = exponent%64;
        shiftleft(top, mid, next, exponent);
// In some cases this has still left all the bits I care about in next,
// so shrink by up to two digits while the leading one is redundant.
        if (shrinkable(top, mid))
        { top = mid;
            mid = next;
            next = 0;
            len--;
        }
        if (shrinkable(top, mid))
        { top = mid;
            mid = next;
            next = 0;
            len--;
        }
    }
}
5004
5005 #endif // softfloat_h
5006
double_to_int(double d,RoundingMode mode)5007 inline std::intptr_t double_to_int(double d, RoundingMode mode)
5008 {
5009 // I return 0 if the input is a NaN or either +infinity or -infinity.
5010 // This is somewhat arbitrary, but right now I am not minded to raise an
5011 // exception.
5012 if (!std::isfinite(d) || d==0.0) return int_to_handle(0);
5013 std::int64_t top;
5014 std::uint64_t next;
5015 std::size_t len;
5016 double_to_virtual_bignum(d, top, next, len, mode);
5017 std::uint64_t *r = reserve(len);
5018 if (len == 1) r[0] = top;
5019 else
5020 { for (std::size_t i=0; i<len-2; i++) r[i] = 0;
5021 r[len-1] = top;
5022 r[len-2] = next;
5023 }
5024 return confirm_size(r, len, len);
5025 }
5026
// Convenience wrappers converting a double to an integer with each of
// the four rounding modes.

inline std::intptr_t round_double_to_int(double d)
{ return double_to_int(d, RoundingMode::ROUND);
}

inline std::intptr_t trunc_double_to_int(double d)
{ return double_to_int(d, RoundingMode::TRUNC);
}

inline std::intptr_t floor_double_to_int(double d)
{ return double_to_int(d, RoundingMode::FLOOR);
}

inline std::intptr_t ceiling_double_to_int(double d)
{ return double_to_int(d, RoundingMode::CEILING);
}
5042
5043 #ifdef softfloat_h
5044
// Convert a 128-bit float to a bignum using the indicated rounding mode.
// Zeros, infinities and NaNs all (arbitrarily) map onto the fixnum zero
// rather than raising an exception.
inline std::intptr_t float128_to_int(float128_t d, RoundingMode mode)
{ if (f128_zero(d) ||
        f128_infinite(d) ||
        f128_nan(d)) return int_to_handle(0);
    std::int64_t top;
    std::uint64_t mid, next;
    std::size_t len;
    float128_to_virtual_bignum(d, top, mid, next, len, mode);
    std::uint64_t *r = reserve(len);
// Place the (up to three) significant digits at the top of the number
// and pad below with zero digits.
    if (len == 1) r[0] = top;
    else if (len == 2)
    { r[1] = top;
        r[0] = mid;
    }
    else
    { for (std::size_t i=0; i<len-3; i++) r[i] = 0;
        r[len-1] = top;
        r[len-2] = mid;
        r[len-3] = next;
    }
    return confirm_size(r, len, len);
}
5067
// Convenience wrappers converting a 128-bit float to an integer with
// each of the four rounding modes.

inline std::intptr_t round_float128_to_int(float128_t d)
{ return float128_to_int(d, RoundingMode::ROUND);
}

inline std::intptr_t trunc_float128_to_int(float128_t d)
{ return float128_to_int(d, RoundingMode::TRUNC);
}

inline std::intptr_t floor_float128_to_int(float128_t d)
{ return float128_to_int(d, RoundingMode::FLOOR);
}

inline std::intptr_t ceiling_float128_to_int(float128_t d)
{ return float128_to_int(d, RoundingMode::CEILING);
}
5083
5084 #endif // softfloat_h
5085
// Fetch the least significant 64 bits of a number, whether it is held as
// an immediate fixnum or as a bignum digit vector. The signed variants:

inline std::int64_t Int64_t::op(std::int64_t a)
{ return a;
}

inline std::int64_t Int64_t::op(std::uint64_t *a)
{ return static_cast<std::int64_t>(a[0]);
}

// ...and the unsigned variants, which just reinterpret the bit pattern.

inline std::uint64_t Uint64_t::op(std::int64_t a)
{ return static_cast<std::uint64_t>(a);
}

inline std::uint64_t Uint64_t::op(std::uint64_t *a)
{ return a[0];
}
5101
5102 // A cast from a double to a float is entitled, by the C++ standard to
5103 // make a system-defined choice as to whether to round up or down.
5104 // I want to guarantee to follow IEEE round-to-nearest-with-tie-break-
5105 // -to-even, and so I will write a messy function here to achieve that and
5106 // hence end up with better portability.
5107 //
5108 // The two things I think are illustrated here are
5109 // (1) How horrible this is!
5110 // (2) That C++11 manages to provide enough facilities for me to implement it
5111 // in a manner that I believe is standards-conforming at least on IEEE
5112 // platforms with the rounding-style set normally.
5113
// Convert a double to a float with IEEE round-to-nearest (ties to even)
// guaranteed, regardless of the direction the implementation chooses for
// a narrowing cast. NaN maps to a float NaN, and out-of-range values map
// to +/-HUGE_VALF (an IEEE infinity on all modern systems).
inline float cast_to_float(double d)
{
// If the argument is a NaN then return a NaN of type float.
    if (std::isnan(d)) return std::nanf("");
// In C++ a narrowing cast here where the result would be out of range gives
// undefined behaviour, so I need to filter that case first. I am going
// to allow double values that are up to FLT_MAX*(1+2^(-24)) to round
// down to FLT_MAX - beyond that lies overflow reported as HUGE_VALF which
// on all modern systems will be an IEEE infinity.
    double limit = static_cast<double>(FLT_MAX) +
                   static_cast<double>(FLT_MAX) /
                   static_cast<double>(0x1000000);
    if (d >= limit) return HUGE_VALF;
    else if (d <= -limit) return -HUGE_VALF;
    else if (d >= static_cast<double>(FLT_MAX)) return FLT_MAX;
// The clamp on the negative side must mirror the positive one: clamping
// at (positive) FLT_MIN here would map every negative input to FLT_MIN.
    else if (d <= -static_cast<double>(FLT_MAX)) return -FLT_MAX;
// Now I am not going to get any overflow - whew.
    float r1 = static_cast<float>(d);
// If the conversion was exact I do not have anything more to do!
    if (static_cast<double>(r1) == d) return r1;
    double err1 = static_cast<double>(r1) - d;
    float r2;
    double err2;
// Now I am going to find the next consecutive floating point value (in
// the correct direction) so that r1 and r2 are two values with d between
// them. Using +/-HUGE_VALF as the direction argument guarantees the step
// is towards the relevant infinity even for negative or sub-normal r1.
    if (err1 > 0.0)
    { r2 = std::nextafterf(r1, -HUGE_VALF);
        err2 = d - static_cast<double>(r2);
    }
    else
    { r2 = std::nextafterf(r1, HUGE_VALF);
        err2 = static_cast<double>(r2) - d;
        err1 = -err1;
    }
    if (err1 < err2) return r1;
    else if (err2 < err1) return r2;
// Here I am at a half-way point. Adding the error to the lower candidate
// result lets the hardware's own rounding resolve the tie following the
// "rounding style" in force - which I jolly well expect to be IEEE!
    return (r1 < r2 ? r1 : r2) + static_cast<float>(err1);
}
5156
5157 // On Cygwin (at least) the std::ldexpf function that is part of C++11
5158 // is hidden in the header file perhaps because of issues about thread
5159 // safety in its implementation. I reason here that converting from a
5160 // float to a double will never lose anything, then ldexp() can be used.
// The cast back to a float can not introduce rounding, but might notice
5162 // overflow leading to a result that is an IEEE infinity.
5163
ldexpf(float a,int n)5164 inline float ldexpf(float a, int n)
5165 { return cast_to_float(std::ldexp(static_cast<double>(a), n));
5166 }
5167
op(std::int64_t a)5168 inline float Float::op(std::int64_t a)
5169 {
5170 // if |a| < 2^52 I can convert to a double exactly
5171 if (a > -0x10000000000000 && a < 0x10000000000000)
5172 return cast_to_float(static_cast<double>(a));
5173 std::int64_t hi = a & 0xfffffc0000000000; // 22 bits
5174 std::int64_t mid = a & 0x000003fffff00000; // 22 bits
5175 std::int64_t lo = a & 0x00000000000fffff; // 20 bits
5176 if (hi == 0 || hi == 0xfffffc000000000)
5177 return cast_to_float(static_cast<double>(hi) +
5178 static_cast<double>(mid) + static_cast<double>(lo));
5179 // This next line will move a ">0.5ulp" case so that it is visible
5180 // within just the high 44 bits. This is because the whole number can
5181 // only be a 0.5ulp case if all the bits below the top 24 are zero, and
5182 // for that to happen certainly the low 20 bits must all be zero...
5183 if (lo != 0) mid |= 1;
5184 return cast_to_float(static_cast<double>(hi) +
5185 static_cast<double>(mid));
5186 }
5187
// Convert a bignum to a float, rounding IEEE-style: take the top 24 bits
// of the magnitude together with a "sticky" record of anything nonzero
// below them, round to 24 bits by hand and scale by the right power of 2.
inline float Float::op(std::uint64_t *a)
{ std::size_t lena = number_size(a);
    if (lena == 1) return Float::op(static_cast<std::int64_t>(a[0]));
// Now I need to do something similar to that done for the int64_t case
// but written larger. Specifically I want to split my input number into
// its top 24 bits and then all the rest. I will take separate paths
// for the positive and negative cases.
    std::uint64_t top24;
    int lz;
    bool sign = false;
    std::uint64_t top, next;
// carried records whether every digit below the top two is zero: it is
// needed both when negating and when forming the sticky bit below.
    bool carried = true;
    for (std::size_t i=0; i<lena-2; i++)
    { if (a[i] != 0)
        { carried = false;
            break;
        }
    }
// Grab the top 128 bits of the number as {top,next}.
    top = a[lena-1];
    next = a[lena-2]; // lena >= 2 here
// Take its absolute value.
    if (negative(top))
    { sign = true;
        top = ~top;
        next = ~next;
        if (carried)
        { next++;
            if (next == 0) top++;
        }
    }
// Fold any nonzero bits from the lower digits into the bottom bit of
// next so that rounding can see them (a "sticky" bit).
    if (!carried) next |= 1;
// Now I need to do something very much like the code for the int64_t case.
    if (top == 0) lz = nlz(next) + 64;
    else lz = nlz(top);
//
// uint64_t top24 = {top,next} >> (128-24-lz);
    int sh = 128-24-lz;
// Note that sh can never be zero here.
    if (sh < 64) top24 = (next >> sh) | (top << (64-sh));
    else top24 = top >> (sh-64);
//
// {top,next} = {top,next} << lz+24; // keep only the fraction bits
    sh = lz+24;
    if (sh < 64)
    { top = (top << sh) | (next >> (64-sh));
        next = next << sh;
    }
    else
    { top = next << (sh - 64);
        next = 0;
    }
//
// Round to nearest with ties to even, judged on the discarded fraction:
// if ({top,next} > 0x80000000000000000000000000000000U) top24++;
// else if ({top,next} == 0x80000000000000000000000000000000U)
// top24 += (top24 & 1);
    if (top > 0x8000000000000000U) top24++;
    else if (top == 0x8000000000000000U)
    { if (next != 0) top24++;
        else top24 += (top24&1);
    }
    arithlib_assert(top24 >= (static_cast<std::int64_t>(1))<<23 &&
                    top24 <= (static_cast<std::int64_t>(1))<<24);
// NOTE(review): d is declared double but initialized from a float cast;
// since top24 fits in 25 bits the value is float-exact either way, so
// behaviour is unchanged - just a slightly odd-looking declaration.
    double d = static_cast<float>(top24);
    arithlib_assert(top24 == static_cast<std::uint64_t>(d));
    if (sign) d = -d;
    return ldexpf(d, static_cast<int>(128-24-lz+64*(lena-2)));
}
5256
op(std::int64_t a,std::int64_t & x)5257 inline double Frexp::op(std::int64_t a, std::int64_t &x)
5258 {
5259 // The bad news here is that I am not confident that C++ will guarantee
5260 // to round large integer values in any particular way when it converts
5261 // them to floating point. So I will take careful action so that the
5262 // conversions that I do are ones that will be exact, and I will perform
5263 // rounding in IEEE style myself.
5264 // First I will see if the value is small enough that I can work directly.
5265 const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
5266 if (a >= -range && a <= range) return static_cast<double>(a);
5267 // I will now drop down to a sign and magnitude representation
5268 bool sign = a < 0;
5269 std::uint64_t top53 = sign ? -static_cast<std::uint64_t>(a) : a;
5270 // Because top53 >= 2^53 the number of leading zeros in its representation is
5271 // at most 10. Ha ha. That guaranteed that the shift below will not overflow
5272 // and is why I chose my range as I did.
5273 int lz = nlz(top53);
5274 std::uint64_t low = top53 << (lz+53);
5275 top53 = top53 >> (64-53-lz);
5276 if (low > 0x8000000000000000U) top53++;
5277 else if (low == 0x8000000000000000U) top53 += (top53 &
5278 1); // round to even
5279 arithlib_assert(top53 >= (static_cast<std::int64_t>(1))<<52 &&
5280 top53 <= (static_cast<std::int64_t>(1))<<53);
5281 // The next line should never introduce any rounding at all.
5282 double d = static_cast<double>(top53);
5283 arithlib_assert(top53 == static_cast<std::uint64_t>(d));
5284 if (sign) d = -d;
5285 x =64-53-lz;
5286 return d;
5287 }
5288
op(std::int64_t a)5289 inline double Double::op(std::int64_t a)
5290 {
5291 // One would obviously like to go "return (double)a;" however C++ says
5292 // "If the value being converted is in the range of values that can
5293 // be represented but the value cannot be represented exactly, it is
5294 // an implementation-defined choice of either the next lower or higher
5295 // representable value."
5296 // and I feel I should guarantee to round in IEEE style. I can do that
5297 // by splitting the integer into two parts. Each of the two casts can deliver
5298 // a double precision result without need for rounding
5299 std::int64_t hi = a & 0xffffffff00000000;
5300 std::int64_t lo = a & 0x00000000ffffffff;
5301 double d = static_cast<double>(lo);
5302 return d + static_cast<double>(hi);
5303 }
5304
// Decompose a bignum into a double d and exponent x with a = d*2^x,
// performing the rounding to 53 bits IEEE-style (nearest, ties to even)
// by hand so the result does not depend on platform cast behaviour.
inline double Frexp::op(std::uint64_t *a, std::int64_t &x)
{ std::size_t lena = number_size(a);
    if (lena == 1) return Frexp::op(static_cast<std::int64_t>(a[0]), x);
// Now I need to do something similar to that done for the int64_t case
// but written larger. Specifically I want to split my input number into
// its top 53 bits and then all the rest. I will take separate paths
// for the positive and negative cases.
    std::uint64_t top53;
    int lz;
    bool sign = false;
    std::uint64_t top, next;
// carried records whether every digit below the top two is zero: it is
// needed both when negating and when forming the sticky bit below.
    bool carried = true;
    for (std::size_t i=0; i<lena-2; i++)
    { if (a[i] != 0)
        { carried = false;
            break;
        }
    }
// Grab the top 128 bits of the number as {top,next}.
    top = a[lena-1];
    next = a[lena-2]; // lena >= 2 here
// Take its absolute value.
    if (negative(top))
    { sign = true;
        top = ~top;
        next = ~next;
        if (carried)
        { next++;
            if (next == 0) top++;
        }
    }
// Fold any nonzero bits from the lower digits into the bottom bit of
// next so that rounding can see them (a "sticky" bit).
    if (!carried) next |= 1;
// Now I need to do something very much like the code for the int64_t case.
    if (top == 0) lz = nlz(next) + 64;
    else lz = nlz(top);
//
// uint64_t top53 = {top,next} >> (128-53-lz);
    int sh = 128-53-lz;
// Note that sh can never be zero here.
    if (sh < 64) top53 = (next >> sh) | (top << (64-sh));
    else top53 = top >> (sh-64);
//
// {top,next} = {top,next} << lz+53; // keep only the fraction bits
    sh = lz+53;
    if (sh < 64)
    { top = (top << sh) | (next >> (64-sh));
        next = next << sh;
    }
    else
    { top = next << (sh - 64);
        next = 0;
    }
//
// Round to nearest with ties to even, judged on the discarded fraction:
// if ({top,next} > 0x80000000000000000000000000000000U) top53++;
// else if ({top,next} == 0x80000000000000000000000000000000U)
// top53 += (top53 & 1);
    if (top > 0x8000000000000000U) top53++;
    else if (top == 0x8000000000000000U)
    { if (next != 0) top53++;
        else top53 += (top53&1);
    }
    arithlib_assert(top53 >= (static_cast<std::int64_t>(1))<<52 &&
                    top53 <= (static_cast<std::int64_t>(1))<<53);
// This conversion is exact: top53 fits in at most 54 bits and even the
// rounding carry can only take it to exactly 2^53.
    double d = static_cast<double>(top53);
    arithlib_assert(top53 == static_cast<std::uint64_t>(d));
    if (sign) d = -d;
    x = 128-53-lz+64*(lena-2);
    return d;
}
5374
op(std::uint64_t * a)5375 inline double Double::op(std::uint64_t *a)
5376 { std::int64_t x = 0;
5377 double d = Frexp::op(a, x);
5378 if (x > 10000) x = 10000;
5379 return std::ldexp(d, static_cast<int>(x));
5380 }
5381
5382 #ifdef softfloat_h
5383
op(std::int64_t a)5384 inline float128_t Float128::op(std::int64_t a)
5385 { return i64_to_f128(a);
5386 }
5387
op(std::int64_t a,std::int64_t & x)5388 inline float128_t Frexp128::op(std::int64_t a, std::int64_t &x)
5389 { float128_t d = i64_to_f128(a), d1;
5390 int xi = 0;
5391 f128M_frexp(&d, &d1, &xi); // in the CSL sources.
5392 x = xi;
5393 return d1;
5394 }
5395
// Decompose a bignum into a float128 "fraction" and a power-of-two
// exponent, in the style of frexp(). The returned value holds the top
// 113 bits of |a| (rounded to nearest, ties to even, then signed) and x
// is set so the original value is d * 2^x under this code's convention.
inline float128_t Frexp128::op(std::uint64_t *a, std::int64_t &x)
{ std::size_t lena = number_size(a);
  if (lena == 1) return Float128::op(static_cast<std::int64_t>(a[0]));
  std::uint64_t top113, top113a;
  int lz;
  bool sign = false;
  std::uint64_t top, next1, next2;
// "carried" records whether every word below the top three is zero. It
// controls both the carry propagation when negating and the sticky bit
// used for round-to-nearest.
  bool carried = true;
// NOTE(review): when lena==2 the bound lena-3 wraps around as size_t, so
// this loop inspects a[0] (which is also next1 below) and potentially
// words beyond the number - confirm whether lena==2 was meant to bypass
// this scan entirely.
  for (std::size_t i=0; i<lena-3; i++)
  { if (a[i] != 0)
    { carried = false;
      break;
    }
  }
// Grab the top 192 bits of the number as {top,next1,next2}.
  top = a[lena-1];
  next1 = a[lena-2];
  next2 = lena==2 ? 0 : a[lena-3];
// Take its absolute value (2s complement negate, with the carry only
// rippling in if all the discarded low words were zero).
  if (negative(top))
  { sign = true;
    top = ~top;
    next1 = ~next1;
    next2 = ~next2;
    if (carried)
    { next2++;
      if (next2 == 0)
      { next1++;
        if (next1 == 0) top++;
      }
    }
  }
// Sticky bit standing in for all the discarded non-zero low words.
  if (!carried) next2 |= 1;
// I now have {top,next1,next2} the top 192 bits of my integer. top may be
// zero, but if it is then next1 will have its top bit set, and so within
// these bits I certainly have the 113 that I need to obtain an accurate
// floating point value.
  if (top == 0) lz = nlz(next1) + 64;
  else lz = nlz(top);
//
//  {top113,top113a} = {top,next1,next2} >> (192-113-lz);
  int sh = 192-113-lz;
// Note that sh can never be zero here.
  if (sh < 64)
  { top113 = (next1 >> sh) | (top << (64-sh));
    top113a = (next2 >> sh) | (next1 << (64-sh));
  }
  else
  { top113 = top >> (sh-64);
    top113a = (next1 >> (sh-64)) | (top << (128-sh));
  }
//
//  {top,next1,next2} = {top,next1,next2} << lz+113; // keep only fraction bits
  sh = lz+113;
  if (sh < 64)
  { top = (top << sh) | (next1 >> (64-sh));
    next1 = (next1 << sh) | (next2 >> (64-sh));
    next2 = next2 << sh;
  }
  else
  {
// NOTE(review): this branch looks suspect - next1 is shifted into both
// top and next1 (so next2's high bits never reach top) and the shift
// count 129-sh does not match the 64/128 pattern used elsewhere. Compare
// with the two-word branch of the double-precision Frexp - confirm.
    top = next1 << (sh - 64);
    next1 = (next1 << (sh-64)) | (next2 >> (129-sh));
    next2 = 0;
  }
//
// Round to nearest: if the discarded fraction {top,next1,next2} exceeds
// one half, round up; if it is exactly one half, round to even.
  if (top > 0x8000000000000000U)
  { top113a++;
    if (top113a == 0) top113++;
  }
  else if (top == 0x8000000000000000U)
  { if (next1 != 0 || (next1==0 && next2!=0)) // strictly above one half
    { top113a++;
      if (top113a == 0) top113++;
    }
    else top113 += add_with_carry(top113a, top113a&1, top113a);
  }
// Assemble the 113-bit result {top113,top113a} as top113*2^64 + top113a;
// both conversions are exact.
  float128_t d = i64_to_f128(top113);
  float128_t two32 = i64_to_f128(0x100000000);
  d = f128_add(f128_mul(f128_mul(two32, two32), d),
               ui64_to_f128(top113a));
  if (sign) d = f128_sub(i64_to_f128(0), d);
// NOTE(review): the double-precision version uses 64*(lena-2) after
// consuming the top TWO words; here three words were consumed, which
// would suggest 64*(lena-3) - confirm the intended exponent convention.
  x = 192-113-lz+64*(lena-2);
  return d;
}
5484
op(std::uint64_t * a)5485 inline float128_t Float128::op(std::uint64_t *a)
5486 { std::int64_t x = 0;
5487 float128_t d = Frexp128::op(a, x);
5488 if (x > 100000) x = 100000;
5489 // There is an implementation of ldexp() for 128-bit floats in
5490 // the CSL source file arith14.cpp.
5491 f128M_ldexp(&d, static_cast<int>(x));
5492 return d;
5493 }
5494
5495 #endif // softfloat_t
5496
// 10^19 is the largest power of 10 that fits within an unsigned 64-bit
// word; it is the radix used when converting between binary and decimal.
INLINE_VAR const std::uint64_t ten19 = UINT64_C(10000000000000000000);
5498
string_to_bignum(const char * s)5499 inline std::intptr_t string_to_bignum(const char *s)
5500 { bool sign = false;
5501 if (*s == '-')
5502 { sign = true;
5503 s++;
5504 }
5505 std::size_t chars = std::strlen(s);
5506 std::size_t words = 1 + (108853*static_cast<std::uint64_t>
5507 (chars))/0x200000;
5508 // I have predicted the number of 64-bit digits that will be needed to
5509 // represent an s-digit (decimal) number based an approximation
5510 // 108853/2^21 for log(10)/log(2^64). In 64-bit arithmetic the numerator
5511 // here will not overflow until you have an improbable string of length
5512 // 2^47 as input! The division by a power of 2 should be done very
5513 // rapidly as a shift. I rather expect this calculation to give a rather
5514 // good measure of how many 64-bit words will be needed! It must never be an
5515 // overestimate so that the vector that I allocate never overflows. Somewhat
5516 // rarely it will be and overestimate and it will be necessary to trim the
5517 // vector at the end.
5518 std::uint64_t *r = reserve(words);
5519 for (std::size_t i=0; i<words; i++) r[i] = 0;
5520 // Now for each chunk of digits NNNN in the input I want to go in effect
5521 // r = 10^19*r + NNNN;
5522 // where the number 19 is used because 10^19 is the largest power of 10
5523 // that fits in a 64-bit word.
5524 std::size_t next = 19*((chars-1)/19);
5525 while (chars != 0)
5526 { std::uint64_t d = 0;
5527 // assemble 19 digit blocks from the input into a value (d).
5528 while (chars != next)
5529 { arithlib_assert(std::isdigit(*s));
5530 d = 10*d + (*s++ - '0');
5531 chars--;
5532 }
5533 next -= 19;
5534 // now perform r = 10^19*r + d to consolidate into the eventual result.
5535 for (std::size_t i=0; i<words; i++)
5536 multiply64(r[i], ten19, d, d, r[i]);
5537 }
5538 std::size_t n1 = words;
5539 // Here I may be negating a positive number, and in 2s complement that
5540 // can never lead to a number growing in length.
5541 if (sign)
5542 { internal_negate(r, words, r);
5543 truncate_negative(r, n1);
5544 }
5545 // However I could not have been precisely certain how many 64-bit words were
5546 // needed and I arranged that any error was conservative - ie allocating
5547 // more that would eventually be used.
5548 else truncate_positive(r, n1);
5549 return confirm_size(r, words, n1);
5550 }
5551
// The next functions are key ones for printing values. They convert a
// bignum so that it is still stored as a sequence of digits each within
// a 64-bit word, but now each digit will be in the range 0 - (10^19-1)
// so that the value is in effect represented base 10^19. From that state
// printing it in decimal becomes easy!
5557
5558
5559 // This first one takes a number represented base 2^64 with digits
5560 // 0 to n-1 and divides it by 10^19, returning the remainder and
5561 // setting both the digits and its length suitably to be the quotient.
5562 // The number is POSITIVE here. Note that the function overwrites its input
5563 // with the quotient.
5564
5565
short_divide_ten_19(std::uint64_t * r,std::size_t & n)5566 inline std::uint64_t short_divide_ten_19(std::uint64_t *r,
5567 std::size_t &n)
5568 { std::uint64_t hi = 0;
5569 std::size_t i=n-1;
5570 for (;;)
5571 { divide64(hi, r[i], ten19, r[i], hi);
5572 if (i == 0) break;
5573 i--;
5574 }
5575 if (r[n-1] == 0) n--;
5576 return hi;
5577 }
5578
5579 // How many bits are there in a bignum?
5580
// Note that if a bignum occupies over 1/8 of your total memory then
// the number of bits it uses might overflow size_t. On a 32-bit system
// this might happen if the number occupies over 512 Mbytes, and I view
// that as a situation I will accept as a limit for 32-bit platforms.
5585
bignum_bits(const std::uint64_t * a,std::size_t lena)5586 inline std::size_t bignum_bits(const std::uint64_t *a,
5587 std::size_t lena)
5588 { if (lena == 0 && a[0] == 0) return 1; // say that 0 has 1 bit.
5589 std::uint64_t top = a[lena-1]; // top digit.
5590 // The exact interpretation of "the length in bits of a negative number"
5591 // is something I need to think through. Well Common Lisp counts the
5592 // number of bits apart from the sign bit, so we have
5593 // n bignum_bits(n) bignum_bits(-n)
5594 // 0 0 0
5595 // 1 1 1 0 ..11111:
5596 // 2 2 10 1 ..1111:0
5597 // 3 2 11 2 ..111:01
5598 // 4 3 100 2 ..111:00
5599 // 7 3 111 3 ..11:001
5600 // 8 4 1000 3 ..11:000
5601 if (negative(top))
5602 { std::uint64_t carry = 1;
5603 for (std::size_t i=0; i<lena; i++)
5604 { top = ~a[i] + carry;
5605 carry = (top < carry ? 1 : 0);
5606 }
5607 top--;
5608 }
5609 return 64*(lena-1) + (top==0 ? 0 : 64-nlz(top));
5610 }
5611
5612 // I want an estimate of the number of bytes that it will take to
5613 // represent a number when I convert it to a string.
5614 //
5615 // I will work through an example. Consider the input 12024932 = 0xb77c64.
5616 // [I use this value because at one time it revealed a mistake I had made!]
5617 // This value uses 24 bits, ie its value is at least 2^23 (8388608) and
5618 // it is less than 2^26 (16777216). log10(2^24) is 7.2247... so in decimal
5619 // the number will use 7.2 digits, well that must be rounded up to 8.
// log10(2^24) = 24*log10(2) = 24*0.301030.. < 24*(617/2048) [because that
// fraction = 0.30127.. > log10(2)]. So the number of decimal digits that
// can be generated will be at most ceil(24*617/2048). I will compute that by
// forming a quotient that is truncated towards zero and then adding 1, and
// in this case this yields 8 as required. For negative numbers I will add 1
// to allow for a "-" sign.
5626
predict_size_in_bytes(const std::uint64_t * a,std::size_t lena)5627 inline std::size_t predict_size_in_bytes(const std::uint64_t *a,
5628 std::size_t lena)
5629 {
5630 // I am first going to estimate the size in BITS and then I will
5631 // see how that maps onto bytes.
5632 std::size_t r = bignum_bits(a, lena);
5633 r = 1 + static_cast<std::size_t>(
5634 (617*static_cast<std::uint64_t>(r))/2048);
5635 if (negative(a[lena-1])) r += 2; // allow space for a "-" sign.
5636 return r;
5637 }
5638
bignum_to_string_length(std::uint64_t * a,std::size_t lena)5639 inline std::size_t bignum_to_string_length(std::uint64_t *a,
5640 std::size_t lena)
5641 { if (lena == 1)
5642 { std::int64_t v = a[0];
5643 // Note that the negative numbers checked against are 1 digit shorter so as
5644 // to allow space for the "-" sign.
5645 if (v <= 9999999 && v >= -999999) return 7;
5646 else if (v <= 999999999999999 && v >= -99999999999999) return 15;
5647 else return 23;
5648 }
5649 else return predict_size_in_bytes(a, lena);
5650 }
5651
5652 // The "as_unsigned" option here is not for general use - it is JUST for
5653 // internal debugging because at times I work with values that are known
5654 // to be positive and so where the top digit must be treated as unsigned...
5655
// Convert the bignum {a,lena} to decimal text written into result (a
// buffer of m bytes, as sized by bignum_to_string_length) and return the
// number of characters used. With as_unsigned true the top word is
// treated as unsigned so no "-" is ever produced (debugging aid only,
// as explained above).
inline std::size_t bignum_to_string(char *result, std::size_t m,
                                    std::uint64_t *a, std::size_t lena,
                                    bool as_unsigned=false)
{
// Making one-word numbers a special case simplifies things later on! It may
// also make this case go just slightly faster.
  if (lena == 1)
  { std::uint64_t v = a[0];
    bool sign;
    if (negative(v) && !as_unsigned)
    { sign = true;
      v = -v;       // unsigned negate: yields |v| even for INT64_MIN.
    }
    else sign = false;
    char buffer[24];
    std::size_t len = 0;
    while (v != 0)
    { buffer[len++] = '0' + v%10;
      v = v/10;
    }
// Now I have the decimal digits of the number in my buffer, with the
// least significant first and the most significant last. Insert the sign
// character if needed (and deal with the special case of zero).
    if (sign) buffer[len++] = '-';
    else if (len == 0) buffer[len++] = '0';
    for (std::size_t i=0; i<len; i++) result[i] = buffer[len-i-1];
    return len;
  }
// The size (m) for the block of memory that I put my result in is
// such that it could hold the string representation of my input, and
// I estimate that via predict_size_in_bytes(). Well the smallest bignum
// that will need 2 words will be {0,0x8000000000000000}, ie 2^63. That
// will need 19 decimal digits plus space for a sign, so there will be
// at least 20 bytes allocated for the printed representation of any 2-word
// bignum, and at least 40 for a 3-word value, at least 59 for a 4-word one
// etc. This means that the space I will allocate here for the result
// leaves me with plenty of workspace to use while constructing the
// output string. The case liable to be tightest will be that of the
// smallest 2-word bignum, so if I ensure that is OK all the rest will
// certainly be safe.
//
// I am going to build up (decimal) digits of the converted number by
// repeatedly dividing by 10^19. Each time I do that the remainder I
// am left with is the next low 19 decimal digits of my number. Doing the
// divisions needs a vector to store the number I am dividing by 10^19 and
// to put the quotient, and I do not want to corrupt my original input, so
// I will copy my input into a fresh vector. And I will force it to be
// positive. The made-positive version might have a leading digit with
// its top bit set - that will not worry me because I view it as unsigned.
//
// I have allocated the space that will be needed for the eventual string of
// characters. I will use that space to save numeric values along the way, so
// here I cast so I can use that same memory as a vector of 64-bit integers.
// I will only ever access data in the format that it was placed into memory!
// Note that this will assume that the string data was allocated so as to
// be aligned suitably for uint64_t values.
  std::uint64_t *r = reinterpret_cast<std::uint64_t *>(result);
  std::size_t i;
// For the edge case lena==2 and m==20. I copy 2 words across. That will leave
// 4 bytes unused.
  for (i=0; i<lena; i++) r[i] = a[i];
  for (; i<m/sizeof(std::uint64_t); i++) r[i] = 0;
// Make the number positive
  bool sign = false;
  if (negative(r[lena-1]) && !as_unsigned)
  { sign = true;
    internal_negate(r, lena, r);
  }
// Now my number is positive and is of length lena, but the vector it is
// stored in is length m with m usefully larger than lena. I will repeatedly
// divide by 10^19 and each time I do that I can store the remainder working
// down from the top of the vector. That should JUST keep up so that I
// never overwrite digits of the reducing part! I will stop when the
// number I have been working with ends up < 10^19.
  std::size_t p = m/sizeof(std::uint64_t)
                  -1; // where to put next output digit
// Each value written into the vector here will stand for 19 decimal
// digits, and will use 8 bytes. So here the nastiest case will be when the
// number of decimal digits to end up with is 7 mod 8 (so that I lose as
// much space as possible) and the number is as large as possible. My
// belief is that numbers from 10^16 upwards will lead to there being enough
// space.
// (Should the residue end up exactly equal to 10^19 the loop stops and
// the leading chunk below is simply printed with 20 digits, which still
// yields the correct decimal text.)
  while (lena > 1 || r[0] > ten19)
  { std::uint64_t d = short_divide_ten_19(r, lena);
    r[p--] = d;
  }
  r[p] = r[0];
// Now I have the data that has to go into my result as a sequence of
// digits base 10^19, with the most significant one first. Convert
// to character data. I write in the string data just over what has been
// digits data, and I have arranged to position everything to (just)
// avoid overwriting myself.
  std::uint64_t top = r[p++];
  if (top == 0) top = r[p++]; // discard potential leading zero!
// Get a pointer into the buffer as character data...
  char *p1 = reinterpret_cast<char *>(result);
  std::size_t len = 0;
  if (sign)
  { *p1++ = '-';
    len = 1;
  }
// I am going to convert my numbers to decimal using explicit code here.
// in an earlier draft I used sprintf(), however that adds unnecessary
// overhead.
  char buffer[24];
  int bp = 0;
// The first part of the number is printed naturally so that it only
// uses as many bytes of output as it needs.
  do
  { buffer[bp++] = '0' + top%10;
    top = top/10;
  }
  while (top != 0);
  do
  { *p1++ = buffer[--bp];
    len++;
  }
  while (bp != 0);
  arithlib_assert(len + 19*(m/sizeof(std::uint64_t)-p)<= m);
  while (p < m/sizeof(std::uint64_t))
  {
// I will always pick up the number I am going to expand before writing any
// digits into the buffer.
    top = r[p++];
// For subsequent chunks I want to print exactly 19 decimal digits,
// zero-padded, emitted least-significant-position last.
    for (std::size_t i=0; i<18; i++)
    { p1[18-i] = '0' + top%10;
      top = top/10;
    }
    *p1 = '0' + static_cast<int>(top);
    p1 += 19;
    len += 19;
  }
// To convince myself that this is safe consider when I pick up the final
// chunk. It will turn into 19 bytes of output, so where it comes from must
// be no more than 19 bytes before the length (m) of the final string, because
// otherwise it would have got clobbered when I unpacked the previous chunk.
// But this final chunk is itself 8 bytes wide and there can be up to 7 bytes
// beyond it that are there to support proper alignment - so that last chunk
// lives within the final 15 bytes of the buffer and that is a fortiori within
// the last 19 as required.
  return len;
}
5799
bignum_to_string(std::uint64_t * a,std::size_t lena,bool as_unsigned=false)5800 inline string_handle bignum_to_string(std::uint64_t *a,
5801 std::size_t lena,
5802 bool as_unsigned=false)
5803 { std::size_t len = bignum_to_string_length(a, lena);
5804 push(a);
5805 char *s = reserve_string(len);
5806 pop(a);
5807 std::size_t final_len = bignum_to_string(s, len, a, lena,
5808 as_unsigned);
5809 return confirm_size_string(s, len, final_len);
5810 }
5811
bignum_to_string(std::intptr_t aa)5812 inline string_handle bignum_to_string(std::intptr_t aa)
5813 { std::uint64_t *a, v[1];
5814 std::size_t lena;
5815 if (stored_as_fixnum(aa))
5816 { v[0] = int_of_handle(aa);
5817 a = v;
5818 lena = 1;
5819 }
5820 else
5821 { a = vector_of_handle(aa);
5822 lena = number_size(a);
5823 }
5824 return bignum_to_string(a, lena);
5825 }
5826
5827 // As well as converting to decimal I can do hex, octal or binary!
5828
// Predicted buffer length for the hexadecimal rendering of aa.
// NOTE(review): this ignores aa and always answers 24, which is only a
// safe bound for single-word values - confirm how callers use the
// *_length functions before relying on this for larger numbers.
inline std::size_t bignum_to_string_hex_length(std::intptr_t aa)
{ return 24;
}
5832
// Render aa in hexadecimal. Negative values are shown in 2s complement
// with a "~f" prefix, apparently abbreviating the conceptually infinite
// run of leading f digits (cf. the matching "~7" and "~1" forms used by
// the octal and binary printers below).
inline string_handle bignum_to_string_hex(std::intptr_t aa)
{ std::uint64_t *a, v[1];
  std::size_t n;
  if (stored_as_fixnum(aa))
  { v[0] = int_of_handle(aa);
    a = v;
    n = 1;
  }
  else
  { a = vector_of_handle(aa);
    n = number_size(a);
  }
// Making the value zero a special case simplifies things later on!
  if (n == 1 && a[0] == 0)
  { char *r = reserve_string(1);
    std::strcpy(r, "0");
    return confirm_size_string(r, 1, 1);
  }
// printing in hexadecimal should be way easier!
  std::size_t m = 16*n; // worst case: 16 hex digits per 64-bit word.
  std::uint64_t top = a[n-1];
  bool sign = negative(top);
  if (sign)
  { m += 2; // for "~f"
// Trim leading f digits of the top word; canonical storage guarantees
// the word below a redundant all-f top word starts with a 0 bit, so no
// further f digits can follow and m stays an exact count.
    while ((top>>60) == 0xf)
    { top = top << 4;
      m--;
    }
  }
  else
  {
// A positive value may have an all-zero top word purely to keep the sign
// bit clear - skip such words, then trim leading zero digits.
    while (top == 0)
    { n--;
      top = a[n-1];
    }
    while ((top>>60) == 0)
    { top = top << 4;
      m--;
    }
  }
  push(a);
  char *r = reserve_string(m);
  pop(a);
  char *p = reinterpret_cast<char *>(r);
  top = a[n-1];
  if (sign)
  { *p++ = '~';
    *p++ = 'f';
  }
// Emit digits most significant first, suppressing the leading run of
// f (negative) or 0 (positive) digits already accounted for above.
  bool started = false;
  for (std::size_t i=0; i<n; i++)
  { std::uint64_t v = a[n-i-1];
    for (int j=0; j<16; j++)
    { int d = static_cast<int>(v >> (60-4*j)) & 0xf;
      if (!started)
      { if ((sign && d==0xf) ||
            (!sign && d==0)) continue;
        started = true;
      }
      *p++ = "0123456789abcdef"[d];
    }
  }
  return confirm_size_string(r, m, m);
}
5896
// Predicted buffer length for the octal rendering of aa.
// NOTE(review): this ignores aa and always answers 24, which is only a
// safe bound for single-word values - confirm how callers use it.
inline std::size_t bignum_to_string_octal_length(std::intptr_t aa)
{ return 24;
}
5900
bignum_to_string_octal(std::intptr_t aa)5901 inline string_handle bignum_to_string_octal(std::intptr_t aa)
5902 { std::uint64_t *a, v[1];
5903 std::size_t n;
5904 if (stored_as_fixnum(aa))
5905 { v[0] = int_of_handle(aa);
5906 a = v;
5907 n = 1;
5908 }
5909 else
5910 { a = vector_of_handle(aa);
5911 n = number_size(a);
5912 }
5913 std::size_t width = (64*n +
5914 2)/3; // raw number of octal digits needed.
5915 std::uint64_t top = a[n-1];
5916 bool sign = negative(top);
5917 // There is a slight misery in that 64 is not a multiple of 3 (!) and so
5918 // the octal representation of a value has some digits that depend on a pair
5919 // of adjacent words from the bignum.
5920 std::size_t
5921 nn; // will be the number of characters used in the output
5922 if (sign)
5923 { while (read_u3(a, n, width-1) == 7 && width > 1) width--;
5924 nn = width+2;
5925 }
5926 else
5927 { while (read_u3(a, n, width-1) == 0 && width > 1) width--;
5928 nn = width;
5929 }
5930 push(a);
5931 char *r = reserve_string(nn);
5932 pop(a);
5933 char *p = reinterpret_cast<char *>(r);
5934 if (sign)
5935 { *p++ = '~';
5936 *p++ = '7';
5937 }
5938 for (std::size_t i=0; i<width; i++)
5939 *p++ = '0' + read_u3(a, n, width-i-1);
5940 return confirm_size_string(r, nn, width);
5941 }
5942
// Predicted buffer length for the binary rendering of aa.
// NOTE(review): this ignores aa and always answers 24, which is only a
// safe bound for small values - confirm how callers use it.
inline std::size_t bignum_to_string_binary_length(std::intptr_t aa)
{ return 24;
}
5946
// Render aa in binary. Negative values are shown in 2s complement with a
// "~1" prefix standing for the (conceptually infinite) run of leading 1
// bits, matching the hex and octal printers above.
inline string_handle bignum_to_string_binary(std::intptr_t aa)
{ std::uint64_t *a, v[1];
  std::size_t n;
  if (stored_as_fixnum(aa))
  { v[0] = int_of_handle(aa);
    a = v;
    n = 1;
  }
  else
  { a = vector_of_handle(aa);
    n = number_size(a);
  }
// Making the value zero a special case simplifies things later on!
  if (n == 1 && a[0] == 0)
  { char *r = reserve_string(1);
    std::strcpy(r, "0");
    return confirm_size_string(r, 1, 1);
  }
  std::size_t m = 64*n; // worst case: 64 binary digits per word.
  std::uint64_t top = a[n-1];
  bool sign = negative(top);
  if (sign)
  { m += 2; // for "~1"
// Trim the run of leading 1 bits that the "~1" prefix stands for.
    while ((top>>63) == 1)
    { top = top << 1;
      m--;
    }
  }
  else
  {
// Positive and non-zero, so there is a 1 bit somewhere in the top word.
    arithlib_assert(top != 0);
    while ((top>>63) == 0)
    { top = top << 1;
      m--;
    }
  }
  push(a);
  char *r = reserve_string(m);
  pop(a);
  char *p = reinterpret_cast<char *>(r);
  top = a[n-1];
  if (sign)
  { *p++ = '~';
    *p++ = '1';
  }
// Emit bits most significant first, suppressing the leading run of
// 1 (negative) or 0 (positive) bits already accounted for above.
  bool started = false;
  for (std::size_t i=0; i<n; i++)
  { std::uint64_t v = a[n-i-1];
    for (int j=0; j<64; j++)
    { int d = static_cast<int>(v >> (63-j)) & 0x1;
      if (!started)
      { if ((sign && d==1) ||
            (!sign && d==0)) continue;
        started = true;
      }
      *p++ = '0' + d;
    }
  }
  return confirm_size_string(r, m, m);
}
6006
6007 //=========================================================================
6008 //=========================================================================
6009 // Big number comparisons.
6010 //=========================================================================
6011 //=========================================================================
6012
6013
op(std::uint64_t * a)6014 inline bool Zerop::op(std::uint64_t *a)
6015 { return number_size(a) == 1 && a[0] == 0;
6016 }
6017
// A fixnum is zero exactly when its native value is zero.
inline bool Zerop::op(std::int64_t a)
{ return a == 0;
}
6021
op(std::uint64_t * a)6022 inline bool Onep::op(std::uint64_t *a)
6023 { return number_size(a) == 1 && a[0] == 1;
6024 }
6025
// A fixnum is one exactly when its native value is one.
inline bool Onep::op(std::int64_t a)
{ return a == 1;
}
6029
op(std::uint64_t * a)6030 inline bool Minusp::op(std::uint64_t *a)
6031 { return negative(a[number_size(a)-1]);
6032 }
6033
// A fixnum is negative exactly when its native value is below zero.
inline bool Minusp::op(std::int64_t a)
{ return a < 0;
}
6037
op(std::uint64_t * a)6038 inline bool Evenp::op(std::uint64_t *a)
6039 { return (a[0] & 1) == 0;
6040 }
6041
// Even exactly when the low bit is clear (valid for 2s complement
// negatives too).
inline bool Evenp::op(std::int64_t a)
{ return (a & 1) == 0;
}
6045
op(std::uint64_t * a)6046 inline bool Oddp::op(std::uint64_t *a)
6047 { return (a[0] & 1) != 0;
6048 }
6049
// Odd exactly when the low bit is set (valid for 2s complement
// negatives too).
inline bool Oddp::op(std::int64_t a)
{ return (a & 1) != 0;
}
6053
6054 // eqn
6055
// Two canonically stored bignums are equal exactly when they have the
// same digit count and identical digits; since digits are raw 64-bit
// words a memcmp over the whole vector is sufficient.
inline bool bigeqn(const std::uint64_t *a, std::size_t lena,
                   const std::uint64_t *b, std::size_t lenb)
{ return lena == lenb &&
         std::memcmp(a, b, lena*sizeof(std::uint64_t)) == 0;
}
6061
6062
op(std::uint64_t * a,std::uint64_t * b)6063 inline bool Eqn::op(std::uint64_t *a, std::uint64_t *b)
6064 { std::size_t lena = number_size(a);
6065 std::size_t lenb = number_size(b);
6066 return bigeqn(a, lena, b, lenb);
6067 }
6068
op(std::uint64_t * a,std::int64_t b)6069 inline bool Eqn::op(std::uint64_t *a, std::int64_t b)
6070 { std::size_t lena = number_size(a);
6071 return lena==1 && static_cast<std::int64_t>(a[0])==b;
6072 }
6073
op(std::int64_t a,std::uint64_t * b)6074 inline bool Eqn::op(std::int64_t a, std::uint64_t *b)
6075 { std::size_t lenb = number_size(b);
6076 return lenb==1 && a==static_cast<std::int64_t>(b[0]);
6077 }
6078
// Two fixnums are equal exactly when their native values are.
inline bool Eqn::op(std::int64_t a, std::int64_t b)
{ return (a == b);
}
6082
op(std::int64_t a,float b)6083 inline bool Eqn::op(std::int64_t a, float b)
6084 { return Eqn::op(a, static_cast<double>(b));
6085 }
6086
op(std::uint64_t * a,float b)6087 inline bool Eqn::op(std::uint64_t *a, float b)
6088 { return Eqn::op(a, static_cast<double>(b));
6089 }
6090
op(float a,std::int64_t b)6091 inline bool Eqn::op(float a, std::int64_t b)
6092 { return Eqn::op(static_cast<double>(a), b);
6093 }
6094
op(float a,std::uint64_t * b)6095 inline bool Eqn::op(float a, std::uint64_t *b)
6096 { return Eqn::op(static_cast<double>(a), b);
6097 }
6098
// Exact fixnum/double equality, avoiding precision loss from blindly
// converting either operand.
inline bool Eqn::op(std::int64_t a, double b)
{ const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
// For |a| <= 2^53 the conversion of a to double is exact, so a direct
// floating-point comparison is decisive.
  if (a >= -range && a <= range) return static_cast<double>(a) == b;
// The value on the next line is a floating point representation of 2^63,
// so any floating value at least that large is bigger than any int64_t value.
  if (b >= 9223372036854775808.0) return false;
  else if (b < -9223372036854775808.0) return false;
// A NaN fails both range tests above, so it must be rejected explicitly.
  if (std::isnan(b)) return false;
// Here 2^53 < |b| < 2^63; every double of magnitude >= 2^53 is exactly
// an integer (its ulp is at least 1), so the cast loses nothing and the
// integer comparison is exact.
  return a == static_cast<std::int64_t>(b);
}
6109
// Compare a multi-word bignum against a double for equality without
// converting the bignum to floating point (which could lose precision).
inline bool eqnfloat(std::uint64_t *a, std::size_t lena, double b)
{ if (std::isnan(b)||
      std::isinf(b)) return false;
  std::int64_t top = static_cast<std::int64_t>(a[lena-1]);
// If the signs differ then the values are certainly not equal.
  if (top >= 0 && b <= 0.0) return false;
  if (top < 0 && b >= 0.0) return false;
  double ipart;
  double fpart = std::modf(b, &ipart);
  if (fpart != 0.0) return false; // not an integer so not equal.
// Render b as a "virtual bignum": its word length, top word (hi) and the
// word below that (next); all lower words of an integral double are zero.
  std::int64_t hi;
  std::uint64_t next;
  std::size_t len;
  double_to_virtual_bignum(ipart, hi, next, len, RoundingMode::TRUNC);
  if (len != lena) return false;
// NOTE(review): the comparisons below test a[...] against top, but top
// was READ FROM a[lena-1] itself (and len==lena here), so those tests
// are vacuously true and hi is never consulted. It looks as if hi was
// intended wherever top appears - confirm against the contract of
// double_to_virtual_bignum before changing.
  if (len == 1) return a[0] == static_cast<std::uint64_t>(top);
  if (a[len-1] != static_cast<std::uint64_t>(top) ||
      a[len-2] != next) return false;
  for (std::size_t i=0; i<len-2; i++)
    if (a[i] != 0) return false;
  return true;
}
6132
op(std::uint64_t * a,double b)6133 inline bool Eqn::op(std::uint64_t *a, double b)
6134 { std::size_t lena = number_size(a);
6135 if (lena == 1) return Eqn::op(static_cast<std::int64_t>(a[0]), b);
6136 return eqnfloat(a, lena, b);
6137 }
6138
// Equality is symmetric, so delegate to the (int64, double) version.
inline bool Eqn::op(double a, std::int64_t b)
{ return Eqn::op(b, a);
}
6142
// Equality is symmetric, so delegate to the (bignum, double) version.
inline bool Eqn::op(double a, std::uint64_t *b)
{ return Eqn::op(b, a);
}
6146
6147 #ifdef softfloat_h
6148
// The following constants are 2^112 and -2^112. They are used in
// rationalf128 because any 128-bit floating point value whose magnitude
// is at least 2^112 is necessarily an exact integer.
6152
#ifdef LITTLEENDIAN

// 2^112 and -2^112 as raw float128 bit patterns, low word first on
// little-endian targets (0x406f is the biased exponent 16383+112).
INLINE_VAR float128_t FP128_INT_LIMIT = {{0, INT64_C(0x406f000000000000)}};
INLINE_VAR float128_t FP128_MINUS_INT_LIMIT = {{0, INT64_C(0xc06f000000000000)}};

#else // !LITTLEENDIAN

// The same values with the two words swapped for big-endian targets.
INLINE_VAR float128_t FP128_INT_LIMIT = {{INT64_C(0x406f000000000000), 0}};
INLINE_VAR float128_t FP128_MINUS_INT_LIMIT = {{INT64_C(0xc06f000000000000), 0}};

#endif // !LITTLEENDIAN
6164
6165
// Compare a bignum against a float128 for equality, choosing a strategy
// that never loses precision: small integers are converted (exactly) to
// float128, while big ones force the float to be fixed to an integer.
inline bool eqnbigfloat(std::uint64_t *a, std::size_t lena,
                        float128_t b)
{ if (!f128_eq(b, b)) return false; // a NaN if b!=b
  std::int64_t top = static_cast<std::int64_t>(a[lena-1]);
  if (top >= 0 && f128_lt(b, f128_0)) return false;
  if (top < 0 && !f128_lt(b, f128_0)) return false;
// Now the two inputs have the same sign.
// 0x0001000000000000 is 2^48, so the two-word filter below accepts a
// exactly when |a| < 2^112, i.e. when a fits in the 113-bit mantissa.
  if (lena == 1 ||
      (lena == 2 &&
       !((a[1] > 0x0001000000000000 ||
          (a[1] == 0x0001000000000000 && a[0] != 0)) ||
         static_cast<std::int64_t>(a[1]) < -static_cast<std::int64_t>
         (0x0001000000000000))))
  {
// Here the integer is of modest size - if the float is huge we can
// resolve matters cheaply.
    if (f128_lt(FP128_INT_LIMIT, b) ||
        f128_lt(b, FP128_MINUS_INT_LIMIT)) return false;
// Convert a to a float128 and compare. The conversion will not lose any
// information because |a| <= 2^112 so it will fit within the mantissa
// bits that are available.
    float128_t aa = Float128::op(a);
    return f128_eq(aa, b);
  }
  else
  {
// Now the integer is rather big. If I was KEEN I would estimate the size of
// the float from its exponent and compare with the number of bits in the
// integer to filter out cases where their sizes were very different. However
// I am not feeling very keen! I can afford to convert the float to an integer,
// and because it is large when I fix it there will not be any discarded
// fractional part...
    std::intptr_t bb = round_float128_to_int(b);
    return op_dispatch2<Eqn,bool>(vector_to_handle(a), bb);
  }
}
6202
op(std::int64_t a,float128_t b)6203 inline bool Eqn::op(std::int64_t a, float128_t b)
6204 { return f128_eq(i64_to_f128(a), b);
6205 }
6206
op(std::uint64_t * a,float128_t b)6207 inline bool Eqn::op(std::uint64_t *a, float128_t b)
6208 { std::size_t lena = number_size(a);
6209 if (lena == 1) return Eqn::op(static_cast<std::int64_t>(a[0]), b);
6210 return eqnbigfloat(a, lena, b);
6211 }
6212
// Equality is symmetric, so delegate to the (int64, float128) version.
inline bool Eqn::op(float128_t a, std::int64_t b)
{ return Eqn::op(b, a);
}
6216
// Equality is symmetric, so delegate to the (bignum, float128) version.
inline bool Eqn::op(float128_t a, std::uint64_t *b)
{ return Eqn::op(b, a);
}
6220
6221 #endif // softfloat_h
6222
op(std::uint64_t * a,std::uint64_t * b)6223 inline bool Neqn::op(std::uint64_t *a, std::uint64_t *b)
6224 { std::size_t lena = number_size(a);
6225 std::size_t lenb = number_size(b);
6226 return !bigeqn(a, lena, b, lenb);
6227 }
6228
op(std::uint64_t * a,std::int64_t b)6229 inline bool Neqn::op(std::uint64_t *a, std::int64_t b)
6230 { std::size_t lena = number_size(a);
6231 return lena!=1 || static_cast<std::int64_t>(a[0])!=b;
6232 }
6233
op(std::int64_t a,std::uint64_t * b)6234 inline bool Neqn::op(std::int64_t a, std::uint64_t *b)
6235 { std::size_t lenb = number_size(b);
6236 return lenb!=1 || a!=static_cast<std::int64_t>(b[0]);
6237 }
6238
op(std::int64_t a,std::int64_t b)6239 inline bool Neqn::op(std::int64_t a, std::int64_t b)
6240 { return (a != b);
6241 }
6242
op(std::int64_t a,float b)6243 inline bool Neqn::op(std::int64_t a, float b)
6244 { return Neqn::op(a, static_cast<double>(b));
6245 }
6246
op(std::uint64_t * a,float b)6247 inline bool Neqn::op(std::uint64_t *a, float b)
6248 { return Neqn::op(a, static_cast<double>(b));
6249 }
6250
op(float a,std::int64_t b)6251 inline bool Neqn::op(float a, std::int64_t b)
6252 { return Neqn::op(static_cast<double>(a), b);
6253 }
6254
op(float a,std::uint64_t * b)6255 inline bool Neqn::op(float a, std::uint64_t *b)
6256 { return Neqn::op(static_cast<double>(a), b);
6257 }
6258
// Inequality test between an int64 and a double. This follows the
// file-wide convention that any comparison involving a NaN yields false,
// so Neqn(a, NaN) is false even though IEEE "!=" on a NaN would be true.
// NOTE(review): that makes both Eqn and Neqn return false on NaN input -
// confirm callers expect Neqn not to be the exact negation of Eqn here.
inline bool Neqn::op(std::int64_t a, double b)
{ const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
// Integers of magnitude up to 2^53 convert to double exactly, so the
// comparison can be done in floating point.
  if (a >= -range && a <= range) return static_cast<double>(a) != b;
// The value on the next line is a floating point representation of 2^63,
// so any floating value at least that large is bigger than any int64_t value.
  if (b >= 9223372036854775808.0) return true;
  else if (b < -9223372036854775808.0) return true;
  if (std::isnan(b)) return false; // Ha Ha Ha!
// Here 2^53 <= |b| < 2^63, so b is an exact integer that fits in int64_t.
  return a != static_cast<std::int64_t>(b);
}
6269
op(std::uint64_t * a,double b)6270 inline bool Neqn::op(std::uint64_t *a, double b)
6271 { std::size_t lena = number_size(a);
6272 if (lena == 1) return Neqn::op(static_cast<std::int64_t>(a[0]), b);
6273 return !eqnfloat(a, lena, b);
6274 }
6275
op(double a,std::int64_t b)6276 inline bool Neqn::op(double a, std::int64_t b)
6277 { return Neqn::op(b, a);
6278 }
6279
op(double a,std::uint64_t * b)6280 inline bool Neqn::op(double a, std::uint64_t *b)
6281 { return Neqn::op(b, a);
6282 }
6283
6284 #ifdef softfloat_h
6285
op(std::int64_t a,float128_t b)6286 inline bool Neqn::op(std::int64_t a, float128_t b)
6287 { return !f128_eq(i64_to_f128(a), b);
6288 }
6289
op(std::uint64_t * a,float128_t b)6290 inline bool Neqn::op(std::uint64_t *a, float128_t b)
6291 { std::size_t lena = number_size(a);
6292 if (lena == 1) return Neqn::op(static_cast<std::int64_t>(a[0]), b);
6293 return !eqnbigfloat(a, lena, b);
6294 }
6295
op(float128_t a,std::int64_t b)6296 inline bool Neqn::op(float128_t a, std::int64_t b)
6297 { return Neqn::op(b, a);
6298 }
6299
op(float128_t a,std::uint64_t * b)6300 inline bool Neqn::op(float128_t a, std::uint64_t *b)
6301 { return Neqn::op(b, a);
6302 }
6303
6304 #endif // softfloat_h
6305
6306 // greaterp
6307
biggreaterp(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb)6308 inline bool biggreaterp(const std::uint64_t *a, std::size_t lena,
6309 const std::uint64_t *b, std::size_t lenb)
6310 { std::uint64_t a0 = a[lena-1], b0 = b[lenb-1];
6311 // If one of the numbers has more digits than the other then the sign of
6312 // the longer one gives my the answer.
6313 if (lena > lenb) return positive(a0);
6314 else if (lenb > lena) return negative(b0);
6315 // When the two numbers are the same length but the top digits differ
6316 // then comparing those digits tells me all I need to know.
6317 if (static_cast<std::int64_t>(a0) >
6318 static_cast<std::int64_t>(b0)) return true;
6319 if (static_cast<std::int64_t>(a0) <
6320 static_cast<std::int64_t>(b0)) return false;
6321 // Otherwise I need to scan down through digits...
6322 lena--;
6323 while (lena != 0)
6324 { lena--;
6325 a0 = a[lena];
6326 b0 = b[lena];
6327 if (a0 > b0) return true;
6328 if (a0 < b0) return false;
6329 }
6330 return false;
6331 }
6332
6333 // This version treats the two inputs as unsigned numbers. It is used from
6334 // within the GCD code (at least)
6335
// Unsigned comparison of two bignums: true when a > b with both inputs
// treated as unsigned values. Used from (at least) the GCD code.
inline bool big_unsigned_greaterp(const std::uint64_t *a,
                                  std::size_t lena,
                                  const std::uint64_t *b, std::size_t lenb)
{
// With unsigned interpretation a longer number is always the larger.
  if (lena != lenb) return lena > lenb;
// Same length: compare digits from the most significant end down.
  for (std::size_t i=lena; i-- != 0; )
  { if (a[i] != b[i]) return a[i] > b[i];
  }
// Exactly equal, hence not strictly greater.
  return false;
}
6353
op(std::uint64_t * a,std::uint64_t * b)6354 inline bool Greaterp::op(std::uint64_t *a, std::uint64_t *b)
6355 { std::size_t lena = number_size(a);
6356 std::size_t lenb = number_size(b);
6357 return biggreaterp(a, lena, b, lenb);
6358 }
6359
op(std::uint64_t * a,std::int64_t bb)6360 inline bool Greaterp::op(std::uint64_t *a, std::int64_t bb)
6361 { std::uint64_t b[1] = {static_cast<std::uint64_t>(bb)};
6362 std::size_t lena = number_size(a);
6363 return biggreaterp(a, lena, b, 1);
6364 }
6365
op(std::int64_t aa,std::uint64_t * b)6366 inline bool Greaterp::op(std::int64_t aa, std::uint64_t *b)
6367 { std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
6368 std::size_t lenb = number_size(b);
6369 return biggreaterp(a, 1, b, lenb);
6370 }
6371
op(std::int64_t a,std::int64_t b)6372 inline bool Greaterp::op(std::int64_t a, std::int64_t b)
6373 { return a > b;
6374 }
6375
6376 // I can always widen a float to a double without loss of any information,
6377 // so all the cases of comparisons with floats (as distinct from with
6378 // double) are easy to delegate.
6379
op(std::int64_t a,float b)6380 inline bool Greaterp::op(std::int64_t a, float b)
6381 { return Greaterp::op(a, static_cast<double>(b));
6382 }
6383
op(std::uint64_t * a,float b)6384 inline bool Greaterp::op(std::uint64_t *a, float b)
6385 { return Greaterp::op(a, static_cast<double>(b));
6386 }
6387
op(float a,std::int64_t b)6388 inline bool Greaterp::op(float a, std::int64_t b)
6389 { return Greaterp::op(static_cast<double>(a), b);
6390 }
6391
op(float a,std::uint64_t * b)6392 inline bool Greaterp::op(float a, std::uint64_t *b)
6393 { return Greaterp::op(static_cast<double>(a), b);
6394 }
6395
// Is the int64 a strictly greater than the double b? Any comparison
// involving a NaN yields false (file-wide convention).
inline bool Greaterp::op(std::int64_t a, double b)
{
// If the integer is small enough it can be converted to a double
// without any rounding, so then I can do the comparison easily.
  const std::int64_t range = static_cast<std::int64_t>(1)<<53;
  if (a >= -range && a <= range) return static_cast<double>(a) > b;
// If the floating point value is >= 2^63 or is less < -2^63 it is beyond
// the range of int64_t, so the result is easy. This situation includes
// the case of infinities.
  if (b >= 9223372036854775808.0) return false;
  else if (b < -9223372036854775808.0) return true;
// NaNs must always return false from a comparison, so all the cases so
// far will have yielded correct results. But here I must filter out
// that situation.
  if (std::isnan(b)) return false;
// Because |b| >= 2^53 but < 2^63 it can be converted to an int64_t value
// without rounding.
  return a > static_cast<std::int64_t>(b);
}
6415
6416 // This compares a bignum against a double. It may in fact only be called
6417 // in the case where it is at least a 2-word bignum, and that would render
6418 // the first segment of code unnecessary!
6419 //
6420 // The code here feels ugly and perhaps repetitive to me. For now I will
6421 // just be content to get something that works in all cases, but thinking
6422 // about how to make it tidier will be desirable. I might perhaps also
6423 // think if generalizing it to have EQN and NEQN options in the CompareMode
6424 // enumeration.
6425
6426 enum CompareMode {GREATERP, GEQ, LESSP, LEQ};
6427
greaterpfloat(std::uint64_t * a,std::size_t lena,double b,CompareMode mode)6428 inline bool greaterpfloat(std::uint64_t *a, std::size_t lena,
6429 double b,
6430 CompareMode mode)
6431 { if (std::isnan(b)) return false;
6432 // If the integer is small enough it can be converted to a double
6433 // without any rounding, so then I can do the comparison easily.
6434 if (lena == 1)
6435 { std::int64_t aa = a[0];
6436 const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
6437 if (aa >= -range && aa <= range)
6438 { double ad = static_cast<double>(aa);
6439 switch (mode)
6440 { case CompareMode::GREATERP:
6441 return static_cast<double>(ad) > b;
6442 case CompareMode::GEQ:
6443 return static_cast<double>(ad) >= b;
6444 case CompareMode::LESSP:
6445 return static_cast<double>(ad) < b;
6446 case CompareMode::LEQ:
6447 return static_cast<double>(ad) <= b;
6448 }
6449 }
6450 }
6451 // If b==+infinity then a<b and a<=b, while if b=-=infinity then
6452 // a>b and a>=b.
6453 if (std::isinf(b))
6454 { return (b > 0.0 && (mode==CompareMode::LESSP ||
6455 mode==CompareMode::LEQ)) ||
6456 (b < 0.0 && (mode==CompareMode::GREATERP ||
6457 mode==CompareMode::GEQ));
6458 }
6459 // Also if a and b have different signs it is easy to resolve the issue.
6460 if (negative(a[lena-1]) && b >= 0.0)
6461 return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6462 if (positive(a[lena-1]) && b <= 0.0)
6463 return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6464 // Now if I convert b to an integer and compare I can lose a fractional
6465 // part in the case when b is small. But given that |a| is large if I
6466 // truncate b as I map it onto an integer the comparisons I make will still
6467 // be valid.
6468 std::int64_t top;
6469 std::uint64_t next;
6470 std::size_t len;
6471 double_to_virtual_bignum(b, top, next, len, RoundingMode::TRUNC);
6472 // If the numbers now differ in length that can tell me what the result is.
6473 if (lena > len)
6474 { if (negative(a[lena-1]))
6475 return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6476 if (positive(a[lena-1]))
6477 return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6478 }
6479 if (lena < len)
6480 { if (positive(a[lena-1]))
6481 return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6482 if (negative(a[lena-1]))
6483 return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6484 }
6485 // Now the arguments have the same length as bignums. First check for
6486 // differences in the top two digits.
6487 if (static_cast<std::int64_t>(a[lena-1]) < top ||
6488 (static_cast<std::int64_t>(a[lena-1]) == top && a[lena-2] < next))
6489 return (mode==CompareMode::LESSP || mode==CompareMode::LEQ);
6490 if (static_cast<std::int64_t>(a[lena-1]) > top ||
6491 (static_cast<std::int64_t>(a[lena-1]) == top && a[lena-2] > next))
6492 return (mode==CompareMode::GREATERP || mode==CompareMode::GEQ);
6493 // Now the top two digits of the two inputs match. If all lower digits of a
6494 // are zero then the two inputs are equal.
6495 for (std::size_t i=0; i<len; i++)
6496 { if (a[i] != 0) return (mode==CompareMode::GREATERP ||
6497 mode==CompareMode::GEQ);
6498 }
6499 // Here the inputs seem to be exactly equal in value.
6500 return mode==CompareMode::GEQ || mode==CompareMode::LEQ;
6501 }
6502
op(std::uint64_t * a,double b)6503 inline bool Greaterp::op(std::uint64_t *a, double b)
6504 { std::size_t lena = number_size(a);
6505 if (lena == 1) return Greaterp::op(static_cast<std::int64_t>(a[0]),
6506 b);
6507 return greaterpfloat(a, lena, b, CompareMode::GREATERP);
6508 }
6509
op(double a,std::int64_t b)6510 inline bool Greaterp::op(double a, std::int64_t b)
6511 { return Lessp::op(b, a);
6512 }
6513
op(double a,std::uint64_t * b)6514 inline bool Greaterp::op(double a, std::uint64_t *b)
6515 { return Lessp::op(b, a);
6516 }
6517
6518 #ifdef softfloat_h
6519
6520 //@@ This is not sorted out yet!
6521
6522 // This one function does >, >=, < and <= with "great" indicating if
6523 // the base is > or < and "ifequal" distinguishing > from >= and < from <=.
6524
// Compare the bignum a (lena digits) against the 128-bit float b.
// great==true selects a ">" style comparison, great==false a "<" style,
// and ifequal makes it non-strict (>= or <=). Any comparison involving
// a NaN returns false.
inline bool greaterpbigfloat(std::uint64_t *a, std::size_t lena,
                             float128_t b,
                             bool great, bool ifequal)
{ if (f128_nan(b)) return
        false; // Comparisons involving a NaN => false.
// If the operands have opposite signs their signs settle the result.
  std::int64_t top = static_cast<std::int64_t>(a[lena-1]);
  if (top >= 0 && f128_lt(b, f128_0)) return great;
  if (top < 0 && !f128_lt(b, f128_0)) return !great;
// Now the two inputs have the same sign.
// The condition below tests |a| <= 2^112 (0x0001000000000000 == 2^48, and
// two 64-bit digits give 2^48 * 2^64 = 2^112), i.e. a fits exactly in the
// float128 mantissa.
  if (lena == 1 ||
      (lena == 2 &&
       !((a[1] > 0x0001000000000000 ||
          (a[1] == 0x0001000000000000 && a[0] != 0)) ||
         static_cast<std::int64_t>(a[1]) < -static_cast<std::int64_t>
         (0x0001000000000000))))
  {
// Here the integer is of modest size - if the float is huge we can
// resolve matters cheaply.
    if (f128_lt(FP128_INT_LIMIT, b)) return !great;
    if (f128_lt(b, FP128_MINUS_INT_LIMIT)) return great;
// Convert a to a float128 and compare. The conversion will not lose any
// information because the |a| <= 2^112 so it will fit within the mantissa
// bits that are available.
    float128_t aa = Float128::op(a);
    if (great)
    { if (ifequal) return f128_le(b, aa);
      else return f128_lt(b, aa);
    }
    else
    { if (ifequal) return f128_le(aa, b);
      else return f128_lt(aa, b);
    }
  }
  else
  {
// Now the integer is rather big. If I was KEEN I would estimate the size of
// the float from its exponent and compare with the number of bits in the
// integer to filter out cases where their sizes were very different. However
// I am not feeling very keen! I can afford to convert the float to an integer,
// and because it is large when I fix it there will not be any discarded
// fractional part...
// ...
    std::intptr_t bb = round_float128_to_int(b);
// At the moment I think there is a space-leak on bb here...
// NOTE(review): in the chain below each "else" binds to the nearest "if",
// which gives the intended four-way Geq/Greaterp/Leq/Lessp dispatch.
    if (great)
        if (ifequal)
            return op_dispatch2<Geq,bool>(vector_to_handle(a), bb);
        else return op_dispatch2<Greaterp,bool>(vector_to_handle(a), bb);
    else if (ifequal)
        return op_dispatch2<Leq,bool>(vector_to_handle(a), bb);
    else return op_dispatch2<Lessp,bool>(vector_to_handle(a), bb);
  }
}
6578
op(std::int64_t a,float128_t b)6579 inline bool Greaterp::op(std::int64_t a, float128_t b)
6580 { return f128_lt(b, i64_to_f128(a));
6581 }
6582
op(std::uint64_t * a,float128_t b)6583 inline bool Greaterp::op(std::uint64_t *a, float128_t b)
6584 { std::size_t lena = number_size(a);
6585 if (lena == 1) return Greaterp::op(static_cast<std::int64_t>(a[0]),
6586 b);
6587 return greaterpbigfloat(a, lena, b, true, false);
6588
6589 }
6590
op(float128_t a,std::int64_t b)6591 inline bool Greaterp::op(float128_t a, std::int64_t b)
6592 { return Lessp::op(b, a);
6593 }
6594
op(float128_t a,std::uint64_t * b)6595 inline bool Greaterp::op(float128_t a, std::uint64_t *b)
6596 { return Lessp::op(b, a);
6597 }
6598
6599 #endif // softfloat_h
6600
6601 // geq
6602
op(std::uint64_t * a,std::uint64_t * b)6603 inline bool Geq::op(std::uint64_t *a, std::uint64_t *b)
6604 { return !Greaterp::op(b, a);
6605 }
6606
op(std::uint64_t * a,std::int64_t b)6607 inline bool Geq::op(std::uint64_t *a, std::int64_t b)
6608 { return !Greaterp::op(b, a);
6609 }
6610
op(std::int64_t a,std::uint64_t * b)6611 inline bool Geq::op(std::int64_t a, std::uint64_t *b)
6612 { return !Greaterp::op(b, a);
6613 }
6614
op(std::int64_t a,std::int64_t b)6615 inline bool Geq::op(std::int64_t a, std::int64_t b)
6616 { return a >= b;
6617 }
6618
op(std::int64_t a,float b)6619 inline bool Geq::op(std::int64_t a, float b)
6620 { return Geq::op(a, static_cast<double>(b));
6621 }
6622
op(std::uint64_t * a,float b)6623 inline bool Geq::op(std::uint64_t *a, float b)
6624 { return Geq::op(a, static_cast<double>(b));
6625 }
6626
op(float a,std::int64_t b)6627 inline bool Geq::op(float a, std::int64_t b)
6628 { return Geq::op(static_cast<double>(a), b);
6629 }
6630
op(float a,std::uint64_t * b)6631 inline bool Geq::op(float a, std::uint64_t *b)
6632 { return Geq::op(static_cast<double>(a), b);
6633 }
6634
// Is the int64 a greater than or equal to the double b? Comparisons
// involving a NaN yield false (file-wide convention).
inline bool Geq::op(std::int64_t a, double b)
{ const std::int64_t range = (static_cast<std::int64_t>(1))<<53;
// |a| <= 2^53 converts to double exactly, so compare in floating point.
  if (a >= -range && a <= range) return static_cast<double>(a) >= b;
// 9223372036854775808.0 is 2^63: any such b exceeds every int64_t,
// and anything below -2^63 is smaller than every int64_t. These tests
// also dispose of the infinities.
  if (b >= 9223372036854775808.0) return false;
  else if (b < -9223372036854775808.0) return true;
  if (std::isnan(b)) return false;
// Here 2^53 <= |b| < 2^63, so b converts to int64_t without rounding.
  return a >= static_cast<std::int64_t>(b);
}
6643
op(std::uint64_t * a,double b)6644 inline bool Geq::op(std::uint64_t *a, double b)
6645 { std::size_t lena = number_size(a);
6646 if (lena == 1) return Geq::op(static_cast<std::int64_t>(a[0]), b);
6647 return greaterpfloat(a, lena, b, CompareMode::GEQ);
6648 }
6649
op(double a,std::int64_t b)6650 inline bool Geq::op(double a, std::int64_t b)
6651 { return Leq::op(b, a);
6652 }
6653
op(double a,std::uint64_t * b)6654 inline bool Geq::op(double a, std::uint64_t *b)
6655 { return Leq::op(b, a);
6656 }
6657
6658 #ifdef softfloat_h
6659
op(std::int64_t a,float128_t b)6660 inline bool Geq::op(std::int64_t a, float128_t b)
6661 { return f128_le(b, i64_to_f128(a));
6662 return false;
6663 }
6664
op(std::uint64_t * a,float128_t b)6665 inline bool Geq::op(std::uint64_t *a, float128_t b)
6666 { std::size_t lena = number_size(a);
6667 if (lena == 1) return Greaterp::op(static_cast<std::int64_t>(a[0]),
6668 b);
6669 return greaterpbigfloat(a, lena, b, true, true);
6670 }
6671
op(float128_t a,std::int64_t b)6672 inline bool Geq::op(float128_t a, std::int64_t b)
6673 { return Leq::op(b, a);
6674 }
6675
op(float128_t a,std::uint64_t * b)6676 inline bool Geq::op(float128_t a, std::uint64_t *b)
6677 { return Leq::op(b, a);
6678 }
6679
6680 #endif // softfloat_h
6681
6682 // lessp
6683
op(std::uint64_t * a,std::uint64_t * b)6684 inline bool Lessp::op(std::uint64_t *a, std::uint64_t *b)
6685 { return Greaterp::op(b, a);
6686 }
6687
op(std::uint64_t * a,std::int64_t b)6688 inline bool Lessp::op(std::uint64_t *a, std::int64_t b)
6689 { return Greaterp::op(b, a);
6690 }
6691
op(std::int64_t a,std::uint64_t * b)6692 inline bool Lessp::op(std::int64_t a, std::uint64_t *b)
6693 { return Greaterp::op(b, a);
6694 }
6695
op(std::int64_t a,std::int64_t b)6696 inline bool Lessp::op(std::int64_t a, std::int64_t b)
6697 { return a < b;
6698 }
6699
op(std::int64_t a,float b)6700 inline bool Lessp::op(std::int64_t a, float b)
6701 { return Lessp::op(a, static_cast<double>(b));
6702 }
6703
op(std::uint64_t * a,float b)6704 inline bool Lessp::op(std::uint64_t *a, float b)
6705 { return Lessp::op(a, static_cast<double>(b));
6706 }
6707
op(float a,std::int64_t b)6708 inline bool Lessp::op(float a, std::int64_t b)
6709 { return Lessp::op(static_cast<double>(a), b);
6710 }
6711
op(float a,std::uint64_t * b)6712 inline bool Lessp::op(float a, std::uint64_t *b)
6713 { return Lessp::op(static_cast<double>(a), b);
6714 }
6715
// Is the int64 a strictly less than the double b? Comparisons involving
// a NaN yield false (file-wide convention).
inline bool Lessp::op(std::int64_t a, double b)
{ const std::int64_t range = static_cast<std::int64_t>(1)<<53;
// |a| <= 2^53 converts to double exactly, so compare in floating point.
  if (a >= -range && a <= range) return static_cast<double>(a) < b;
// 9223372036854775808.0 is 2^63: a b at least that large (including
// +infinity) exceeds every int64_t, and below -2^63 is smaller than all.
  if (b >= 9223372036854775808.0) return true;
  else if (b < -9223372036854775808.0) return false;
  if (std::isnan(b)) return false;
// Here 2^53 <= |b| < 2^63, so b converts to int64_t without rounding.
  return a < static_cast<std::int64_t>(b);
}
6724
op(std::uint64_t * a,double b)6725 inline bool Lessp::op(std::uint64_t *a, double b)
6726 { std::size_t lena = number_size(a);
6727 if (lena == 1) return Lessp::op(static_cast<std::int64_t>(a[0]), b);
6728 return greaterpfloat(a, lena, b, CompareMode::LESSP);
6729 }
6730
op(double a,std::int64_t b)6731 inline bool Lessp::op(double a, std::int64_t b)
6732 { return Greaterp::op(b, a);
6733 }
6734
op(double a,std::uint64_t * b)6735 inline bool Lessp::op(double a, std::uint64_t *b)
6736 { return Greaterp::op(b, a);
6737 }
6738
6739 #ifdef softfloat_h
6740
op(std::int64_t a,float128_t b)6741 inline bool Lessp::op(std::int64_t a, float128_t b)
6742 { return f128_lt(i64_to_f128(a), b);
6743 return false;
6744 }
6745
op(std::uint64_t * a,float128_t b)6746 inline bool Lessp::op(std::uint64_t *a, float128_t b)
6747 { std::size_t lena = number_size(a);
6748 if (lena == 1) return Lessp::op(static_cast<std::int64_t>(a[0]), b);
6749 return greaterpbigfloat(a, lena, b, false, false);
6750
6751 return false;
6752 }
6753
op(float128_t a,std::int64_t b)6754 inline bool Lessp::op(float128_t a, std::int64_t b)
6755 { return Greaterp::op(b, a);
6756 }
6757
op(float128_t a,std::uint64_t * b)6758 inline bool Lessp::op(float128_t a, std::uint64_t *b)
6759 { return Greaterp::op(b, a);
6760 }
6761
6762 #endif // softfloat_h
6763
6764 // leq
6765
op(std::uint64_t * a,std::uint64_t * b)6766 inline bool Leq::op(std::uint64_t *a, std::uint64_t *b)
6767 { return !Greaterp::op(a, b);
6768 }
6769
op(std::uint64_t * a,std::int64_t b)6770 inline bool Leq::op(std::uint64_t *a, std::int64_t b)
6771 { return !Greaterp::op(a, b);
6772 }
6773
op(std::int64_t a,std::uint64_t * b)6774 inline bool Leq::op(std::int64_t a, std::uint64_t *b)
6775 { return !Greaterp::op(a, b);
6776 }
6777
op(std::int64_t a,std::int64_t b)6778 inline bool Leq::op(std::int64_t a, std::int64_t b)
6779 { return a <= b;
6780 }
6781
op(std::int64_t a,float b)6782 inline bool Leq::op(std::int64_t a, float b)
6783 { return Leq::op(a, static_cast<double>(b));
6784 }
6785
op(std::uint64_t * a,float b)6786 inline bool Leq::op(std::uint64_t *a, float b)
6787 { return Leq::op(a, static_cast<double>(b));
6788 }
6789
op(float a,std::int64_t b)6790 inline bool Leq::op(float a, std::int64_t b)
6791 { return Leq::op(static_cast<double>(a), b);
6792 }
6793
op(float a,std::uint64_t * b)6794 inline bool Leq::op(float a, std::uint64_t *b)
6795 { return Leq::op(static_cast<double>(a), b);
6796 }
6797
// Is the int64 a less than or equal to the double b? Comparisons
// involving a NaN yield false (file-wide convention).
inline bool Leq::op(std::int64_t a, double b)
{ const std::int64_t range = static_cast<std::int64_t>(1)<<53;
// |a| <= 2^53 converts to double exactly, so compare in floating point.
  if (a >= -range && a <= range) return static_cast<double>(a) <= b;
// 9223372036854775808.0 is 2^63: such b (including +infinity) exceeds
// every int64_t, and anything below -2^63 is smaller than every int64_t.
  if (b >= 9223372036854775808.0) return true;
  else if (b < -9223372036854775808.0) return false;
  if (std::isnan(b)) return false;
// Here 2^53 <= |b| < 2^63, so b converts to int64_t without rounding.
  return a <= static_cast<std::int64_t>(b);
}
6806
op(std::uint64_t * a,double b)6807 inline bool Leq::op(std::uint64_t *a, double b)
6808 { std::size_t lena = number_size(a);
6809 if (lena == 1) return Lessp::op(static_cast<std::int64_t>(a[0]), b);
6810 return greaterpfloat(a, lena, b, CompareMode::LEQ);
6811 }
6812
op(double a,std::int64_t b)6813 inline bool Leq::op(double a, std::int64_t b)
6814 { return Geq::op(b, a);
6815 }
6816
op(double a,std::uint64_t * b)6817 inline bool Leq::op(double a, std::uint64_t *b)
6818 { return Geq::op(b, a);
6819 }
6820
6821 #ifdef softfloat_h
6822
op(std::int64_t a,float128_t b)6823 inline bool Leq::op(std::int64_t a, float128_t b)
6824 { return f128_le(i64_to_f128(a), b);
6825 return false;
6826 }
6827
op(std::uint64_t * a,float128_t b)6828 inline bool Leq::op(std::uint64_t *a, float128_t b)
6829 { std::size_t lena = number_size(a);
6830 if (lena == 1) return Leq::op(static_cast<std::int64_t>(a[0]), b);
6831 return greaterpbigfloat(a, lena, b, false, true);
6832 }
6833
op(float128_t a,std::int64_t b)6834 inline bool Leq::op(float128_t a, std::int64_t b)
6835 { return Geq::op(b, a);
6836 }
6837
op(float128_t a,std::uint64_t * b)6838 inline bool Leq::op(float128_t a, std::uint64_t *b)
6839 { return Geq::op(b, a);
6840 }
6841
6842 #endif // softfloat_h
6843
6844
6845 // Negation, addition and subtraction. These are easy apart from a mess
6846 // concerning the representation of positive numbers that risk having the
6847 // most significant bit of their top word a 1, and the equivalent for
6848 // negative numbers.
6849 // Boolean operations all treat negative numbers as if there had been an
6850 // unending string of 1 bits before the stop bit that is stored.
6851 //=========================================================================
6852 //=========================================================================
6853
6854
6855 // Negation. Note that because I am using 2s complement the result could be
6856 // one word longer or shorter than the input. For instance if you negate
6857 // [0x8000000000000000] (a negative value) you get [0,0x8000000000000000],
6858 // and vice versa.
6859
// Negate the lena-digit bignum a into r, setting lenr to the result
// length. Because the representation is 2s complement the result can be
// one digit longer or shorter than the input, so r must have room for
// lena+1 digits (callers such as Minus::op reserve n+1).
inline void bignegate(const std::uint64_t *a, std::size_t lena,
                      std::uint64_t *r, std::size_t &lenr)
{ internal_negate(a, lena, r);
// When I negate (-(2^(64n-1))) I will need to place a zero word ahead of the
// value that is now positive, making the bignum one digit longer.
// If I have 2^(64n-1) it will have been represented with that padding zero
// ahead of it, but when negated the bignum can shrink.
  if (r[lena-1]==topbit) r[lena++] = 0;
  else if (r[lena-1]==UINT64_C(0xffffffffffffffff) && lena>1 &&
      negative(r[lena-2])) lena--;
  lenr = lena;
}
6872
op(std::uint64_t * a)6873 inline std::intptr_t Minus::op(std::uint64_t *a)
6874 { std::size_t n = number_size(a);
6875 push(a);
6876 std::uint64_t *p = reserve(n+1);
6877 pop(a);
6878 std::size_t final_n;
6879 bignegate(a, n, p, final_n);
6880 return confirm_size(p, n+1, final_n);
6881 }
6882
6883 // The following can only be called via op_dispatch1(), and in that
6884 // case the argument has to have started off as a fixnum. In such cases
6885 // the result will also be a fixnum except when negating MIN_FIXNUM. But
6886 // even in that case (-a) can not overflow 64-bit arithmetic because
6887 // the fixnum will have had at least one tag bit.
6888
op(std::int64_t a)6889 inline std::intptr_t Minus::op(std::int64_t a)
6890 { if (a == MIN_FIXNUM) return int_to_bignum(-a);
6891 else return int_to_handle(-a);
6892 }
6893
op(std::uint64_t * a)6894 inline std::intptr_t Add1::op(std::uint64_t *a)
6895 { return Plus::op(a, 1);
6896 }
6897
op(std::int64_t a)6898 inline std::intptr_t Add1::op(std::int64_t a)
6899 { return int_to_bignum(a+1);
6900 }
6901
op(std::uint64_t * a)6902 inline std::intptr_t Sub1::op(std::uint64_t *a)
6903 { return Plus::op(a, -1);
6904 }
6905
op(std::int64_t a)6906 inline std::intptr_t Sub1::op(std::int64_t a)
6907 { return int_to_bignum(a-1);
6908 }
6909
op(std::uint64_t * a)6910 inline std::intptr_t Abs::op(std::uint64_t *a)
6911 { std::size_t n = number_size(a);
6912 if (!negative(a[n-1]))
6913 { push(a);
6914 std::uint64_t *r = reserve(n);
6915 pop(a);
6916 std::memcpy(r, a, n*sizeof(std::uint64_t));
6917 return confirm_size(r, n, n);
6918 }
6919 push(a);
6920 std::uint64_t *r = reserve(n+1);
6921 pop(a);
6922 std::size_t final_n;
6923 bignegate(a, n, r, final_n);
6924 return confirm_size(r, n+1, final_n);
6925 }
6926
6927 // The following can only be called via op_dispatch1(), and in that
6928 // case the argument has to have started off as a fixnum. In such cases
6929 // the result will also be a fixnum except when negating MIN_FIXNUM. But
6930 // even in that case (-a) can not overflow 64-bit arithmetic because
6931 // the fixnum will have had at least one tag bit.
6932
op(std::int64_t a)6933 inline std::intptr_t Abs::op(std::int64_t a)
6934 { if (a == MIN_FIXNUM) return unsigned_int_to_bignum(-a);
6935 else return int_to_handle(a<0 ? -a : a);
6936 }
6937
6938 // The "bitnot" operation is simple and length can not change.
6939
// Bitwise NOT of a bignum. In 2s complement ~a never changes the number
// of digits needed, so lenr is always lena.
inline void biglognot(const std::uint64_t *a, std::size_t lena,
                      std::uint64_t *r, std::size_t &lenr)
{ std::size_t i = 0;
  while (i < lena)
  { r[i] = ~a[i];
    i++;
  }
  lenr = lena;
}
6947
op(std::uint64_t * a)6948 inline std::intptr_t Lognot::op(std::uint64_t *a)
6949 { std::size_t n = number_size(a);
6950 push(a);
6951 std::uint64_t *p = reserve(n+1);
6952 pop(a);
6953 std::size_t final_n;
6954 biglognot(a, n, p, final_n);
6955 return confirm_size(p, n+1, final_n);
6956 }
6957
op(std::int64_t a)6958 inline std::intptr_t Lognot::op(std::int64_t a)
6959 { return int_to_handle(~a);
6960 }
6961
6962 // logand
6963
// Bitwise AND of two bignums where a is known to be at least as long as
// b. The result goes to r with its digit count in lenr.
inline void ordered_biglogand(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *r, std::size_t &lenr)
{ for (std::size_t i=0; i<lenb; i++)
      r[i] = a[i] & b[i];
// If b is negative its notional higher digits are all 1s, so the high
// part of a passes through unchanged; otherwise those ANDs are all zero
// and the result is only lenb digits long.
  if (negative(b[lenb-1]))
  { for (std::size_t i=lenb; i<lena; i++) r[i] = a[i];
    lenr = lena;
  }
  else lenr = lenb;
// Normalise away any redundant leading digits of a positive result.
  truncate_positive(r, lenr);
}

// General AND: dispatch to ordered_biglogand with the longer input first.
inline void biglogand(const std::uint64_t *a, std::size_t lena,
                      const std::uint64_t *b, std::size_t lenb,
                      std::uint64_t *r, std::size_t &lenr)
{ if (lena >= lenb) return ordered_biglogand(a, lena, b, lenb, r,
                                             lenr);
  else return ordered_biglogand(b, lenb, a, lena, r, lenr);
}
6985
op(std::uint64_t * a,std::uint64_t * b)6986 inline std::intptr_t Logand::op(std::uint64_t *a, std::uint64_t *b)
6987 { std::size_t lena = number_size(a);
6988 std::size_t lenb = number_size(b);
6989 std::size_t n;
6990 if (lena >= lenb) n = lena;
6991 else n = lenb;
6992 push(a); push(b);
6993 std::uint64_t *p = reserve(n);
6994 pop(b); pop(a);
6995 std::size_t final_n;
6996 biglogand(a, lena, b, lenb, p, final_n);
6997 return confirm_size(p, n, final_n);
6998 }
6999
7000 // The next two are not optimised - a case of (logand bignum positive-fixnum)
7001 // is guaranteed to end up a fixnum so could be done more slickly - however
7002 // I am not going to expect that to be on the critical performance path for
7003 // enough programs for me to worry too much!
7004
op(std::uint64_t * a,std::int64_t b)7005 inline std::intptr_t Logand::op(std::uint64_t *a, std::int64_t b)
7006 { std::size_t lena = number_size(a);
7007 push(a);
7008 std::uint64_t *p = reserve(lena);
7009 pop(a);
7010 std::size_t final_n;
7011 std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
7012 biglogand(a, lena, bb, 1, p, final_n);
7013 return confirm_size(p, lena, final_n);
7014 }
7015
op(std::int64_t a,std::uint64_t * b)7016 inline std::intptr_t Logand::op(std::int64_t a, std::uint64_t *b)
7017 { std::size_t lenb = number_size(b);
7018 push(b);
7019 std::uint64_t *p = reserve(lenb);
7020 pop(b);
7021 std::size_t final_n;
7022 std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
7023 biglogand(aa, 1, b, lenb, p, final_n);
7024 return confirm_size(p, lenb, final_n);
7025 }
7026
op(std::int64_t a,std::int64_t b)7027 inline std::intptr_t Logand::op(std::int64_t a, std::int64_t b)
7028 { return int_to_handle(a & b);
7029 }
7030
7031 // logor
7032
// Bitwise OR of two bignums where a is known to be at least as long as
// b. The result goes to r with its digit count in lenr.
inline void ordered_biglogor(const std::uint64_t *a, std::size_t lena,
                             const std::uint64_t *b, std::size_t lenb,
                             std::uint64_t *r, std::size_t &lenr)
{ for (std::size_t i=0; i<lenb; i++)
      r[i] = a[i] | b[i];
// If b is negative its notional higher digits are all 1s, so ORing them
// swamps the high part of a and the result is only lenb digits long;
// otherwise the high digits of a pass through unchanged.
  if (negative(b[lenb-1])) lenr = lenb;
  else
  { for (std::size_t i=lenb; i<lena; i++) r[i] = a[i];
    lenr = lena;
  }
// Normalise away any redundant leading digits of a negative result.
  truncate_negative(r, lenr);
}

// General OR: dispatch to ordered_biglogor with the longer input first.
inline void biglogor(const std::uint64_t *a, std::size_t lena,
                     const std::uint64_t *b, std::size_t lenb,
                     std::uint64_t *r, std::size_t &lenr)
{ if (lena >= lenb) return ordered_biglogor(a, lena, b, lenb, r,
                                            lenr);
  else return ordered_biglogor(b, lenb, a, lena, r, lenr);
}
7053
op(std::uint64_t * a,std::uint64_t * b)7054 inline std::intptr_t Logor::op(std::uint64_t *a, std::uint64_t *b)
7055 { std::size_t lena = number_size(a);
7056 std::size_t lenb = number_size(b);
7057 std::size_t n;
7058 if (lena >= lenb) n = lena;
7059 else n = lenb;
7060 push(a); push(b);
7061 std::uint64_t *p = reserve(n);
7062 pop(b); pop(a);
7063 std::size_t final_n;
7064 biglogor(a, lena, b, lenb, p, final_n);
7065 return confirm_size(p, n, final_n);
7066 }
7067
op(std::uint64_t * a,std::int64_t b)7068 inline std::intptr_t Logor::op(std::uint64_t *a, std::int64_t b)
7069 { std::size_t lena = number_size(a);
7070 push(a);
7071 std::uint64_t *p = reserve(lena);
7072 pop(a);
7073 std::size_t final_n;
7074 std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
7075 biglogor(a, lena, bb, 1, p, final_n);
7076 return confirm_size(p, lena, final_n);
7077 }
7078
op(std::int64_t a,std::uint64_t * b)7079 inline std::intptr_t Logor::op(std::int64_t a, std::uint64_t *b)
7080 { std::size_t lenb = number_size(b);
7081 push(b);
7082 std::uint64_t *p = reserve(lenb);
7083 pop(b);
7084 std::size_t final_n;
7085 std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
7086 biglogor(aa, 1, b, lenb, p, final_n);
7087 return confirm_size(p, lenb, final_n);
7088 }
7089
// Bitwise OR of two fixnum-sized values: the OR of two fixnum payloads
// still fits in a fixnum, so no allocation is needed.
inline std::intptr_t Logor::op(std::int64_t a, std::int64_t b)
{ return int_to_handle(a | b);
}
7093
7094 // logxor
7095
// Bitwise XOR of two bignums, given lena >= lenb. Above b's top word the
// remaining words of a are XORed with b's sign extension: complemented
// when b is negative, copied unchanged otherwise.
inline void ordered_biglogxor(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *r, std::size_t &lenr)
{ std::size_t i;
    for (i=0; i<lenb; i++)
        r[i] = a[i] ^ b[i];
    if (negative(b[lenb-1]))
    { for (; i<lena; i++)
            r[i] = ~a[i];
    }
    else
    { for (; i<lena; i++)
            r[i] = a[i];
    }
    lenr = lena;
// The logxor operation can cause the result to shrink in either
// direction, so trim redundant leading 0 or all-1 words.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}
7116
biglogxor(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * r,std::size_t & lenr)7117 inline void biglogxor(const std::uint64_t *a, std::size_t lena,
7118 const std::uint64_t *b, std::size_t lenb,
7119 std::uint64_t *r, std::size_t &lenr)
7120 { if (lena >= lenb) return ordered_biglogxor(a, lena, b, lenb, r,
7121 lenr);
7122 else return ordered_biglogxor(b, lenb, a, lena, r, lenr);
7123 }
7124
op(std::uint64_t * a,std::uint64_t * b)7125 inline std::intptr_t Logxor::op(std::uint64_t *a, std::uint64_t *b)
7126 { std::size_t lena = number_size(a);
7127 std::size_t lenb = number_size(b);
7128 std::size_t n;
7129 if (lena >= lenb) n = lena;
7130 else n = lenb;
7131 push(a); push(b);
7132 std::uint64_t *p = reserve(n);
7133 pop(b); pop(a);
7134 std::size_t final_n;
7135 biglogxor(a, lena, b, lenb, p, final_n);
7136 return confirm_size(p, n, final_n);
7137 }
7138
op(std::uint64_t * a,std::int64_t b)7139 inline std::intptr_t Logxor::op(std::uint64_t *a, std::int64_t b)
7140 { std::size_t lena = number_size(a);
7141 push(a);
7142 std::uint64_t *p = reserve(lena);
7143 pop(a);
7144 std::size_t final_n;
7145 std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
7146 biglogxor(a, lena, bb, 1, p, final_n);
7147 return confirm_size(p, lena, final_n);
7148 }
7149
op(std::int64_t a,std::uint64_t * b)7150 inline std::intptr_t Logxor::op(std::int64_t a, std::uint64_t *b)
7151 { std::size_t lenb = number_size(b);
7152 push(b);
7153 std::uint64_t *p = reserve(lenb);
7154 pop(b);
7155 std::size_t final_n;
7156 std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
7157 biglogxor(aa, 1, b, lenb, p, final_n);
7158 return confirm_size(p, lenb, final_n);
7159 }
7160
// Bitwise XOR of two fixnum-sized values: the XOR of two fixnum payloads
// still fits in a fixnum.
inline std::intptr_t Logxor::op(std::int64_t a, std::int64_t b)
{ return int_to_handle(a ^ b);
}
7164
7165
// Bitwise equivalence (logeqv, the complement of XOR) of two bignums,
// given lena >= lenb. Above b's top word the remaining words of a are
// combined with b's sign extension: copied when b is negative (since
// ~x ^ allbits == x), complemented otherwise.
inline void ordered_biglogeqv(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *r, std::size_t &lenr)
{ std::size_t i;
    for (i=0; i<lenb; i++)
        r[i] = ~a[i] ^ b[i];
    if (negative(b[lenb-1]))
    { for (; i<lena; i++)
            r[i] = a[i];
    }
    else
    { for (; i<lena; i++)
            r[i] = ~a[i];
    }
    lenr = lena;
// The logeqv operation can cause the result to shrink in either
// direction, so trim redundant leading 0 or all-1 words.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}
7186
biglogeqv(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * r,std::size_t & lenr)7187 inline void biglogeqv(const std::uint64_t *a, std::size_t lena,
7188 const std::uint64_t *b, std::size_t lenb,
7189 std::uint64_t *r, std::size_t &lenr)
7190 { if (lena >= lenb) return ordered_biglogeqv(a, lena, b, lenb, r,
7191 lenr);
7192 else return ordered_biglogeqv(b, lenb, a, lena, r, lenr);
7193 }
7194
op(std::uint64_t * a,std::uint64_t * b)7195 inline std::intptr_t Logeqv::op(std::uint64_t *a, std::uint64_t *b)
7196 { std::size_t lena = number_size(a);
7197 std::size_t lenb = number_size(b);
7198 std::size_t n;
7199 if (lena >= lenb) n = lena;
7200 else n = lenb;
7201 push(a); push(b);
7202 std::uint64_t *p = reserve(n);
7203 pop(b); pop(a);
7204 std::size_t final_n;
7205 biglogeqv(a, lena, b, lenb, p, final_n);
7206 return confirm_size(p, n, final_n);
7207 }
7208
op(std::uint64_t * a,std::int64_t b)7209 inline std::intptr_t Logeqv::op(std::uint64_t *a, std::int64_t b)
7210 { std::size_t lena = number_size(a);
7211 push(a);
7212 std::uint64_t *p = reserve(lena);
7213 pop(a);
7214 std::size_t final_n;
7215 std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
7216 biglogeqv(a, lena, bb, 1, p, final_n);
7217 return confirm_size(p, lena, final_n);
7218 }
7219
op(std::int64_t a,std::uint64_t * b)7220 inline std::intptr_t Logeqv::op(std::int64_t a, std::uint64_t *b)
7221 { std::size_t lenb = number_size(b);
7222 push(b);
7223 std::uint64_t *p = reserve(lenb);
7224 pop(b);
7225 std::size_t final_n;
7226 std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
7227 biglogeqv(aa, 1, b, lenb, p, final_n);
7228 return confirm_size(p, lenb, final_n);
7229 }
7230
// Bitwise equivalence (complement of XOR) of two fixnum-sized values.
inline std::intptr_t Logeqv::op(std::int64_t a, std::int64_t b)
{ return int_to_handle(~a ^ b);
}
7234
// Forward declaration: bigleftshift and bigrightshift call each other to
// handle negative shift amounts, so one must be declared ahead of both.
inline void bigrightshift(const std::uint64_t *a, std::size_t lena,
                          std::int64_t n,
                          std::uint64_t *r, std::size_t &lenr);
7238
// Shift a bignum left by n bits, writing the result and its length into
// (r, lenr). A negative n delegates to bigrightshift. The destination
// must have room for lena + n/64 + 1 words.
inline void bigleftshift(const std::uint64_t *a, std::size_t lena,
                         std::int64_t n,
                         std::uint64_t *r, std::size_t &lenr)
{ if (n == 0)
    { internal_copy(a, lena, r);
        lenr = lena;
        return;
    }
    else if (n < 0)
    { bigrightshift(a, lena, -n, r, lenr);
        return;
    }
// Split the shift into a whole-word part and a residual bit count.
    std::size_t words = n/64;
    std::size_t bits = n % 64;
    for (std::size_t i=0; i<words; i++) r[i] = 0;
    if (bits == 0)
// A whole-word shift is a simple copy with an offset.
    { for (std::size_t i=0; i<lena; i++)
            r[i+words] = a[i];
        lenr = lena+words;
    }
    else
// Each result word combines bits from two adjacent source words. Note
// that because bits != 0 here the (64-bits) shifts are well defined.
    { r[words] = a[0]<<bits;
        for (std::size_t i=1; i<lena; i++)
            r[i+words] = (a[i]<<bits) |
                         (a[i-1]>>(64-bits));
// The top word is the shifted-in sign extension of a merged with the
// bits pushed out of a's top word.
        r[words+lena] = (negative(a[lena-1]) ?
                         static_cast<std::uint64_t>(-1)<<bits :
                         0) | (a[lena-1]>>(64-bits));
        lenr = lena+words+1;
    }
// Trim redundant leading 0 or all-1 words from the result.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}
7272
// Forward declaration so code here can refer to rightshift_b ahead of its
// definition later in the file.
inline std::intptr_t rightshift_b(std::uint64_t *a, std::int64_t n);
7274
op(std::uint64_t * a,std::int64_t n)7275 inline std::intptr_t LeftShift::op(std::uint64_t *a, std::int64_t n)
7276 { if (n == 0) return copy_if_no_garbage_collector(a);
7277 else if (n < 0) return RightShift::op(a, -n);
7278 std::size_t lena = number_size(a);
7279 std::size_t nr = lena + (n/64) + 1;
7280 push(a);
7281 std::uint64_t *p = reserve(nr);
7282 pop(a);
7283 std::size_t final_n;
7284 bigleftshift(a, lena, n, p, final_n);
7285 return confirm_size(p, nr, final_n);
7286 }
7287
op(std::int64_t aa,std::int64_t n)7288 inline std::intptr_t LeftShift::op(std::int64_t aa, std::int64_t n)
7289 { if (n == 0) return int_to_handle(aa);
7290 else if (n < 0) return RightShift::op(aa, -n);
7291 std::size_t nr = (n/64) + 2;
7292 std::uint64_t *p = reserve(nr);
7293 std::size_t final_n;
7294 std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
7295 bigleftshift(a, 1, n, p, final_n);
7296 return confirm_size(p, nr, final_n);
7297 }
7298
// Arithmetic right shift of a bignum by n bits, writing the result and
// its length into (r, lenr). A negative n delegates to bigleftshift.
inline void bigrightshift(const std::uint64_t *a, std::size_t lena,
                          std::int64_t n,
                          std::uint64_t *r, std::size_t &lenr)
{ if (n == 0)
    { internal_copy(a, lena, r);
        lenr = lena;
        return;
    }
    else if (n < 0)
    { bigleftshift(a, lena, -n, r, lenr);
        return;
    }
// Split the shift into a whole-word part and a residual bit count.
    std::size_t words = n/64;
    std::size_t bits = n % 64;
    if (words >= lena)
// Shifting by at least the whole width leaves just the sign: 0 or -1.
    { r[0] = negative(a[lena-1]) ? -static_cast<std::uint64_t>(1) : 0;
        lenr = 1;
    }
    else if (bits == 0)
// A whole-word shift is a simple copy with an offset.
    { for (std::size_t i=0; i<lena-words; i++)
            r[i] = a[i+words];
        lenr = lena-words;
    }
    else
// Each result word combines bits from two adjacent source words; the
// top word uses ASR so the sign bit is duplicated on the way down.
    { for (std::size_t i=0; i<lena-words-1; i++)
            r[i] = (a[i+words]>>bits) |
                   (a[i+words+1]<<(64-bits));
        r[lena-words-1] = ASR(a[lena-1], bits);
        lenr = lena-words;
    }
// Trim redundant leading 0 or all-1 words from the result.
    truncate_positive(r, lenr);
    truncate_negative(r, lenr);
}
7332
op(std::uint64_t * a,std::int64_t n)7333 inline std::intptr_t RightShift::op(std::uint64_t *a, std::int64_t n)
7334 { if (n == 0) return copy_if_no_garbage_collector(a);
7335 else if (n < 0) return LeftShift::op(a, -n);
7336 std::size_t lena = number_size(a);
7337 std::size_t nr;
7338 if (lena > static_cast<std::size_t>(n)/64) nr = lena - n/64;
7339 else nr = 1;
7340 push(a);
7341 std::uint64_t *p = reserve(nr);
7342 pop(a);
7343 std::size_t final_n;
7344 bigrightshift(a, lena, n, p, final_n);
7345 return confirm_size(p, nr, final_n);
7346 }
7347
// Arithmetic right shift of a fixnum-sized value. The result always fits
// back into a fixnum, so no allocation is needed.
inline std::intptr_t RightShift::op(std::int64_t a, std::int64_t n)
{ if (n == 0) return int_to_handle(a);
    else if (n < 0) return LeftShift::op(a, -n);
// Shifts of 64 and up obviously lose all the input data apart from its
// sign, but so does a shift by 63.
    if (n >= 63) return int_to_handle(a>=0 ? 0 : -1);
// Because C++ does not guarantee that right shifts on signed values
// duplicate the sign bit I perform the "shift" here using division by
// a power of 2. Because I have n <= 62 here I will not get overflow.
// Masking off the low bits first makes the division exact, so rounding
// direction is not an issue.
    std::int64_t q = static_cast<std::int64_t>(1)<<n;
    return int_to_handle((a & ~(q-1))/q);
}
7360
// Number of significant bits in a bignum (Lisp integer-length style),
// delegated to bignum_bits().
inline std::size_t IntegerLength::op(std::uint64_t *a)
{ return bignum_bits(a, number_size(a));
}
7364
// Number of significant bits in a fixnum-sized value, excluding the sign:
// 0 and -1 have length 0, and a negative value aa is measured via
// |aa| - 1 (the width of its twos complement representation).
inline std::size_t IntegerLength::op(std::int64_t aa)
{ std::uint64_t a;
    if (aa == 0 || aa == -1) return 0;
// The unsigned negation avoids signed overflow even for INT64_MIN.
    else if (aa < 0) a = -static_cast<std::uint64_t>(aa) - 1;
    else a = aa;
    return static_cast<std::size_t>(64-nlz(a));
}
7372
// Position (1-based) of the lowest interesting bit of a bignum, with 0
// returned for zero. For a positive value this isolates the lowest set
// bit with w & (-w).
// NOTE(review): for a negative value this scans the COMPLEMENT of a
// (skipping all-1 words, then isolating the lowest set bit of ~a), which
// matches the std::int64_t overload below — confirm this is the intended
// semantics for negative inputs.
inline std::size_t Low_bit::op(std::uint64_t *a)
{ std::size_t lena = number_size(a);
    if (negative(a[lena-1])) // count trailing 1 bits!
    { std::size_t r=0, i=0;
        while (a[i++]==-static_cast<std::uint64_t>(1)) r += 64;
        std::uint64_t w = ~a[i-1];
        return static_cast<std::size_t>(64-nlz(w & (-w))+r);
    }
    else if (lena==1 && a[0]==0) return 0;
    else
    { std::size_t r=0, i=0;
// Skip whole words of zeros, then isolate the lowest set bit.
        while (a[i++]==0) r += 64;
        std::uint64_t w = a[i-1];
        return static_cast<std::size_t>(64-nlz(w & (-w))+r);
    }
}
7389
// Position (1-based) of the lowest interesting bit of a fixnum-sized
// value, with 0 returned for zero.
// NOTE(review): for a negative value this works on the complement of aa,
// i.e. it locates the lowest ZERO bit of aa — consistent with the bignum
// overload above; confirm the intended semantics.
inline std::size_t Low_bit::op(std::int64_t aa)
{ std::uint64_t a;
    if (aa == 0) return 0;
    else if (aa < 0) a = ~static_cast<std::uint64_t>(aa);
    else a = aa;
    a = a & (-a); // keeps only the lowest bit
    return static_cast<std::size_t>(64-nlz(a));
}
7398
// Count of "interesting" bits in a bignum: set bits for a non-negative
// value, clear bits (bits of the complement) for a negative one.
inline std::size_t Logcount::op(std::uint64_t *a)
{ std::size_t lena = number_size(a);
    std::size_t r = 0;
    if (negative(a[lena-1]))
    { for (std::size_t i=0; i<lena; i++) r += popcount(~a[i]);
    }
    else for (std::size_t i=0; i<lena; i++) r += popcount(a[i]);
    return r;
}
7408
// Count of "interesting" bits in a fixnum-sized value: set bits when
// non-negative, bits of the complement when negative (matching the
// bignum case above).
inline std::size_t Logcount::op(std::int64_t a)
{ if (a < 0) return static_cast<std::size_t>(popcount(~a));
    else return static_cast<std::size_t>(popcount(a));
}
7413
// Test bit n (0-based from the least significant end) of a bignum. Bits
// beyond the stored words are copies of the sign bit.
inline bool Logbitp::op(std::uint64_t *a, std::size_t n)
{ std::size_t lena = number_size(a);
    if (n >= 64*lena) return negative(a[lena-1]);
    return (a[n/64] & (static_cast<std::uint64_t>(1) << (n%64))) != 0;
}
7419
// Test bit n (0-based) of a fixnum-sized value. Bits at position 64 and
// beyond are copies of the sign bit.
inline bool Logbitp::op(std::int64_t a, std::size_t n)
{ if (n >= 64) return (a < 0);
    else return (a & (static_cast<std::uint64_t>(1) << n)) != 0;
}
7424
// Addition when the length of a is at least that of b.
7426
// r = a + b for twos complement bignums, given lena >= lenb >= 1. The
// destination must have lena+1 words available; lenr reports how many of
// them are actually significant.
inline void ordered_bigplus(const std::uint64_t *a, std::size_t lena,
                            const std::uint64_t *b, std::size_t lenb,
                            std::uint64_t *r, std::size_t &lenr)
{ arithlib_assert(lena >= lenb);
    std::uint64_t carry = 0;
    std::size_t i = 0;
// The lowest digits can be added without there being any carry-in.
    carry = add_with_carry(a[0], b[0], r[0]);
// Add the digits that (a) and (b) have in common
    for (i=1; i<lenb; i++)
        carry = add_with_carry(a[i], b[i], carry, r[i]);
// From there on up treat (b) as if it had its sign bit extended to the
// left.
    std::uint64_t topb = negative(b[lenb-1]) ? allbits : 0;
    for (; i<lena; i++)
        carry = add_with_carry(a[i], topb, carry, r[i]);
// And of course (a) must also be treated as being extended by its sign bit.
    std::uint64_t topa = negative(a[lena-1]) ? allbits : 0;
// The result calculated here is 1 word longer than (a), and addition
// can never carry further than that.
    r[i] = topa + topb + carry;
// However because I am using (2s complement) signed arithmetic the result
// could be shorter, so I will check for that and return the length that
// is actually needed.
    while (r[i]==0 && i>0 && positive(r[i-1])) i--;
    while (r[i]==allbits && i>0 && negative(r[i-1])) i--;
    lenr = i+1;
}
7455
7456 // Add a small number to a bignum
7457
bigplus_small(const std::uint64_t * a,std::size_t lena,std::int64_t n,std::uint64_t * r,std::size_t & lenr)7458 inline void bigplus_small(const std::uint64_t *a, std::size_t lena,
7459 std::int64_t n,
7460 std::uint64_t *r, std::size_t &lenr)
7461 { std::uint64_t w[1];
7462 w[0] = static_cast<std::uint64_t>(n);
7463 ordered_bigplus(a, lena, w, 1, r, lenr);
7464 }
7465
7466 // When I do a general addition I will not know which input is longer.
7467
bigplus(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * r,std::size_t & lenr)7468 inline void bigplus(const std::uint64_t *a, std::size_t lena,
7469 const std::uint64_t *b, std::size_t lenb,
7470 std::uint64_t *r, std::size_t &lenr)
7471 { if (lena >= lenb) return ordered_bigplus(a, lena, b, lenb, r,
7472 lenr);
7473 else return ordered_bigplus(b, lenb, a, lena, r, lenr);
7474 }
7475
op(std::uint64_t * a,std::uint64_t * b)7476 inline std::intptr_t Plus::op(std::uint64_t *a, std::uint64_t *b)
7477 { std::size_t lena = number_size(a);
7478 std::size_t lenb = number_size(b);
7479 std::size_t n;
7480 if (lena >= lenb) n = lena+1;
7481 else n = lenb+1;
7482 push(a); push(b);
7483 std::uint64_t *p = reserve(n);
7484 pop(b); pop(a);
7485 std::size_t final_n;
7486 bigplus(a, lena, b, lenb, p, final_n);
7487 return confirm_size(p, n, final_n);
7488 }
7489
7490 // At present I implement the op_ii, opt_ib and opt_bi operations
7491 // by converting the integer argument to a 1-word bignum and dropping into
7492 // the general bignum code. This will generally be a long way from the
7493 // most efficient implementation, so at a later stage I will want to hone
7494 // the code to make it better!
7495
// Sum of two fixnum-sized values.
inline std::intptr_t Plus::op(std::int64_t a, std::int64_t b)
{
// The two integer arguments will in fact each have been derived from a
// tagged representation, and a consequence of that is that I can add
// them and be certain I will not get arithmetic overflow. However the
// resulting value may no longer be representable as a fixnum.
    std::int64_t c = a + b;
    if (fits_into_fixnum(c)) return int_to_handle(c);
// Now because there had not been overflow I know that the bignum will
// only need one word.
    std::uint64_t *r = reserve(1);
    r[0] = c;
    return confirm_size(r, 1, 1);
}
7510
op(std::int64_t a,std::uint64_t * b)7511 inline std::intptr_t Plus::op(std::int64_t a, std::uint64_t *b)
7512 { std::uint64_t aa[1];
7513 aa[0] = a;
7514 std::size_t lenb = number_size(b);
7515 push(b);
7516 std::uint64_t *r = reserve(lenb+1);
7517 pop(b);
7518 std::size_t final_n;
7519 bigplus(aa, 1, b, lenb, r, final_n);
7520 return confirm_size(r, lenb+1, final_n);
7521 }
7522
op(std::uint64_t * a,std::int64_t b)7523 inline std::intptr_t Plus::op(std::uint64_t *a, std::int64_t b)
7524 { std::size_t lena = number_size(a);
7525 std::uint64_t bb[1];
7526 bb[0] = b;
7527 push(a);
7528 std::uint64_t *r = reserve(lena+1);
7529 pop(a);
7530 std::size_t final_n;
7531 bigplus(a, lena, bb, 1, r, final_n);
7532 return confirm_size(r, lena+1, final_n);
7533 }
7534
bigplus_small(std::intptr_t aa,std::int64_t b)7535 inline std::intptr_t bigplus_small(std::intptr_t aa, std::int64_t b)
7536 { std::uint64_t *a = vector_of_handle(aa);
7537 std::size_t lena = number_size(a);
7538 push(a);
7539 std::uint64_t *p = reserve(lena+1);
7540 pop(a);
7541 std::size_t final_n;
7542 bigplus_small(a, lena, b, p, final_n);
7543 return confirm_size(p, lena+1, final_n);
7544 }
7545
7546 // For subtraction I implement both a-b and b-a. These work by
7547 // computing a + (~b) + 1 and (~a) + b + 1 respectively.
7548
// r = a - b for twos complement bignums, given lena >= lenb >= 1,
// computed as a + ~b + 1 (the initial carry of 1 supplies the "+1").
// The destination must have lena+1 words available.
inline void ordered_bigsubtract(const std::uint64_t *a,
                                std::size_t lena,
                                const std::uint64_t *b, std::size_t lenb,
                                std::uint64_t *r, std::size_t &lenr)
{ arithlib_assert(lena >= lenb);
    std::uint64_t carry = 1;
    std::size_t i;
// Add the digits that (a) and (b) have in common
    for (i=0; i<lenb; i++)
        carry = add_with_carry(a[i], ~b[i], carry, r[i]);
// From there on up treat (b) as if it had its sign bit extended to the
// left.
    std::uint64_t topb = negative(~b[lenb-1]) ? allbits : 0;
    for (; i<lena; i++)
        carry = add_with_carry(a[i], topb, carry, r[i]);
// And of course (a) must also be treated as being extended by its sign bit.
    std::uint64_t topa = negative(a[lena-1]) ? allbits : 0;
// The result calculated here is 1 word longer than (a), and addition
// can never carry further than that.
    r[i] = topa + topb + carry;
// However because I am using (2s complement) signed arithmetic the result
// could be shorter, so I will check for that and return the length that
// is actually needed.
    while (r[i]==0 && i>0 && positive(r[i-1])) i--;
    while (r[i]==allbits && i>0 && negative(r[i-1])) i--;
    lenr = i+1;
}
7576
// r = b - a for twos complement bignums, still with lena >= lenb >= 1,
// computed as ~a + b + 1. This is the "reversed" companion to
// ordered_bigsubtract for when the subtrahend is the longer operand.
inline void ordered_bigrevsubtract(const std::uint64_t *a,
                                   std::size_t lena,
                                   const std::uint64_t *b, std::size_t lenb,
                                   std::uint64_t *r, std::size_t &lenr)
{ arithlib_assert(lena >= lenb);
    std::uint64_t carry = 1;
    std::size_t i;
// Add the digits that (a) and (b) have in common
    for (i=0; i<lenb; i++)
        carry = add_with_carry(~a[i], b[i], carry, r[i]);
// From there on up treat (b) as if it had its sign bit extended to the
// left.
    std::uint64_t topb = negative(b[lenb-1]) ? allbits : 0;
    for (; i<lena; i++)
        carry = add_with_carry(~a[i], topb, carry, r[i]);
// And of course (a) must also be treated as being extended by its sign bit.
    std::uint64_t topa = negative(~a[lena-1]) ? allbits : 0;
// The result calculated here is 1 word longer than (a), and addition
// can never carry further than that.
    r[i] = topa + topb + carry;
// However because I am using (2s complement) signed arithmetic the result
// could be shorter, so I will check for that and return the length that
// is actually needed.
    while (r[i]==0 && i>0 && positive(r[i-1])) i--;
    while (r[i]==allbits && i>0 && negative(r[i-1])) i--;
    lenr = i+1;
}
7604
7605 // Subtract a small number from a bignum
7606
bigsubtract_small(const std::uint64_t * a,std::size_t lena,std::int64_t n,std::uint64_t * r,std::size_t & lenr)7607 inline void bigsubtract_small(const std::uint64_t *a,
7608 std::size_t lena,
7609 std::int64_t n,
7610 std::uint64_t *r, std::size_t &lenr)
7611 { std::uint64_t w[1];
7612 w[0] = static_cast<std::uint64_t>(n);
7613 ordered_bigsubtract(a, lena, w, 1, r, lenr);
7614 }
7615
7616 // subtract a bignum from a small number
7617
bigrevsubtract_small(const std::uint64_t * a,std::size_t lena,std::int64_t n,std::uint64_t * r,std::size_t & lenr)7618 inline void bigrevsubtract_small(const std::uint64_t *a,
7619 std::size_t lena,
7620 std::int64_t n,
7621 std::uint64_t *r, std::size_t &lenr)
7622 { std::uint64_t w[1];
7623 w[0] = static_cast<std::uint64_t>(n);
7624 ordered_bigrevsubtract(a, lena, w, 1, r, lenr);
7625 }
7626
7627
bigsubtract(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * r,std::size_t & lenr)7628 inline void bigsubtract(const std::uint64_t *a, std::size_t lena,
7629 const std::uint64_t *b, std::size_t lenb,
7630 std::uint64_t *r, std::size_t &lenr)
7631 { if (lena >= lenb) return ordered_bigsubtract(a, lena, b, lenb, r,
7632 lenr);
7633 else return ordered_bigrevsubtract(b, lenb, a, lena, r, lenr);
7634 }
7635
op(std::uint64_t * a,std::uint64_t * b)7636 inline std::intptr_t Difference::op(std::uint64_t *a,
7637 std::uint64_t *b)
7638 { std::size_t lena = number_size(a);
7639 std::size_t lenb = number_size(b);
7640 std::size_t n;
7641 if (lena >= lenb) n = lena+1;
7642 else n = lenb+1;
7643 push(a); push(b);
7644 std::uint64_t *p = reserve(n);
7645 pop(b); pop(a);
7646 std::size_t final_n;
7647 bigsubtract(a, lena, b, lenb, p, final_n);
7648 return confirm_size(p, n, final_n);
7649 }
7650
op(std::int64_t a,std::int64_t b)7651 inline std::intptr_t Difference::op(std::int64_t a, std::int64_t b)
7652 { std::uint64_t aa[1], bb[1];
7653 aa[0] = a;
7654 bb[0] = b;
7655 std::uint64_t *r = reserve(2);
7656 std::size_t final_n;
7657 bigsubtract(aa, 1, bb, 1, r, final_n);
7658 return confirm_size(r, 2, final_n);
7659 }
7660
op(std::int64_t a,std::uint64_t * b)7661 inline std::intptr_t Difference::op(std::int64_t a, std::uint64_t *b)
7662 { std::uint64_t aa[1];
7663 aa[0] = a;
7664 std::size_t lenb = number_size(b);
7665 push(b);
7666 std::uint64_t *r = reserve(lenb+1);
7667 pop(b);
7668 std::size_t final_n;
7669 bigsubtract(aa, 1, b, lenb, r, final_n);
7670 return confirm_size(r, lenb+1, final_n);
7671 }
7672
op(std::uint64_t * a,std::int64_t b)7673 inline std::intptr_t Difference::op(std::uint64_t *a, std::int64_t b)
7674 { std::size_t lena = number_size(a);
7675 std::uint64_t bb[1];
7676 bb[0] = b;
7677 push(a);
7678 std::uint64_t *r = reserve(lena+1);
7679 pop(a);
7680 std::size_t final_n;
7681 bigsubtract(a, lena, bb, 1, r, final_n);
7682 return confirm_size(r, lena+1, final_n);
7683 }
7684
7685
op(std::uint64_t * a,std::uint64_t * b)7686 inline std::intptr_t RevDifference::op(std::uint64_t *a,
7687 std::uint64_t *b)
7688 { std::size_t lena = number_size(a);
7689 std::size_t lenb = number_size(b);
7690 std::size_t n;
7691 if (lena >= lenb) n = lena+1;
7692 else n = lenb+1;
7693 push(a); push(b);
7694 std::uint64_t *p = reserve(n);
7695 pop(b); pop(a);
7696 std::size_t final_n;
7697 bigsubtract(b, lenb, a, lena, p, final_n);
7698 return confirm_size(p, n, final_n);
7699 }
7700
op(std::int64_t a,std::int64_t b)7701 inline std::intptr_t RevDifference::op(std::int64_t a, std::int64_t b)
7702 { std::uint64_t aa[1], bb[1];
7703 aa[0] = a;
7704 bb[0] = b;
7705 std::uint64_t *r = reserve(2);
7706 std::size_t final_n;
7707 bigsubtract(bb, 1, aa, 1, r, final_n);
7708 return confirm_size(r, 2, final_n);
7709 }
7710
op(std::int64_t a,std::uint64_t * b)7711 inline std::intptr_t RevDifference::op(std::int64_t a,
7712 std::uint64_t *b)
7713 { std::uint64_t aa[1];
7714 aa[0] = a;
7715 std::size_t lenb = number_size(b);
7716 push(b);
7717 std::uint64_t *r = reserve(lenb+1);
7718 pop(b);
7719 std::size_t final_n;
7720 bigsubtract(b, lenb, aa, 1, r, final_n);
7721 return confirm_size(r, lenb+1, final_n);
7722 }
7723
op(std::uint64_t * a,std::int64_t b)7724 inline std::intptr_t RevDifference::op(std::uint64_t *a,
7725 std::int64_t b)
7726 { std::size_t lena = number_size(a);
7727 std::uint64_t bb[1];
7728 bb[0] = b;
7729 push(a);
7730 std::uint64_t *r = reserve(lena+1);
7731 pop(a);
7732 std::size_t final_n;
7733 bigsubtract(bb, 1, a, lena, r, final_n);
7734 return confirm_size(r, lena+1, final_n);
7735 }
7736
7737 //=========================================================================
7738 //=========================================================================
7739 // multiplication, squaring and exponentiation.
7740 //=========================================================================
7741 //=========================================================================
7742
7743 // This multiplication code took much longer to write and debug than I had
7744 // expected. Classical multiplication was straightforward to implement,
7745 // but then while Karatsuba is at its heart simple, fitting it into a
7746 // framework where inputs of unequal size can be combined leaves much
7747 // opportunity for mistakes to creep in. Along with that I view this as
7748 // speed-critical, so I take a number of special cases and implement them
7749 // directly rather than via the general code. The earlier drafts of this
// I had ended up with code that was even more tangled and untidy than the
7751 // present version!
7752 // I experimented with versions of classical multiplication that worked using
7753 // pairs of digits at once (and that did not seem to help me), and for
// Karatsuba I tried both a version based on adding the high and low parts
7755 // of input numbers and the version here that subtracts them.
7756 // Ensuring that the twos complement treatment of signs is supported adds
7757 // further potential confusion!
7758
7759
7760
7761 // Karatsuba multiplication.
7762 // =========================
7763
7764 // The multiplication code has a signature something like
7765 // void mult(uint64_t *a, size_t lena,
7766 // uint64_t *b, size_t lenb,
7767 // uint64_t *c,
7768 // uint64_t *work_vector=NULL)
// where a and b are vectors with lena and lenb words in them respectively.
7770 // c is a vector and lena+lenb words of a product will be written into it.
7771 // Note that sometimes the top digit will end up as either 0 or -1.
7772 // w must be a workspace vector of length lenb+2*log(lenb) [the log is to
7773 // the base 2]. It is not needed if lenb is very short.
7774 //
7775
7776 // For use within the multiplication code I need variants on my
7777 // addition and subtraction code.
7778
7779 // I want:
7780 // kadd(a, lena, c, lenc); // c += a
7781
// c += a (with optional carry-in), where c has lenc >= lena words. Any
// carry out of the low lena words is propagated on up through c, and the
// final carry out of the top of c is returned.
inline std::uint64_t kadd(const std::uint64_t *a, std::size_t lena,
                          std::uint64_t *c, std::size_t lenc,
                          std::uint64_t carry=0)
{ std::size_t i;
    for (i=0; i<lena; i++)
        carry = add_with_carry(a[i], c[i], carry, c[i]);
// Ripple any remaining carry upward; stop as soon as it dies out.
    while (carry!=0 && i<lenc)
    { carry = add_with_carry(c[i], carry, c[i]);
        i++;
    }
    return carry;
}
7794
7795 // c = a - b. must have length(a) >= length(b).
7796
// c = a - b as unsigned multi-word values, requiring lena >= lenb. The
// result occupies lena words; the returned borrow is nonzero exactly
// when b was in fact greater than a.
inline std::uint64_t ksub(const std::uint64_t *a, std::size_t lena,
                          const std::uint64_t *b, std::size_t lenb,
                          std::uint64_t *c)
{ arithlib_assert(lena >= lenb);
    std::uint64_t borrow = 0;
    std::size_t i;
    for (i=0; i<lenb; i++)
        borrow = subtract_with_borrow(a[i], b[i], borrow, c[i]);
// Above b's top word only the borrow remains to be propagated.
    for (; i<lena; i++)
        borrow = subtract_with_borrow(a[i], borrow, c[i]);
    return borrow;
}
7809
kneg(std::uint64_t * a,std::size_t lena)7810 inline void kneg(std::uint64_t *a, std::size_t lena)
7811 { std::uint64_t carry = 0;
7812 for (std::size_t i=0; i<lena; i++)
7813 a[i] = add_with_carry(~a[i], carry, a[i]);
7814 }
7815
7816 // c = |a - b| and return an indication of which branch of the absolute
7817 // value function was used, ie whether we had a>=b or a<b. If a==b so
7818 // the result is zero the value is not terribly important. Must be
7819 // called with the first argument at least as long as the second.
7820
// c = |a - b| for unsigned multi-word values with lena >= lenb; c ends up
// lena words long. The return value reports which branch of the absolute
// value was taken: true when b-a was computed (a < b), false when a-b
// was (a >= b). If a == b the returned flag is of no importance.
inline bool absdiff(const std::uint64_t *a, std::size_t lena,
                    const std::uint64_t *b, std::size_t lenb,
                    std::uint64_t *c)
{
// I will do a cheap comparison of a and b first, based on an understanding
// that lena >= lenb. The result will be of length lena.
    arithlib_assert(lena >= lenb);
    if (lenb < lena ||
        b[lenb-1]<=a[lena-1])
    {
// If my cheap test suggests that a is the smaller one then I form (b-a).
// If that generates a borrow my "guess" was wrong, so I negate the
// result. For fully random inputs the cheap test is liable to be reliable.
// for values that have long sequences of 0 bits in their binary
// representation, eg values that are close to a power of 2 or ones that
// have a large power of 2 as a factor, the fallback may be activated
// more frequently.
        if (ksub(a, lena, b, lenb, c) != 0)
        { kneg(c, lena);
            return true;     // Have computed b-a
        }
        else return false;   // have computed a-b
    }
    else
// This branch is only reached when lena == lenb, so ksub's length
// precondition still holds with the operands exchanged.
    { if (ksub(b, lenb, a, lena, c) != 0)
        { kneg(c, lenb);
            return false;    // a-b
        }
        else return true;    // b-a
    }
}
7852
// I will have in-line code for a number of very small cases on the
// expectation that (a) these will arise especially often in many
// applications and (b) that the inline code will end up faster
// than general loops.
7857
7858 // Now code that multiplies 2-digit numbers together.
7859 // One version treats them as unsigned, the second as signed.
7860
// Unsigned 2x2-digit multiplication: (a1,a0) * (b1,b0) -> (c3,c2,c1,c0),
// an exact 256-bit product of two 128-bit values. As used throughout
// this file, multiply64(x, y, hi, lo) sets hi:lo = x*y and the
// five-argument form multiply64(x, y, c, hi, lo) sets hi:lo = x*y + c.
inline void mul2x2(std::uint64_t a1, std::uint64_t a0,
                   std::uint64_t b1, std::uint64_t b0,
                   std::uint64_t &c3, std::uint64_t &c2, std::uint64_t &c1,
                   std::uint64_t &c0)
{ std::uint64_t c1a, c2a, c2b, c3a;
    multiply64(a0, b0, c1a, c0);
    multiply64(a0, b1, c1a, c2a, c1a);
    multiply64(a1, b0, c1a, c2b, c1);
    multiply64(a1, b1, c2a, c3a, c2a);
// Merge the two middle partial products; the carry cannot overflow c3.
    c3a += add_with_carry(c2a, c2b, c2);
    c3 = c3a;
}
7873
// Form the signed 256-bit product {c3,c2,c1,c0} = {a1,a0} * {b1,b0}
// where each operand is a 2's complement 128-bit value: the top words
// (a1, b1) are signed and the low words unsigned.  The result top word
// c3 is likewise signed.
inline void mul2x2S(std::int64_t a1, std::uint64_t a0,
                    std::int64_t b1, std::uint64_t b0,
                    std::int64_t &c3, std::uint64_t &c2, std::uint64_t &c1,
                    std::uint64_t &c0)
{   std::uint64_t c1a;
    multiply64(a0, b0, c1a, c0);                       // a0*b0
    std::uint64_t c2a;
// The two cross products treat the signed operand as if unsigned; the
// excess that introduces is corrected at the end.
    multiply64(a0, static_cast<std::uint64_t>(b1), c1a, c2a, c1a);
    std::uint64_t c2b;
    multiply64(static_cast<std::uint64_t>(a1), b0, c1a, c2b, c1);
    std::int64_t c3a;
    signed_multiply64(a1, b1, c2a, c3a, c2a);          // top term, properly signed
    c3a = static_cast<std::int64_t>(
              static_cast<std::uint64_t>(c3a) + add_with_carry(c2a, c2b, c2a));
// Do the arithmetic in unsigned mode in case of overflow problems.
// If a1 was negative then viewing it as unsigned added 2^64 to it and so
// added b0*2^128 to the product: subtract b0 at the c2 position with the
// borrow propagating into c3.  Similarly for b1 with a0.
    if (a1 < 0) c3a = static_cast<std::int64_t>(
                              static_cast<std::uint64_t>(c3a) -
                              subtract_with_borrow(c2a, b0, c2a));
    if (b1 < 0) c3a = static_cast<std::int64_t>(
                              static_cast<std::uint64_t>(c3a) -
                              subtract_with_borrow(c2a, a0, c2a));
    c2 = c2a;
    c3 = c3a;
}
7898
mul3x2(std::uint64_t a2,std::uint64_t a1,std::uint64_t a0,std::uint64_t b1,std::uint64_t b0,std::uint64_t & c4,std::uint64_t & c3,std::uint64_t & c2,std::uint64_t & c1,std::uint64_t & c0)7899 inline void mul3x2(std::uint64_t a2, std::uint64_t a1,
7900 std::uint64_t a0,
7901 std::uint64_t b1, std::uint64_t b0,
7902 std::uint64_t &c4, std::uint64_t &c3, std::uint64_t &c2,
7903 std::uint64_t &c1, std::uint64_t &c0)
7904 { std::uint64_t c3a;
7905 mul2x2(a1, a0, b1, b0, c3, c2, c1, c0);
7906 multiply64(a2, b0, c2, c3a, c2);
7907 std::uint64_t carry = add_with_carry(c3, c3a, c3);
7908 multiply64(a2, b1, c3, c4, c3);
7909 c4 += carry;
7910 }
7911
mul3x3(std::uint64_t a2,std::uint64_t a1,std::uint64_t a0,std::uint64_t b2,std::uint64_t b1,std::uint64_t b0,std::uint64_t & c5,std::uint64_t & c4,std::uint64_t & c3,std::uint64_t & c2,std::uint64_t & c1,std::uint64_t & c0)7912 inline void mul3x3(std::uint64_t a2, std::uint64_t a1,
7913 std::uint64_t a0,
7914 std::uint64_t b2, std::uint64_t b1, std::uint64_t b0,
7915 std::uint64_t &c5, std::uint64_t &c4, std::uint64_t &c3,
7916 std::uint64_t &c2, std::uint64_t &c1, std::uint64_t &c0)
7917 { std::uint64_t c4a, c3a;
7918 mul2x2(a1, a0, b1, b0, c3, c2, c1, c0);
7919 multiply64(a2, b0, c2, c3a, c2);
7920 std::uint64_t carry = add_with_carry(c3, c3a, c3);
7921 multiply64(a0, b2, c2, c3a, c2);
7922 carry += add_with_carry(c3, c3a, c3);
7923 multiply64(a2, b1, c3, c4, c3);
7924 carry = add_with_carry(c4, carry, c4);
7925 multiply64(a1, b2, c3, c4a, c3);
7926 carry = add_with_carry(c4, c4a, c4);
7927 multiply64(static_cast<std::int64_t>(a2),
7928 static_cast<std::int64_t>(b2), c4, c5, c4);
7929 c5 = static_cast<std::int64_t>(static_cast<std::uint64_t>
7930 (c5) + carry);
7931 }
7932
// As mul3x3 but signed: the top words a2 and b2 are treated as signed
// (each operand is a 2's complement 192-bit value) and c5, the top word
// of the 6-word product, is delivered as a signed value.
inline void mul3x3S(std::uint64_t a2, std::uint64_t a1,
                    std::uint64_t a0,
                    std::uint64_t b2, std::uint64_t b1, std::uint64_t b0,
                    std::int64_t &c5, std::uint64_t &c4, std::uint64_t &c3,
                    std::uint64_t &c2, std::uint64_t &c1, std::uint64_t &c0)
{   std::uint64_t c4a, c3a;
    mul2x2(a1, a0, b1, b0, c3, c2, c1, c0);
    multiply64(a2, b0, c2, c3a, c2);
    std::uint64_t carry = add_with_carry(c3, c3a, c3);
    multiply64(a0, b2, c2, c3a, c2);
    carry += add_with_carry(c3, c3a, c3);
    multiply64(a2, b1, c3, c4, c3);
    carry = add_with_carry(c4, carry, c4);
    multiply64(a1, b2, c3, c4a, c3);
    carry += add_with_carry(c4, c4a, c4);
// The top partial product is formed with a signed multiply so the sign
// of the overall result starts off correct.
    signed_multiply64(static_cast<std::int64_t>(a2),
                      static_cast<std::int64_t>(b2), c4, c5, c4);
    c5 = static_cast<std::int64_t>(static_cast<std::uint64_t>
                                   (c5) + carry);
// The middle partial products above treated a2 and b2 as unsigned.  If
// b2 was really negative that over-counted by {a1,a0}*2^192, so subtract
// a0 at c3 and a1 at c4 with the borrow propagating into c5; similarly
// for a negative a2 with {b1,b0}.  The c5 updates go via std::uint64_t
// to keep the arithmetic free of signed overflow.
    if (negative(b2))
    {   std::uint64_t borrow = subtract_with_borrow(c3, a0, c3);
        borrow = subtract_with_borrow(c4, a1, borrow, c4);
        c5 = static_cast<std::int64_t>(
                 static_cast<std::uint64_t>(c5) - borrow);
    }
    if (negative(a2))
    {   std::uint64_t borrow = subtract_with_borrow(c3, b0, c3);
        borrow = subtract_with_borrow(c4, b1, borrow, c4);
        c5 = static_cast<std::int64_t>(
                 static_cast<std::uint64_t>(c5) - borrow);
    }
}
7965
mul4x4(std::uint64_t a3,std::uint64_t a2,std::uint64_t a1,std::uint64_t a0,std::uint64_t b3,std::uint64_t b2,std::uint64_t b1,std::uint64_t b0,std::uint64_t & c7,std::uint64_t & c6,std::uint64_t & c5,std::uint64_t & c4,std::uint64_t & c3,std::uint64_t & c2,std::uint64_t & c1,std::uint64_t & c0)7966 inline void mul4x4(std::uint64_t a3, std::uint64_t a2,
7967 std::uint64_t a1,
7968 std::uint64_t a0,
7969 std::uint64_t b3, std::uint64_t b2, std::uint64_t b1,
7970 std::uint64_t b0,
7971 std::uint64_t &c7, std::uint64_t &c6, std::uint64_t &c5,
7972 std::uint64_t &c4,
7973 std::uint64_t &c3, std::uint64_t &c2, std::uint64_t &c1,
7974 std::uint64_t &c0)
7975 { std::uint64_t w7, w6, w5a, w5b, w5c, w4a, w4b, w4c,
7976 w3a, w3b, w3c, w2a, w2b, w2c;
7977 mul2x2(a1, a0, b1, b0, w3a, w2a, c1, c0);
7978 mul2x2(a1, a0, b3, b2, w5a, w4a, w3b, w2b);
7979 mul2x2(a3, a2, b1, b0, w5b, w4b, w3c, w2c);
7980 mul2x2(a3, a2, b3, b2, w7, w6, w5c, w4c);
7981 std::uint64_t carry = add_with_carry(w2a, w2b, w2c, c2);
7982 carry = add_with_carry(w3a, w3b, w3c, carry, c3);
7983 carry = add_with_carry(w4a, w4b, w4c, carry, c4);
7984 carry = add_with_carry(w5a, w5b, w5c, carry, c5);
7985 carry = add_with_carry(w6, carry, c6);
7986 c7 = w7 + carry;
7987 }
7988
7989 // c = a*b;
7990
7991 // This forms a product digit by digit.
7992
// c = a*b by classical (schoolbook) multiplication, organised so that
// all partial products contributing to one output digit are accumulated
// before the digit is stored.  {hi,lo} is a 2-word accumulation window
// and carry counts overflows out of its top word, so each column sum is
// held in (up to) 3 words.  c must have room for lena+lenb digits.
inline void classical_multiply(const std::uint64_t *a,
                               std::size_t lena,
                               const std::uint64_t *b, std::size_t lenb,
                               std::uint64_t *c)
{   if (lena < lenb)     // arrange that a is the longer operand.
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// (1) do the lowest degree term as a separate step
    std::uint64_t carry=0, hi, hi1, lo;
    multiply64(b[0], a[0], lo, c[0]);
// Now a sequence of stages where at each the number of terms to
// be combined grows.
    hi = 0;
    for (std::size_t i=1; i<lenb; i++)
    {   carry = 0;
        for (std::size_t j=0; j<=i; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        c[i] = lo;
        lo = hi;         // slide the accumulation window up one digit.
        hi = carry;
    }
// If the two inputs are not the same size I demand that lena>=lenb and
// there may be some slices to compute in the middle here.
// if lena==lenb the following loop does not get executed at all.
    for (std::size_t i=lenb; i<lena; i++)
    {   carry = 0;
        for (std::size_t j=0; j<lenb; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        c[i] = lo;
        lo = hi;
        hi = carry;
    }
// Now I will have some stages where the number of terms to be combined
// gradually decreases. If lenb==2 the following loop is not executed.
    for (std::size_t i=1; i<lenb-1; i++)
    {   carry = 0;
        for (std::size_t j=0; j<lenb-i; j++)
        {   multiply64(b[i+j], a[lena-j-1], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        c[lena+i-1] = lo;
        lo = hi;
        hi = carry;
    }
// Finally the very top term is computed.
    multiply64(b[lenb-1], a[lena-1], lo, hi1, c[lena+lenb-2]);
    c[lena+lenb-1] = hi + hi1;
}
8046
8047
8048 // c = c + a*b. Potentially carry all the way up to lenc.
8049
// c = c + a*b using the same column-by-column scheme as
// classical_multiply above, with each completed digit added into the
// existing contents of c rather than stored.  A final carry can
// propagate all the way up to c[lenc-1].
inline void classical_multiply_and_add(const std::uint64_t *a,
                                       std::size_t lena,
                                       const std::uint64_t *b, std::size_t lenb,
                                       std::uint64_t *c, std::size_t lenc)
{   if (lena < lenb)     // arrange that a is the longer operand.
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// (1) do the lowest degree term as a separate step.  The 5-argument
// multiply64 folds the existing c[0] into the product.
    std::uint64_t carry=0, carry1, hi, hi1, lo;
    multiply64(b[0], a[0], c[0], lo, c[0]);
// Now a sequence of stages where at each the number of terms to
// be combined grows.
    hi = 0;
    for (std::size_t i=1; i<lenb; i++)
    {   carry = 0;
        for (std::size_t j=0; j<=i; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
// Add the completed column into c[i], then fold the resulting carry
// into the window as it slides up a digit.
        carry1 = add_with_carry(c[i], lo, c[i]);
        hi = add_with_carry(hi, carry1, lo) + carry;
    }
// If the two inputs are not the same size I demand that lena>=lenb and
// there may be some slices to compute in the middle here.
    for (std::size_t i=lenb; i<lena;
         i++) // If lenb==lena this loop is not executed
    {   carry = 0;
        for (std::size_t j=0; j<lenb; j++)
        {   multiply64(b[j], a[i-j], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        carry1 = add_with_carry(c[i], lo, c[i]);
        hi = add_with_carry(hi, carry1, lo) + carry;
    }
// Now I will have some stages where the number of terms to be combined
// gradually decreases.
    for (std::size_t i=1; i<lenb-1;
         i++) // If lenb==2 this loop is not executed
    {   carry = 0;
        for (std::size_t j=0; j<lenb-i; j++)
        {   multiply64(b[i+j], a[lena-j-1], lo, hi1, lo);
            carry += add_with_carry(hi, hi1, hi);
        }
        carry1 = add_with_carry(c[lena+i-1], lo, c[lena+i-1]);
        hi = add_with_carry(hi, carry1, lo) + carry;
    }
// Finally the very top term is computed, and any residual carry is
// propagated up through the rest of c.
    multiply64(b[lenb-1], a[lena-1], lo, hi1, lo);
    carry = add_with_carry(c[lena+lenb-2], lo, c[lena+lenb-2]);
    carry = add_with_carry(c[lena+lenb-1], hi+hi1, carry, c[lena+lenb-1]);
    for (std::size_t i=lena+lenb; carry!=0 && i<lenc; i++)
        carry = add_with_carry(c[i], carry, c[i]);
}
8104
8105 // Now variants that use just a single digit first argument. These may be seen
8106 // as optimized cases.
8107
classical_multiply(std::uint64_t a,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c)8108 inline void classical_multiply(std::uint64_t a,
8109 const std::uint64_t *b, std::size_t lenb,
8110 std::uint64_t *c)
8111 { std::uint64_t hi=0;
8112 for (std::size_t j=0; j<lenb; j++)
8113 multiply64(a, b[j], hi, hi, c[j]);
8114 c[lenb] = hi;
8115 }
8116
8117 // c = c + a*b and return any carry.
8118
classical_multiply_and_add(std::uint64_t a,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c,std::size_t lenc)8119 inline void classical_multiply_and_add(std::uint64_t a,
8120 const std::uint64_t *b, std::size_t lenb,
8121 std::uint64_t *c, std::size_t lenc)
8122 { std::uint64_t hi=0, lo;
8123 for (std::size_t j=0; j<lenb; j++)
8124 { multiply64(a, b[j], hi, hi, lo);
8125 hi += add_with_carry(lo, c[j], c[j]);
8126 }
8127 std::uint64_t carry = add_with_carry(hi, c[lenb], c[lenb]);
8128 for (std::size_t i=lenb+1; carry!=0 && i<lenc; i++)
8129 carry = add_with_carry(c[i], carry, c[i]);
8130 }
8131
8132 // I make the variables that indicate when it is worth transitioning from
// classical multiplication to something that is asymptotically faster
8134 // static rather than inline because if different overrides are provided
8135 // via the command line definitions when different source files are
8136 // being processed that could cause linker clashes otherwise.
8137
8138
#if !defined K && !defined K_DEFINED
// I provide a default here but can override it at compile time.
// K is the digit count below which classical multiplication is used.
static const std::size_t K=18;
#define K_DEFINED 1
#endif

// When I have completed and measured things I am liable to make this a
// "const", but for now it is a simple variable so I can tinker with the
// value during testing and tuning.

#ifndef KARATSUBA_CUTOFF
// It may be defined globally as a severe override of what happens here!
static std::size_t KARATSUBA_CUTOFF = K;
#endif

#if !defined K1 && !defined K1_DEFINED
// I provide a default here but can override it at compile time.
// K1 is the digit count above which the top-level (threaded) Karatsuba
// path may be worthwhile.
static const std::size_t K1=170;
#define K1_DEFINED 1
#endif

// When I have completed and measured things I am liable to make this a
// "const", but for now it is a simple variable so I can tinker with the
// value during testing and tuning.

#ifndef PARAKARA_CUTOFF
// It may be defined globally as a severe override of what happens here!
// But also if the current host computer does not support at least three
// genuine concurrent activities I will not try use of threads because it
// would not be helpful!

static std::size_t PARAKARA_CUTOFF =
    std::thread::hardware_concurrency() >= 3 ? K1 : SIZE_MAX;
#endif
8174
// Forward declarations: the Karatsuba routines below and these dispatch
// functions call one another recursively, so the dispatchers are
// declared here and defined later in the file.
inline void small_or_big_multiply(const std::uint64_t *a,
                                  std::size_t lena,
                                  const std::uint64_t *b, std::size_t lenb,
                                  std::uint64_t *c, std::uint64_t *w);

inline void small_or_big_multiply_and_add(const std::uint64_t *a,
                                          std::size_t lena,
                                          const std::uint64_t *b, std::size_t lenb,
                                          std::uint64_t *c, std::size_t lenc,
                                          std::uint64_t *w);
8185
8186 // The key function here multiplies two numbers that are at least almost
8187 // the same length. The cases that can arise here are
8188 // 2n 2n Easy and neat sub-division
8189 // 2n 2n-1 Treat the second number as if it has a padding zero
8190 // 2n-1 2n-1 Treat both numbers as if padded to size 2n
8191 // Observe that if the two numbers have different lengths then the longer
8192 // one is an even length, so the case (eg) 2n+1,2n will not arise.
8193 // This will also only be used if lenb >= KARATSUBA_CUTOFF.
8194
8195 // When one multiplies {a1,a0}*{b1,b0} the three sub-multiplications to
8196 // be performed are
8197 // a1*b1, a0*b0, |a0-a1|*|b0-b1|
8198
8199
// Karatsuba multiplication c = a*b for operands that are the same
// length, or whose lengths are 2n and 2n-1 (see the commentary above).
// w points at workspace for the recursive sub-products.  The halves are
// split at n digits; absdiff() delivers |low-high| for each operand
// and its boolean result gives the sign, so comparing the two results
// tells us the sign of (a1-a0)*(b0-b1), which is the form of the middle
// Karatsuba term.
inline void karatsuba(const std::uint64_t *a, std::size_t lena,
                      const std::uint64_t *b, std::size_t lenb,
                      std::uint64_t *c, std::uint64_t *w)
{   arithlib_assert(lena == lenb ||
                    (lena%2 == 0 && lenb == lena-1));
    arithlib_assert(lena >= 2);
    std::size_t n = (lena+1)/2;    // size of a "half-number"
    std::size_t lenc = lena+lenb;
// lena-n and lenb-n will each be either n or n-1.
    if (absdiff(a, n, a+n, lena-n, w) !=
        absdiff(b, n, b+n, lenb-n, w+n))
    {
// Here I will collect
//    a1*b1    (a1*b0 + b1*a0 - a1*b1 - a0*b0)    a0*b0
// First write the middle part into place.
        small_or_big_multiply(w, n, w+n, n, c+n,
                              w+2*n);    // (a1-a0)*(b0-b1)
// Now I just need to add back in parts of the a1*b1 and a0*b0
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
// First insert the copy at the very top. Part can just be copied because I
// have not yet put anything into c there, the low half then has to be added
// in (and carries could propagate all the way up).
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w[i];
        kadd(w, n, c+2*n, lenc-2*n);
// Now add in the second copy
        kadd(w, lenc-2*n, c+n, lenc-n);
// Now I can deal with the a0*b0.
        small_or_big_multiply(a, n, b, n, w, w+2*n);    // a0*b0
        for (std::size_t i=0; i<n; i++) c[i] = w[i];
        kadd(w+n, n, c+n, lenc-n);
        kadd(w, 2*n, c+n, lenc-n);
    }
    else
    {
// This case is slightly more awkward because the key parts of the middle
// part are negated.
//    a1*b1    (-a1*b0 - b1*a0 + a1*b1 + a0*b0)    a0*b0
        small_or_big_multiply(w, n, w+n, n, c+n,
                              w+2*n);    // (a1-a0)*(b1-b0)
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w[i];
// Now I will do {c3,c2,c1} = {c3,w0,0} - {0,c2,c1) which has a mere negation
// step for the c1 digit, but is otherwise a reverse subtraction. Note I had
// just done c3 = w1 so that first term on the RHS is "really" {w1,w0,0}.
//     c1 = 0 - c1 [and generate borrow]
//     c2 = w0 - c2 - borrow [and generate borrow]
//     c3 = c3 - borrow
        std::uint64_t borrow = 0;
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(0, c[n+i], borrow, c[n+i]);
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(w[i], c[2*n+i], borrow, c[2*n+i]);
        for (std::size_t i=0; i<lenc-3*n && borrow!=0; i++)
            borrow = subtract_with_borrow(c[3*n+i], borrow, c[3*n+i]);
// Now I can proceed as before
        kadd(w, lenc-2*n, c+n, lenc-n);
        small_or_big_multiply(a, n, b, n, w, w+2*n);    // a0*b0
        for (std::size_t i=0; i<n; i++) c[i] = w[i];
        kadd(w+n, n, c+n, lenc-n);
        kadd(w, 2*n, c+n, lenc-n);
    }
}
8262
8263 // The worker_thread() function is started in each of two threads, and
8264 // processes requests until a "quit" request is sent to it.
8265
// The body run in each helper thread.  Every time the thread is
// released it performs one multiplication, as parameterised by the
// fields of its Worker_data, until quit_flag is observed set.
// Synchronisation with the driver uses four mutexes: the worker starts
// by claiming mutexes 2 and 3, then on each cycle blocks acquiring
// mutex[receive_count] as its "start work" signal and releases
// mutex[receive_count^2] to announce "result ready", with
// receive_count advancing 0,1,2,3,0,...  (The driver side of this
// hand-shake — presumably release_workers()/wait_for_workers() — is
// not visible here.)
inline void worker_thread(Worker_data *wd)
{
#ifdef USE_MICROSOFT_MUTEX
    WaitForSingleObject(wd->mutex[2], 0xffffffff);
    WaitForSingleObject(wd->mutex[3], 0xffffffff);
#else
    wd->mutex[2].lock();
    wd->mutex[3].lock();
#endif
    wd->ready = true;    // tell the driver this worker is initialised.
    int receive_count = 0;
    for (;;)
    {
#ifdef USE_MICROSOFT_MUTEX
        WaitForSingleObject(wd->mutex[receive_count], 0xffffffff);
#else
        wd->mutex[receive_count].lock();
#endif
        if (wd->quit_flag) return;
// This is where I do some work!
        small_or_big_multiply(wd->a, wd->lena,
                              wd->b, wd->lenb,
                              wd->c, wd->w);
#ifdef USE_MICROSOFT_MUTEX
        ReleaseMutex(wd->mutex[receive_count^2]);
#else
        wd->mutex[receive_count^2].unlock();
#endif
        receive_count = (receive_count + 1) & 3;
    }
}
8297
// Karatsuba multiplication for a HUGE balanced case, farming the two
// "easy" sub-products (alo*blo and ahi*bhi) out to the two worker
// threads while this thread computes the middle |ahi-alo|*|bhi-blo|
// term.  w is workspace for this thread; w0 and w1 are the output and
// workspace areas handed to the two workers.
inline void top_level_karatsuba(const std::uint64_t *a,
                                std::size_t lena,
                                const std::uint64_t *b, std::size_t lenb,
                                std::uint64_t *c, std::uint64_t *w,
                                std::uint64_t *w0, std::uint64_t *w1)
{
// Here I have a HUGE case and I should use threads!
    arithlib_assert(lena == lenb ||
                    (lena%2 == 0 && lenb == lena-1));
    arithlib_assert(lena >= 2);
    DriverData *driverData = getDriverData();
    std::size_t n = (lena+1)/2;    // size of a "half-number"
    std::size_t lenc = lena+lenb;
// I start by arranging that the two threads that can do things in parallel
// can get access to data from here and that I will be able to retrieve
// results. And that the worker threads have workspace to make use of.
    driverData->wd_0.a = a;        // worker 0 computes alo*blo into w0.
    driverData->wd_0.lena = n;
    driverData->wd_0.b = b;
    driverData->wd_0.lenb = n;
    driverData->wd_0.c = w0;
    driverData->wd_0.w = w0+2*n;

    driverData->wd_1.a = a+n;      // worker 1 computes ahi*bhi into w1.
    driverData->wd_1.lena = lena-n;
    driverData->wd_1.b = b+n;
    driverData->wd_1.lenb = lenb-n;
    driverData->wd_1.c = w1;
    driverData->wd_1.w = w1+2*n;

// Now trigger the two threads to do some work for me. One will be forming
// alo*blo while the other computes ahi*bhi .
    driverData->release_workers();
// Now I will work on either |ahi-alo|*|bhi-blo|
// lena-n and lenb-n will each be either n or n-1.
    bool signs_differ = absdiff(a, n, a+n, lena-n, w) !=
                        absdiff(b, n, b+n, lenb-n, w+n);
    small_or_big_multiply(w, n, w+n, n, c+n,
                          w+2*n);    // (a1-a0)*(b0-b1)
// That has the product of differences written into the middle of c.
    driverData->wait_for_workers();
    if (signs_differ)
    {
// Here I have
//     w0:                 ;                               ; alo*blo [2*n]
//     w1: ahi*bhi [lenc-2*n]                              ;   ;
//     c:  ?[lenc-n];        (ahi-alo)*(blo-bhi)[2n]       ; ?[n]
//      =  ?;        ahi*blo+bhi*alo -ahi*bhi-alo*blo      ; ?[n]
// so I need to add bits from w0 and w1 into c.
//
// First deal with ahi*bhi. The top half can be copied into the very top of
// the result, then I add in the bottom half.
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w1[i];
        kadd(w1, n, c+2*n, lenc-2*n);
// Now add in the second copy of ahi*bhi
        kadd(w1, lenc-2*n, c+n, lenc-n);
// Now something similar with alo*blo
        for (std::size_t i=0; i<n; i++) c[i] = w0[i];
        kadd(w0+n, n, c+n, lenc-n);
        kadd(w0, 2*n, c+n, lenc-n);
    }
    else
    {
// This case is slightly more awkward because the key parts of the middle
// part are negated.
//    a1*b1    (-a1*b0 - b1*a0 + a1*b1 + a0*b0)    a0*b0
// Call the desired result {c3,c2,c1,c0} then the middle product is in
// {c2,c1} and I can copy the top half of ahi*bhi into c3.
        for (std::size_t i=n; i<lenc-2*n; i++) c[2*n+i] = w1[i];
// Now I will do {c3,c2,c1} = {c3,low(ahi*bhi),0} - {0,c2,c1) which has a
// mere negation step for the c1 digit, but is otherwise a reverse
// subtraction. Note I had just done c3 = high(ahi*bhi) so that first term on
// the RHS is "really" {{ahi*bhi},0}.
//     c1 = 0 - c1 [and generate borrow]
//     c2 = low(w0) - c2 - borrow [and generate borrow]
//     c3 = c3 - borrow
        std::uint64_t borrow = 0;
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(0, c[n+i], borrow, c[n+i]);
        for (std::size_t i=0; i<n; i++)
            borrow = subtract_with_borrow(w1[i], c[2*n+i], borrow, c[2*n+i]);
        for (std::size_t i=0; i<lenc-3*n && borrow!=0; i++)
            borrow = subtract_with_borrow(c[3*n+i], borrow, c[3*n+i]);
        kadd(w1, lenc-2*n, c+n, lenc-n);
        for (std::size_t i=0; i<n; i++) c[i] = w0[i];
        kadd(w0+n, n, c+n, lenc-n);
        kadd(w0, 2*n, c+n, lenc-n);
    }
}
8387
// c = c + a*b by Karatsuba, for the same (nearly) balanced length
// combinations as karatsuba() above.  c is lenc digits long and carries
// may propagate right up to the top of it.  w is workspace.
inline void karatsuba_and_add(const std::uint64_t *a,
                              std::size_t lena,
                              const std::uint64_t *b, std::size_t lenb,
                              std::uint64_t *c, std::size_t lenc, std::uint64_t *w)
{   arithlib_assert(lena == lenb ||
                    (lena%2 == 0 && lenb == lena-1));
    arithlib_assert(lena >= 2);
    std::size_t n = (lena+1)/2;    // size of a "half-number"
    std::size_t lenc1 = lena+lenb; // number of digits the product itself has.
    if (absdiff(a, n, a+n, lena-n, w) !=
        absdiff(b, n, b+n, lenb-n, w+n))
    {
// Here I will collect
//    a1*b1    (a1*b0 + b1*a0 - a1*b1 - a0*b0)    a0*b0
        small_or_big_multiply_and_add(w, n, w+n, n, c+n, lenc-n, w+2*n);
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
        kadd(w, lenc1-2*n, c+2*n, lenc-2*n);
        kadd(w, lenc1-2*n, c+n, lenc-n);
        small_or_big_multiply(a, n, b, n, w, w+2*n);    // a0*b0
        kadd(w, 2*n, c, lenc);
        kadd(w, 2*n, c+n, lenc-n);
    }
    else
    {
// This case is slightly more awkward because the key parts of the middle
// part are negated.
//    a1*b1    (-a1*b0 - b1*a0 + a1*b1 + a0*b0)    a0*b0
// To perform c=c-w; I go c=~c; c=c+w; c=~c; [that is a NOT rather than
// a MINUS there!].  The bitwise complement turns the subtraction of the
// middle product into an addition.
        for (std::size_t i=n; i<lenc; i++) c[i] = ~c[i];
        small_or_big_multiply_and_add(w, n, w+n, n, c+n, lenc-n, w+2*n);
        for (std::size_t i=n; i<lenc; i++) c[i] = ~c[i];
        small_or_big_multiply(a+n, lena-n, b+n, lenb-n, w, w+2*n); // a1*b1
        kadd(w, lenc1-2*n, c+2*n, lenc-2*n);
        kadd(w, lenc1-2*n, c+n, lenc-n);
        small_or_big_multiply(a, n, b, n, w, w+2*n);    // a0*b0
        kadd(w, 2*n, c, lenc);
        kadd(w, 2*n, c+n, lenc-n);
    }
}
8428
8429 // Here both inputs are of size at least KARATSUBA_CUTOFF. If their sizes
8430 // match exactly I can use Karatsuba directly. I take the view that if
// the two are of sizes (2n) and (2n-1) then I will also use Karatsuba
8432 // directly (treating the shorter input as if it had an initial zero padding
8433 // digit). In all other cases I need to do a sequence of multiplications
8434 // rather along the lines of "short" multiplication treating the size of the
8435 // smaller operand as the digit size.
8436
// c = a*b where both operands are at least KARATSUBA_CUTOFF digits.
// Balanced (or nearly balanced) cases go straight to karatsuba();
// unbalanced ones are decomposed into a sequence of balanced
// multiplications, treating the shorter operand's (rounded-up) length
// as the "digit" size.  w is workspace.
inline void certainly_big_multiply(const std::uint64_t *a,
                                   std::size_t lena,
                                   const std::uint64_t *b, std::size_t lenb,
                                   std::uint64_t *c, std::uint64_t *w)
{   if (lena == lenb)
    {   karatsuba(a, lena, b, lenb, c, w);
        return;
    }
    if (lena < lenb)
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// Now b is the shorter operand. The case (2n)*(2n-1) will be handled
// using Karatsuba merely by treating the smaller number as if padded with
// a leading zero.
    if (lena%2==0 && lenb==lena-1)
    {   karatsuba(a, lena, b, lenb, c, w);
        return;
    }
// If the two inputs are unbalanced in length I will perform multiple
// balanced operations each of which can be handled specially. I will
// try to make each subsidiary multiplication as big as possible.
// This will be lenb rounded up to an even number.
// I will be willing to do chunks that are of an even size that is
// either lenb or lenb+1.
    std::size_t len = lenb + (lenb & 1);
    const std::uint64_t *a1 = a;
    std::uint64_t *c1 = c;
    std::size_t lena1 = lena;
// Multiply-and-add will be (slightly) more expensive than just Multiply,
// so I do a sequence of multiplications where their outputs will not overlap
// first, and then do the interleaved multiplications adding in.
    for (;;)
    {
// I may have rounded the size of b up by 1, and if I have I would generate
// 2*len-1 digits not 2*len and hence risk leaving a 1-word gap between filled
// in data. I zero that here to avoid trouble. However I must not do this
// for if the multiplication I am about to do will write in the very top
// digits of the final answer, because if I did that would be a sort of
// buffer overrun.
        if (len < lena1) c1[2*len-1] = 0;
        karatsuba(a1, len, b, lenb, c1, w);
        c1 += 2*len;
// I will keep going provided the next multiplication I will do will fully fit.
        if (lena1 < 3*len) break;
        a1 += 2*len;
        lena1 -= 2*len;
    }
    if (lena1 > 2*len)
    {   a1 += 2*len;
        lena1 -= 2*len;
// Do a shorter nice Multiply (without Add) to reach the end of input a.
        small_or_big_multiply(a1, lena1, b, lenb, c1, w);
    }
    else if (lena1!=len)
    {
// I may need to pad with zeros when the top digit to be generated will be
// put there using multiply_and_add.
        for (std::size_t i=c1-c; i<lena+lenb; i++) c[i] = 0;
    }
// Now I need to do much the same for the odd numbered digits of a, but
// adding the products in rather than writing them into place.
    a1 = a + len;
    c1 = c + len;
    std::size_t lenc1 = lena+lenb-len;
// I know that I had lena>lenb at the start. This means I have at
// least a part block to process here, but there is no guarantee that
// I have a whole one.
    lena1 = lena - len;
    for (;;)
    {   if (lena1 < len) break;
        karatsuba_and_add(a1, len, b, lenb, c1, lenc1, w);
        if (lena1 <= 2*len)
        {   lena1 = 0;
            break;
        }
        c1 += 2*len;
        lenc1 -= 2*len;
        a1 += 2*len;
        lena1 -= 2*len;
    }
    if (lena1!=0)
        small_or_big_multiply_and_add(a1, lena1, b, lenb, c1, lenc1, w);
}
8521
// As certainly_big_multiply but for the top-level HUGE case: the
// balanced sub-multiplications go through top_level_karatsuba so that
// the two worker threads are used.  w is this thread's workspace and
// w0/w1 are the buffers handed to the workers.
inline void top_level_certainly_big_multiply(const std::uint64_t *a,
                                             std::size_t lena,
                                             const std::uint64_t *b, std::size_t lenb,
                                             std::uint64_t *c, std::uint64_t *w,
                                             std::uint64_t *w0, std::uint64_t *w1)
{   if (lena == lenb)
    {   top_level_karatsuba(a, lena, b, lenb, c, w, w0, w1);
        return;
    }
    if (lena < lenb)
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// Now b is the shorter operand. The case (2n)*(2n-1) will be handled
// using Karatsuba merely by treating the smaller number as if padded with
// a leading zero.
    if (lena%2==0 && lenb==lena-1)
    {   top_level_karatsuba(a, lena, b, lenb, c, w, w0, w1);
        return;
    }
// If the two inputs are unbalanced in length I will perform multiple
// balanced operations each of which can be handled specially. I will
// try to make each subsidiary multiplication as big as possible.
// This will be lenb rounded up to an even number.
// I will be willing to do chunks that are of an even size that is
// either lenb or lenb+1.
    std::size_t len = lenb + (lenb & 1);
    const std::uint64_t *a1 = a;
    std::uint64_t *c1 = c;
    std::size_t lena1 = lena;
// Multiply-and-add will be (slightly) more expensive than just Multiply,
// so I do a sequence of multiplications where their outputs will not overlap
// first, and then do the interleaved multiplications adding in.
    for (;;)
    {
// I may have rounded the size of b up by 1, and if I have I would generate
// 2*len-1 digits not 2*len and hence risk leaving a 1-word gap between filled
// in data. I zero that here to avoid trouble. However I must not do this
// for if the multiplication I am about to do will write in the very top
// digits of the final answer, because if I did that would be a sort of
// buffer overrun.
        if (len < lena1) c1[2*len-1] = 0;
        top_level_karatsuba(a1, len, b, lenb, c1, w, w0, w1);
        c1 += 2*len;
// I will keep going provided the next multiplication I will do will fully fit.
        if (lena1 < 3*len) break;
        a1 += 2*len;
        lena1 -= 2*len;
    }
    if (lena1 > 2*len)
    {   a1 += 2*len;
        lena1 -= 2*len;
// Do a shorter nice Multiply (without Add) to reach the end of input a.
        small_or_big_multiply(a1, lena1, b, lenb, c1, w);
    }
    else if (lena1!=len)
    {
// I may need to pad with zeros when the top digit to be generated will be
// put there using multiply_and_add.
        for (std::size_t i=c1-c; i<lena+lenb; i++) c[i] = 0;
    }
// Now I need to do much the same for the odd numbered digits of a, but
// adding the products in rather than writing them into place.  These
// passes use the single-threaded karatsuba_and_add.
    a1 = a + len;
    c1 = c + len;
    std::size_t lenc1 = lena+lenb-len;
// I know that I had lena>lenb at the start. This means I have at
// least a part block to process here, but there is no guarantee that
// I have a whole one.
    lena1 = lena - len;
    for (;;)
    {   if (lena1 < len) break;
        karatsuba_and_add(a1, len, b, lenb, c1, lenc1, w);
        if (lena1 <= 2*len)
        {   lena1 = 0;
            break;
        }
        c1 += 2*len;
        lenc1 -= 2*len;
        a1 += 2*len;
        lena1 -= 2*len;
    }
    if (lena1!=0)
        small_or_big_multiply_and_add(a1, lena1, b, lenb, c1, lenc1, w);
}
8607
// c = c + a*b where both operands are at least KARATSUBA_CUTOFF digits
// and c is lenc digits long.  Balanced cases go straight to
// karatsuba_and_add; unbalanced ones are chopped into balanced chunks.
// Because everything here adds into c, the chunks can simply step along
// by len with no interleaving concerns.  w is workspace.
inline void certainly_big_multiply_and_add(const std::uint64_t *a,
                                           std::size_t lena,
                                           const std::uint64_t *b, std::size_t lenb,
                                           std::uint64_t *c, std::size_t lenc,
                                           std::uint64_t *w)
{   if (lena == lenb)
    {   karatsuba_and_add(a, lena, b, lenb, c, lenc, w);
        return;
    }
    if (lena < lenb)
    {   std::swap(a, b);
        std::swap(lena, lenb);
    }
// Now b is the shorter operand. The case (2n)*(2n-1) will be handled
// using Karatsuba merely by treating the smaller number as if padded with
// a leading zero.
    if (lena%2==0 && lenb==lena-1)
    {   karatsuba_and_add(a, lena, b, lenb, c, lenc, w);
        return;
    }
// If the two inputs are unbalanced in length I will perform multiple
// balanced operations each of which can be handled specially. I will
// try to make each subsidiary multiplication as big as possible.
// This will be lenb rounded up to an even number.
// I will be willing to do chunks that are of an even size that is
// either lenb or lenb+1.
    std::size_t len = lenb + (lenb & 1);
    const std::uint64_t *a1 = a;
    std::uint64_t *c1 = c;
    std::size_t lena1 = lena, lenc1 = lenc;
// because this is "certainly big" I know I can do at least one
// Karatsuba stage.
    for (;;)
    {   karatsuba_and_add(a1, len, b, lenb, c1, lenc1, w);
        c1 += len;
        lenc1 -= len;
        a1 += len;
        lena1 -= len;
// I will keep going provided the next multiplication I will do will fully fit.
        if (lena1 < len) break;
    }
// Do a shorter nice Multiply (without Add) to reach the end of input a.
    if (lena1 != 0)
        small_or_big_multiply_and_add(a1, lena1, b, lenb, c1, lenc1, w);
}
8653
8654 // I am going to hope that the compiler turns this into a tail-call to
8655 // either certainly_big_multiply or classical_multiply with very
8656 // little overhead.
8657
small_or_big_multiply(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c,std::uint64_t * w)8658 inline void small_or_big_multiply(const std::uint64_t *a,
8659 std::size_t lena,
8660 const std::uint64_t *b, std::size_t lenb,
8661 std::uint64_t *c, std::uint64_t *w)
8662 { if (lena < KARATSUBA_CUTOFF || lenb < KARATSUBA_CUTOFF)
8663 { if (lena==1) classical_multiply(a[0], b, lenb, c);
8664 else if (lenb==1) classical_multiply(b[0], a, lena, c);
8665 else classical_multiply(a, lena, b, lenb, c);
8666 }
8667 else certainly_big_multiply(a, lena, b, lenb, c, w);
8668 }
8669
small_or_big_multiply_and_add(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c,std::size_t lenc,std::uint64_t * w)8670 inline void small_or_big_multiply_and_add(const std::uint64_t *a,
8671 std::size_t lena,
8672 const std::uint64_t *b, std::size_t lenb,
8673 std::uint64_t *c, std::size_t lenc,
8674 std::uint64_t *w)
8675 { if (lena < KARATSUBA_CUTOFF || lenb < KARATSUBA_CUTOFF)
8676 { if (lena==1) classical_multiply_and_add(a[0], b, lenb, c, lenc);
8677 else if (lenb==1) classical_multiply_and_add(b[0], a, lena, c, lenc);
8678 else classical_multiply_and_add(a, lena, b, lenb, c, lenc);
8679 }
8680 else certainly_big_multiply_and_add(a, lena, b, lenb, c, lenc, w);
8681 }
8682
8683 // FIXED_LENGTH_LIMIT: Can multiply inputs with up to this number of
8684 // 64-bit digits using the fixed workspace.
8685 // WORKSPACE_SIZE: Length of the "w" work-vector needed for the above
8686 // which is a bit over twice the length of the inputs.
8687 //
8688 // These need to be such that it is OK to put three arrays of length
8689 // KARA_WORKSPACE_SIZE*sizeof(uint64_t) on the stack without that feeling
8690 // embarrassing. The settings I use here can use around 50 Kbytes of stack.
8691
INLINE_VAR const std::size_t KARA_FIXED_LENGTH_LIMIT = 1000;  // max input digits for stack workspace
INLINE_VAR const std::size_t KARA_WORKSPACE_SIZE = 2050;      // words in each stack scratch array
8694
8695 // These two functions allocate workspace for Karatsuba on the stack and
8696 // are called when the inputs are short enough for that to feel reasonable.
8697
allocate_one_array(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c)8698 inline void allocate_one_array(const std::uint64_t *a,
8699 std::size_t lena,
8700 const std::uint64_t *b, std::size_t lenb,
8701 std::uint64_t *c)
8702 { std::uint64_t kara_workspace[KARA_WORKSPACE_SIZE];
8703 certainly_big_multiply(a, lena, b, lenb, c, kara_workspace);
8704 }
8705
allocate_three_arrays(const std::uint64_t * a,std::size_t lena,const std::uint64_t * b,std::size_t lenb,std::uint64_t * c)8706 inline void allocate_three_arrays(const std::uint64_t *a,
8707 std::size_t lena,
8708 const std::uint64_t *b, std::size_t lenb,
8709 std::uint64_t *c)
8710 { std::uint64_t kara_workspace[KARA_WORKSPACE_SIZE];
8711 std::uint64_t kara_workspace0[KARA_WORKSPACE_SIZE];
8712 std::uint64_t kara_workspace1[KARA_WORKSPACE_SIZE];
8713 top_level_certainly_big_multiply(a, lena, b, lenb, c,
8714 kara_workspace, kara_workspace0, kara_workspace1);
8715 }
8716
// Pack a pair of small operand lengths (1..4 each) into one integer so
// that a switch statement can dispatch on the (lena, lenb) combination.
inline constexpr int BY(int m, int n)
{ return 4*n + m;
}
8720
8721 // Finally I can provide the top-level entrypoint that accepts signed
8722 // integers that may not be the same size.
8723
8724 // This is the main entrypoint to the integer multiplication code. It
8725 // takes two signed numbers and forms their product.
8726
// This is the main entrypoint to the integer multiplication code. It
// takes two signed (2s complement) numbers and forms their product into
// c, reporting the trimmed result length through lenc. c must provide
// lena+lenb digits.
inline void bigmultiply(const std::uint64_t *a, std::size_t lena,
                        const std::uint64_t *b, std::size_t lenb,
                        std::uint64_t *c, std::size_t &lenc)
{
// If a and/or b are negative then I can treat their true values as
//    a = sa + va      b = sb + vb
// where sa and sb and the signs - represented here as 0 for a positive
// number and -2^(64*len) for a negative one. va and vb are then the simple
// bit-patterns for a and b but now interpreted as unsigned values. So if
// instead of using 64-bit digits I was using 8 bit ones, the value -3
// would be stored as 0xfd and that would be split up as -128 + 253.
// Then a*b = sa*sb + sa*vb + sb*va + va*vb.
// The last item there is just the product of a and b when treated as
// unsigned values, and so is what I compute first here rather simply.
// If sa and/or sb is non-zero it is just the negative of a power of 2^64,
// and so I can correct the unsigned product into a signed one by (sometimes)
// subtracting a shifted version of a or b from it.
// If both arguments are tiny I write out the code in-line. The timings I
// have taken suggest that this makes a significant difference to costs, and
// I view it as plausible that "rather small" cases will often dominate.
    if (lena <= 4 && lenb <= 4) switch (lena + 4*lenb)
        {
// Length 2 result
            case BY(1, 1):
            { std::int64_t c1;
              std::uint64_t c0;
              signed_multiply64(a[0], b[0], c1, c0);
              c[0] = c0;
              if (shrinkable(c1, c0)) lenc = 1;
              else
              { c[1] = static_cast<std::uint64_t>(c1);
                lenc = 2;
              }
              return;
            }

// Length 3 result
            case BY(1, 2):
                std::swap(a, b);
            // drop through.
            case BY(2, 1):
            { if (b[0]==0)
              { c[0] = 0;
                lenc = 1;
                return;
              }
              std::int64_t c2;
              std::uint64_t c1, c0;
              multiply64(b[0], a[0], c1, c0);
              c[0] = c0;
              signed_multiply64(b[0], a[1], c1, c2, c1);
// If b was negative its unsigned interpretation was 2^64 too large, so
// subtract a (shifted one digit) to correct the sign.
              if (negative(b[0]))
                  c2 = static_cast<std::int64_t>(
                           static_cast<std::uint64_t>(c2) -
                           subtract_with_borrow(c1, a[0], c1));
              c[1] = c1;
// If you have an input like 2^63 it will be represented as a 2-word
// bignum {0,2^63} so that the most significant bit of the most significant
// digits is 0 to show that it is positive. If you multiply two such numbers
// the direct result you get is {0, 0, 2^62, 0} and the top non-zero digit
// now does not have its top bit set. So TWO words can be trimmed from the
// top. This issue may not arise in the length-1 by length-2 case here, but
// I leave the test in to feel safe and tidy.
              if (shrinkable(c2, c1))
              { if (shrinkable(c1, c0)) lenc = 1;
                else lenc = 2;
              }
              else
              { c[2] = static_cast<std::uint64_t>(c2);
                lenc = 3;
              }
              return;
            }

// Length 4 result
            case BY(2, 2):
            { std::int64_t c3;
              std::uint64_t c2, c1;
              mul2x2S(static_cast<std::int64_t>(a[1]), a[0],
                      static_cast<std::int64_t>(b[1]), b[0],
                      c3, c2, c1, c[0]);
              c[1] = c1;
              c[2] = c2;
              if (shrinkable(c3, c2))
              { if (shrinkable(c2, c1)) lenc = 2;
                else lenc = 3;
              }
              else
              { c[3] = static_cast<std::uint64_t>(c3);
                lenc = 4;
              }
              return;
            }

            case BY(1, 3):
                std::swap(a, b);
            // drop through.
            case BY(3, 1):
            { if (b[0]==0)
              { c[0] = 0;
                lenc = 1;
                return;
              }
              std::int64_t c3;
              std::uint64_t c2, c1;
              multiply64(b[0], a[0], c1, c[0]);
              multiply64(b[0], a[1], c1, c2, c1);
              signed_multiply64(b[0], a[2], c2, c3, c2);
// Sign correction when the single-digit operand was negative: subtract
// the other operand shifted up by one digit.
              if (negative(b[0]))
              { std::uint64_t borrow = subtract_with_borrow(c1, a[0], c1);
                borrow = subtract_with_borrow(c2, a[1], borrow, c2);
                c3 = static_cast<std::int64_t>(
                         static_cast<std::uint64_t>(c3) - borrow);
              }
              c[1] = c1;
              c[2] = c2;
              if (shrinkable(c3, c2))
              { if (shrinkable(c2, c1)) lenc = 2;
                else lenc = 3;
              }
              else
              { c[3] = static_cast<std::uint64_t>(c3);
                lenc = 4;
              }
              return;
            }

// Length 5 result
            case BY(1, 4):
                std::swap(a, b);
            // drop through.
            case BY(4, 1):
            { if (b[0]==0)
              { c[0] = 0;
                lenc = 1;
                return;
              }
              std::int64_t c4;
              std::uint64_t c3, c2, c1;
              multiply64(b[0], a[0], c1, c[0]);
              multiply64(b[0], a[1], c1, c2, c1);
              multiply64(b[0], a[2], c2, c3, c2);
              signed_multiply64(b[0], a[3], c3, c4, c3);
              if (negative(b[0]))
              { std::uint64_t borrow = subtract_with_borrow(c1, a[0], c1);
                borrow = subtract_with_borrow(c2, a[1], borrow, c2);
                borrow = subtract_with_borrow(c3, a[2], borrow, c3);
                c4 = static_cast<std::int64_t>(
                         static_cast<std::uint64_t>(c4) - borrow);
              }
              c[1] = c1;
              c[2] = c2;
              c[3] = c3;
              if (shrinkable(c4, c3))
              { if (shrinkable(c3, c2)) lenc = 3;
                else lenc = 4;
              }
              else
              { c[4] = static_cast<std::uint64_t>(c4);
                lenc = 5;
              }
              return;
            }

            case BY(2, 3):
                std::swap(a, b);
            // drop through.
            case BY(3, 2):
            { std::int64_t c4;
              std::uint64_t c3, c3a, c2;
// 2x2 low part, then fold in the contributions from a[2].
              mul2x2(a[1], a[0], b[1], b[0],
                     c3, c2, c[1], c[0]);
              multiply64(a[2], b[0], c2, c3a, c2);
              std::uint64_t carry = add_with_carry(c3, c3a, c3);
              signed_multiply64(static_cast<std::int64_t>(a[2]),
                                static_cast<std::int64_t>(b[1]), c3, c4, c3);
              c4 = static_cast<std::int64_t>(
                       static_cast<std::uint64_t>(c4) + carry);
// Corrections for each operand that was negative.
              if (negative(b[1]))
              { std::uint64_t borrow = subtract_with_borrow(c2, a[0], c2);
                borrow = subtract_with_borrow(c3, a[1], borrow, c3);
                c4 = static_cast<std::int64_t>(
                         static_cast<std::uint64_t>(c4) - borrow);
              }
              if (negative(a[2]))
              { std::uint64_t borrow = subtract_with_borrow(c3, b[0], c3);
                c4 = static_cast<std::int64_t>(
                         static_cast<std::uint64_t>(c4) - borrow);
              }
              c[2] = c2;
              c[3] = c3;
              if (shrinkable(c4, c3))
              { if (shrinkable(c3, c2)) lenc = 3;
                else lenc = 4;
              }
              else
              { c[4] = static_cast<std::uint64_t>(c4);
                lenc = 5;
              }
              return;
            }

// Length 6 results
            case BY(2, 4):
            case BY(4, 2):
// I do not have to implement all the cases here - any that I either choose
// not to or have not got around to can merely go "break;" and join in the
// generic path.
                break;

            case BY(3, 3):
            { std::int64_t c5;
              std::uint64_t c4, c3, c2, c1;
              mul3x3S(a[2], a[1], a[0], b[2], b[1], b[0],
                      c5, c4, c3, c2, c1, c[0]);
              c[1] = c1;
              c[2] = c2;
              c[3] = c3;
              c[4] = c4;
              if (shrinkable(c5, c4))
              { if (shrinkable(c4, c3)) lenc = 4;
                else lenc = 5;
              }
              else
              { c[5] = static_cast<std::uint64_t>(c5);
                lenc = 6;
              }
              return;
            }

// Length 7 results
            case BY(3, 4):
            case BY(4, 3):
// As above, cases that have not been coded here do not cause failure,
// they just lead to that case being handled by the general (loopy) code.
                break;


// Length 8 result
            case BY(4, 4):
            { std::uint64_t c7, c6, c5, c4;
              mul4x4(a[3], a[2], a[1], a[0],
                     b[3], b[2], b[1], b[0],
                     c7, c6, c5, c4, c[3], c[2], c[1], c[0]);
// mul4x4 is unsigned: apply a signed correction for each negative input.
              if (negative(a[3]))
              { std::uint64_t borrow = subtract_with_borrow(c4, b[0], c4);
                borrow = subtract_with_borrow(c5, b[1], borrow, c5);
                borrow = subtract_with_borrow(c6, b[2], borrow, c6);
                c7 = static_cast<std::int64_t>(
                         static_cast<std::uint64_t>(c7) - b[3] - borrow);
              }
              if (negative(b[3]))
              { std::uint64_t borrow = subtract_with_borrow(c4, a[0], c4);
                borrow = subtract_with_borrow(c5, a[1], borrow, c5);
                borrow = subtract_with_borrow(c6, a[2], borrow, c6);
                c7 = static_cast<std::int64_t>(
                         static_cast<std::uint64_t>(c7) - a[3] - borrow);
              }
              c[4] = c4;
              c[5] = c5;
              c[6] = c6;
              if (shrinkable(c7, c6))
              { if (shrinkable(c6, c5)) lenc = 6;
                else lenc = 7;
              }
              else
              { c[7] = static_cast<std::uint64_t>(c7);
                lenc = 8;
              }
              return;
            }

            default:
// The default label should never be activated!
                ;
        }

// If the smaller input is reasonably small I will merely use classical
// multiplication.
// It is necessary to make a special case for multiplication by a 1-word
// bignum for two reasons. (a) multiplication by zero yields a zero result
// regardless of the magnitude of the second operand, and (b) one of
// my implementations of classical_multiplication must be called in
// a separate overloaded version to multiply by just one digit.
    if (lena < KARATSUBA_CUTOFF || lenb < KARATSUBA_CUTOFF)
    { if (lena==1)
      { if (a[0]==0)
        { c[0] = 0;
          lenc = 1;
          return;
        }
        else classical_multiply(a[0], b, lenb, c);
      }
      else if (lenb==1)
      { if (b[0]==0)
        { c[0] = 0;
          lenc = 1;
          return;
        }
        else classical_multiply(b[0], a, lena, c);
      }
      else classical_multiply(a, lena, b, lenb, c);
// I do NOT return here because for a non-zero result I will need to adjust
// if one or both of the input numbers were negative.
    }
    else
    {
// For many smaller cases I will just use some static pre-allocated workspace
// and hence avoid potential storage management overheads.
        if (lena <= KARA_FIXED_LENGTH_LIMIT ||
            lenb <= KARA_FIXED_LENGTH_LIMIT)
        { if (lena < PARAKARA_CUTOFF ||
              lenb < PARAKARA_CUTOFF)
              allocate_one_array(a, lena, b, lenb, c);
          else allocate_three_arrays(a, lena, b, lenb, c);
        }
        else
        { push(a); push(b);
          std::size_t lenw;
          if (lena < lenb) lenw = lena;
          else lenw = lenb;
          for (std::size_t i=lenw; i>8; i=i/2) lenw++;
// I give myself workspace as long as the shorter input + log of that. The
// extra logarithmic bit is because each time I split a number into its top
// and bottom parts I may have an odd number and so the workspace needed
// gets rounded up by a constant amount for each level of division.
          std::uint64_t *w = reserve(2*lenw);
          if (lena < PARAKARA_CUTOFF ||
              lenb < PARAKARA_CUTOFF)
          { pop(b); pop(a);
            certainly_big_multiply(a, lena, b, lenb, c, w);
          }
          else
          { std::uint64_t *w0 = reserve(4*lenw);
            std::uint64_t *w1 = reserve(4*lenw);
            pop(b); pop(a);
            top_level_certainly_big_multiply(a, lena, b, lenb, c,
                                             w, w0, w1);
            abandon(w1);
            abandon(w0);
          }
          abandon(w);
        }
    }
// Now adapt for the situation where one or both of the inputs had been
// negative.
    if (negative(a[lena-1]))
    { std::uint64_t borrow = 0;
      for (std::size_t i=0; i<lenb; i++)
          borrow = subtract_with_borrow(c[i+lena], b[i], borrow, c[i+lena]);
    }
    if (negative(b[lenb-1]))
    { std::uint64_t borrow = 0;
      for (std::size_t i=0; i<lena; i++)
          borrow = subtract_with_borrow(c[i+lenb], a[i], borrow, c[i+lenb]);
    }
// The actual value may be 1 or 2 words shorter than this. So test the top
// digit of c and if necessary reduce lenc.
// Also note that the pending result is at least of length 2 here because
// various small cases had been processed in-line earlier.
// eg {0, 0x8000000000000000} times itself is {0, 0, 0x4000000000000000, 0}
// and both leading zeros can be trimmed.
    lenc = lena + lenb;
    if (lenc > 1 && shrinkable(c[lenc-1], c[lenc-2]))
    { lenc--;
      if (lenc > 1 && shrinkable(c[lenc-1], c[lenc-2])) lenc--;
    }
}
9093
9094 //===========================================================================
9095 //===========================================================================
9096
op(std::uint64_t * a,std::uint64_t * b)9097 inline std::intptr_t Times::op(std::uint64_t *a, std::uint64_t *b)
9098 { std::size_t lena = number_size(a);
9099 std::size_t lenb = number_size(b);
9100 std::size_t n = lena+lenb;
9101 push(a); push(b);
9102 std::uint64_t *p = reserve(n);
9103 pop(b); pop(a);
9104 std::size_t final_n;
9105 // I might like to optimise the 2*2 case here or even 2*3 and 3*3?
9106 bigmultiply(a, lena, b, lenb, p, final_n);
9107 return confirm_size(p, n, final_n);
9108 }
9109
op(std::int64_t a,std::int64_t b)9110 inline std::intptr_t Times::op(std::int64_t a, std::int64_t b)
9111 { std::int64_t hi;
9112 std::uint64_t lo;
9113 signed_multiply64(a, b, hi, lo);
9114 if ((hi==0 && positive(lo)) ||
9115 (hi==-1 && negative(lo)))
9116 { if (fits_into_fixnum(static_cast<std::int64_t>(lo)))
9117 return int_to_handle(static_cast<std::int64_t>(lo));
9118 std::uint64_t *r = reserve(1);
9119 r[0] = lo;
9120 return confirm_size(r, 1, 1);
9121 }
9122 std::uint64_t *r = reserve(2);
9123 r[0] = lo;
9124 r[1] = hi;
9125 return confirm_size(r, 2, 2);
9126 }
9127
// Multiply a machine integer a by a bignum b. The result needs at most
// lenb+1 digits.
inline std::intptr_t Times::op(std::int64_t a, std::uint64_t *b)
{ std::size_t lenb = number_size(b);
  push(b);
  std::uint64_t *c = reserve(lenb+1);
  pop(b);
// First the plain unsigned product of the two bit-patterns.
  std::uint64_t hi = 0;
  for (std::size_t i=0; i<lenb; i++)
      multiply64(a, b[i], hi, hi, c[i]);
  c[lenb] = hi;
// If a was negative its unsigned interpretation was 2^64 too big, so the
// product is too big by b*2^64: subtract b shifted up one digit, done
// here as adding ~b with an initial carry of 1.
  if (negative(a))
  { std::uint64_t carry = 1;
    for (std::size_t i=0; i<lenb; i++)
        carry = add_with_carry(c[i+1], ~b[i], carry, c[i+1]);
  }
// Similarly if b was negative, subtract a shifted up by lenb digits.
  if (negative(b[lenb-1])) c[lenb] -= a;
  std::size_t lenc = lenb+1;
// Trim any redundant leading digit from the 2s complement result.
  truncate_positive(c, lenc);
  truncate_negative(c, lenc);
  return confirm_size(c, lenb+1, lenc);
}
9148
// Multiplication is commutative, so bignum*machine-integer just swaps
// the arguments and uses the overload above.
inline std::intptr_t Times::op(std::uint64_t *a, std::int64_t b)
{ return Times::op(b, a);
}
9152
9153 // For big multi-digit numbers squaring can be done almost twice as fast
9154 // as general multiplication.
9155 // eg (a0,a1,a2,a3)^2 can be expressed as
9156 // a0^2+a1^2+a2^2+a3^2 + 2*(a0*a1+a0*a2+a0*a3+a1*a2+a1*a3+a2*a3)
9157 // where the part that has been doubled uses symmetry to reduce the work.
9158 //
9159 // For negative inputs I can form the product first treating the inputs
9160 // as if they had been unsigned, and then subtract 2*2^w*a from the result.
9161 //
9162 // I think my view here is that I should still be willing to move across
9163 // to Karatsuba, but only at a distinctly larger threshold than for
9164 // simple multiplication. Just where that threshold should be is not really
9165 // clear to me, but for now I am setting it as 3 times the point at which
9166 // ordinary multiplications moves on from classical methods.
9167
// Square the (signed) bignum a, writing the result into r and its trimmed
// length into lenr. r must have room for 2*lena digits. For large inputs
// this just defers to bigmultiply (so Karatsuba is used); otherwise the
// classical scheme is used, exploiting symmetry to roughly halve the work.
inline void bigsquare(std::uint64_t *a, std::size_t lena,
                      std::uint64_t *r, std::size_t &lenr)
{ if (lena > 3*KARATSUBA_CUTOFF)
  { bigmultiply(a, lena, a, lena, r, lenr);
    return;
  }
  for (std::size_t i=0; i<2*lena; i++) r[i] = 0;
  std::uint64_t carry;
  lenr = 2*lena;
// First accumulate all the cross products a[i]*a[j] with i<j.
  for (std::size_t i=0; i<lena; i++)
  { std::uint64_t hi = 0;
// Note that all the terms I add in here will need to be doubled in the
// final accounting.
    for (std::size_t j=i+1; j<lena; j++)
    { std::uint64_t lo;
      multiply64(a[i], a[j], hi, hi, lo);
      hi += add_with_carry(lo, r[i+j], r[i+j]);
    }
    r[i+lena] = hi;
  }
// Double the part that has been computed so far.
  carry = 0;
  for (std::size_t i=0; i<2*lena; i++)
  { std::uint64_t w = r[i];
    r[i] = (w << 1) | carry;
    carry = w >> 63;
  }
// Now add in the bits that do not get doubled (the a[i]^2 terms).
  carry = 0;
  std::uint64_t hi = 0;
  for (std::size_t i=0; i<lena; i++)
  { std::uint64_t lo;
    multiply64(a[i], a[i], r[2*i], hi, lo);
    carry = add_with_carry(lo, carry, r[2*i]);
    carry = add_with_carry(hi, r[2*i+1], carry, r[2*i+1]);
  }
// Now if the input had been negative I have a correction to apply...
// I subtract 2a from the top half of the result.
  if (negative(a[lena-1]))
  { std::uint64_t carry = 1;
    int fromprev = 0;
    for (std::size_t i=0; i<lena; i++)
    { std::uint64_t d = a[i];
      std::uint64_t w = (d<<1) | fromprev;
      fromprev = static_cast<int>(d>>63);
      carry = add_with_carry(r[lena+i], ~w, carry, r[lena+i]);
    }
  }
// The actual value may be 1 word shorter than this.
// test top digit of r and if necessary reduce lenr.
  truncate_positive(r, lenr);
  truncate_negative(r, lenr);
}
9221
op(std::uint64_t * a)9222 inline std::intptr_t Square::op(std::uint64_t *a)
9223 { std::size_t lena = number_size(a);
9224 std::size_t n = 2*lena;
9225 push(a);
9226 std::uint64_t *p = reserve(n);
9227 pop(a);
9228 std::size_t final_n;
9229 bigsquare(a, lena, p, final_n);
9230 return confirm_size(p, n, final_n);
9231 }
9232
// Square a machine integer, returning a handle. The result fits in at
// most two 64-bit digits.
inline std::intptr_t Square::op(std::int64_t a)
{ std::uint64_t hi, lo;
  multiply64(a, a, hi, lo);
// multiply64 formed the unsigned product: if a was negative its unsigned
// interpretation was a+2^64, so subtract 2a from the high word to get
// the (non-negative) signed square.
  if (a < 0) hi -= 2u*static_cast<std::uint64_t>(a);
// Now I have a 128-bit product of the inputs
  if ((hi == 0 && positive(lo)) ||
      (hi == static_cast<std::uint64_t>(-1) && negative(lo)))
  { if (fits_into_fixnum(static_cast<std::int64_t>(lo)))
        return int_to_handle(static_cast<std::int64_t>(lo));
    else
    { std::uint64_t *p = reserve(1);
      p[0] = lo;
      return confirm_size(p, 1, 1);
    }
  }
  std::uint64_t *p = reserve(2);
  p[0] = lo;
  p[1] = hi;
  return confirm_size(p, 2, 2);
}
9253
// Integer square root of a multi-digit bignum, by Newton iteration
// starting from a power-of-2 estimate with about half the bit-length of
// the input. Returns floor(sqrt(a)) as a handle.
inline std::intptr_t Isqrt::op(std::uint64_t *a)
{ std::size_t lena = number_size(a);
  if (lena == 1) return Isqrt::op(static_cast<std::int64_t>(a[0]));
  std::size_t lenx = (lena+1)/2;
  push(a);
  std::uint64_t *x = reserve(lenx);
  pop(a);
  for (std::size_t i=0; i<lenx; i++) x[i] = 0;
// Pick the initial estimate: 2^(bits(a)/2), adjusted when a has an even
// number of digits.
  std::size_t bitstop = a[lena-1]==0 ? 0 : 64 - nlz(a[lena-1]);
  bitstop /= 2;
  if ((lena%2) == 0) bitstop += 32;
  x[lenx-1] = static_cast<std::uint64_t>(1) << bitstop;
  if (bitstop == 63) x[lenx-1]--; // ensure it is still positive!
// I now have a first approximation to the square root as a number that is
// a power of 2 with about half the bit-length of a. I will degenerate into
// using generic arithmetic here even though that may have extra costs.
//
// I could perhaps reasonably use uint64_t arithmetic for a first few
// iterations, only looking at the most significant digit of the input.
// That would save time, however at present I do not expect this function
// to be time critical in any plausible application, and so I will keep
// things simple(er).
  Bignum biga(true, vector_to_handle(a));
  Bignum bigx(true, confirm_size(x, lenx, lenx));
// I will do the first step outside the loop to guarantee that my
// approximation is an over-estimate before I try the end-test.
//         bigx = (bigx + biga/bigx) >> 1;
// The push/pop mess here feels extreme and I should probably re-code this
// using lower level interfaces.
  push(bigx.val); push(biga.val);
  Bignum w1 = biga/bigx;
  pop(biga.val); pop(bigx.val);
  push(bigx.val); push(biga.val);
  w1 = bigx + w1;
  pop(biga.val); pop(bigx.val);
  push(bigx.val); push(biga.val);
  bigx = w1 >> 1;
  pop(biga.val); pop(bigx.val);
// Iterate y = (x + a/x)/2 until the estimate stops decreasing; at that
// point x is the floor of the square root.
  for (;;)
  { push(bigx.val); push(biga.val);
    w1 = biga/bigx;
    pop(biga.val); pop(bigx.val);
    push(bigx.val); push(biga.val);
    w1 = bigx + w1;
    pop(biga.val); pop(bigx.val);
    push(bigx.val); push(biga.val);
    Bignum y = w1 >> 1;
    pop(biga.val); pop(bigx.val);
    if (y >= bigx) break;
    bigx = y;
  }
// The Bignum "biga" encapsulated my argument: when its destructor is called
// I do not want the input vector "a" to be clobbered, so I clobber the
// bignum first to break the link. Ditto bigx.
  biga.val = 0;
  std::intptr_t r = bigx.val;
  bigx.val = 0;
  return r;
}
9313
// Integer square root of a machine integer: returns floor(sqrt(aa)) as
// a bignum handle. Non-positive inputs yield 0.
inline std::intptr_t Isqrt::op(std::int64_t aa)
{ if (aa <= 0) return int_to_bignum(0);
  std::uint64_t a = static_cast<std::uint64_t>(aa);
  std::size_t w = 64 - nlz(a);
// Initial estimate: a >> (bits/2), i.e. roughly 2^(bits/2).
  std::uint64_t x0 = a >> (w/2);
// The iteration here converges to sqrt(a) from above, but I believe that
// when the value stops changing it will be at floor(sqrt(a)). There are
// some cases where the sequence ends up alternating between two adjacent
// values. Because my input is at most 2^63-1 the number of iterations
// written here will always suffice.
  std::uint64_t x1 = (x0 + a/x0)/2;
  std::uint64_t x2 = (x1 + a/x1)/2;
  if (x2 >= x1) return unsigned_int_to_bignum(x1);
  std::uint64_t x3 = (x2 + a/x2)/2;
  if (x3 >= x2) return unsigned_int_to_bignum(x2);
  std::uint64_t x4 = (x3 + a/x3)/2;
  if (x4 >= x3) return unsigned_int_to_bignum(x3);
  std::uint64_t x5 = (x4 + a/x4)/2;
  if (x5 >= x4) return unsigned_int_to_bignum(x4);
  return unsigned_int_to_bignum(x5);
}
9335
9336 // This raises a bignum to a positive integer power. If the power is n then
9337 // the size of the output may be n*lena. The two vectors v and w are workspace
9338 // and must both be of size (at least) the size that the result could end
9339 // up as. Well with greater subtlty we can see that the sum of their sizes
9340 // must be at least the size of the result, but it is not clear that any
9341 // useful space saving can be found down that path.
9342
// Raise bignum a to the power n by binary (square-and-multiply)
// exponentiation. v and w are workspace vectors, each at least as long
// as the final result can be; the result lands in r with its length in
// lenr. maxlenr is used only for overflow assertions.
inline void bigpow(std::uint64_t *a, std::size_t lena,
                   std::uint64_t n,
                   std::uint64_t *v,
                   std::uint64_t *w,
                   std::uint64_t *r, std::size_t &lenr, std::size_t maxlenr)
{ if (n == 0)
// NOTE(review): this makes a^0 come out as 0, not 1. The dispatch code
// is documented as filtering out n == 0 before calling here, so this
// branch looks unreachable in practice — confirm before relying on it.
  { r[0] = 0;
    lenr = 1;
    return;
  }
// Invariant: result = v^n * w, with w starting at 1.
  internal_copy(a, lena, v);
  std::size_t lenv = lena;
  w[0] = 1;
  std::size_t lenw = 1;
  while (n > 1)
  { if (n%2 == 0)
    { bigsquare(v, lenv, r, lenr);
      arithlib_assert(lenr <= maxlenr);
      internal_copy(r, lenr, v);
      lenv = lenr;
      n = n/2;
    }
    else
// Odd exponent: fold one factor of v into w, then square v.
    { bigmultiply(v, lenv, w, lenw, r, lenr);
      arithlib_assert(lenr <= maxlenr);
      internal_copy(r, lenr, w);
      lenw = lenr;
      bigsquare(v, lenv, r, lenr);
      arithlib_assert(lenr <= maxlenr);
      internal_copy(r, lenr, v);
      lenv = lenr;
      n = (n-1)/2;
    }
  }
// n == 1 here, so the answer is v*w.
  bigmultiply(v, lenv, w, lenw, r, lenr);
}
9379
9380 // In cases where n is too large this can fail. At present I deal with that
9381 // with arithlib_assert() statements rather than any comfortable scheme for
9382 // reporting the trouble.
9383
9384 // The code that dispatches into here should have filtered cases such that
9385 // the exponent n is not 0, 1 or 2 here.
9386
op(std::uint64_t * a,std::int64_t n)9387 inline std::intptr_t Pow::op(std::uint64_t *a, std::int64_t n)
9388 { std::size_t lena = number_size(a);
9389 // 1^(-n) == 1,
9390 // (-1)^(-n) == 1 if n is even or -1 if n is odd.
9391 // a^(-n) == 0 when a is not -1, 0 or 1.
9392 if (n < 0)
9393 { int z = 0;
9394 if (lena == 0)
9395 { if (static_cast<std::int64_t>(a[0]) == 1) z = 1;
9396 else if (static_cast<std::int64_t>(a[0]) == -1)
9397 z = (n%1==0 ? 1 : -1);
9398 else arithlib_assert(a[0] != 0u);
9399 }
9400 // 0^(-n) is an error
9401 // 1^(-n) = 1
9402 // (-1)^(-n) = +1 or -1 depending on whether n is odd or even
9403 // x^(-n) = 0 otherwise.
9404 return int_to_bignum(z);
9405 }
9406 // 6^n = 0
9407 std::size_t bitsa = bignum_bits(a, lena);
9408 std::uint64_t hi, bitsr;
9409 multiply64(n, bitsa, hi, bitsr);
9410 arithlib_assert(hi ==
9411 0); // Check that size is at least somewhat sane!
9412 // I estimate the largest size that my result could be, but then add
9413 // an extra word because the internal working of multiplication can
9414 // write a zero beyond its true result - eg if you are multiplying a pair
9415 // of 1-word numbers together it will believe that the result could be 2
9416 // words wide even if in fact you know it will not be.
9417 std::uint64_t lenr1 = 2 + bitsr/64;
9418 std::size_t lenr = static_cast<std::size_t>(lenr1);
9419 // if size_t was more narrow than 64-bits I could lose information in
9420 // truncating from uint64_t to size_t.
9421 arithlib_assert(lenr == lenr1);
9422 std::uint64_t olenr = lenr;
9423 push(a);
9424 std::uint64_t *r = reserve(lenr);
9425 push(r);
9426 std::uint64_t *v = reserve(lenr);
9427 push(v);
9428 std::uint64_t *w = reserve(lenr);
9429 pop(v); pop(r); pop(a);
9430 bigpow(a, lena, static_cast<std::uint64_t>(n), v, w, r, lenr, lenr);
9431 arithlib_assert(lenr <= olenr);
9432 abandon(w);
9433 abandon(v);
9434 return confirm_size(r, olenr, lenr);
9435 }
9436
9437 // Again the cases n = 0, 1 and 2 have been filtered out
9438
op(std::int64_t a,std::int64_t n)9439 inline std::intptr_t Pow::op(std::int64_t a, std::int64_t n)
9440 { if (n < 0)
9441 { int z = 0;
9442 if (a == 1) z = 1;
9443 else if (a == -1) z = (n%1==0 ? 1 : 0);
9444 else arithlib_assert(a != 0);
9445 return int_to_handle(z);
9446 }
9447 if (a == 0) return int_to_handle(0);
9448 else if (a == 1) return int_to_handle(a);
9449 else if (n == 0) return int_to_handle(1);
9450 std::uint64_t absa = (a < 0 ? -static_cast<std::uint64_t>
9451 (a) : static_cast<std::uint64_t>(a));
9452 std::size_t bitsa = 64 - nlz(absa);
9453 std::uint64_t hi, bitsr;
9454 multiply64(n, bitsa, hi, bitsr);
9455 arithlib_assert(hi ==
9456 0); // Check that size is at least somewhat sane!
9457 std::uint64_t lenr1 = 2 + bitsr/64;
9458 if (bitsr < 64) // Can do all the work as machine integers.
9459 { std::int64_t result = 1;
9460 for (;;)
9461 { if (n%2 != 0) result *= a;
9462 if ((n = n/2) == 0) break;
9463 a *= a;
9464 }
9465 return int_to_bignum(result);
9466 }
9467 std::size_t lenr = static_cast<std::size_t>(lenr1);
9468 // if size_t was more narrow than 64-bits I could lose information in
9469 // truncating from uint64_t to size_t.
9470 arithlib_assert(lenr == lenr1);
9471 std::uint64_t olenr = lenr;
9472 std::uint64_t *r = reserve(lenr);
9473 push(r);
9474 std::uint64_t *v = reserve(lenr);
9475 push(v);
9476 std::uint64_t *w = reserve(lenr);
9477 pop(v); pop(r);
9478 std::uint64_t aa[1] = {static_cast<std::uint64_t>(a)};
9479 bigpow(aa, 1, static_cast<std::uint64_t>(n), v, w, r, lenr, lenr);
9480 arithlib_assert(lenr <= olenr);
9481 abandon(w);
9482 abandon(v);
9483 return confirm_size(r, olenr, lenr);
9484 }
9485
// Raising a bignum to a floating point power: convert the base to a
// double and defer to std::pow.
inline double Pow::op(std::uint64_t *a, double n)
{ return std::pow(Double::op(a), n);
}
9489
// Raising a machine integer to a floating point power: convert the base
// to a double and defer to std::pow.
inline double Pow::op(std::int64_t a, double n)
{ return std::pow(Double::op(a), n);
}
9493
9494 //=========================================================================
9495 //=========================================================================
9496
9497
9498 //
9499 // Division with the main number representation being 2s complement
9500 // turns out to have many messy special cases! Here are some of the
9501 // underlying issues:
9502 // . Inputs may have had initial 0 or -1 digits pre-pended to allow
9503 // for positive values with their top bit set and negative ones with
9504 // it clear. So if I had 8-bit words the number 128 would have an
9505 // unsigned representation of [0x80], but it has to be stored as a
9506 // two digit number [0x00,0x80]. Similarly some negative numbers
9507 // need an initial 0xff attached just so that it can be seen that they
9508 // are negative.
9509 // . If a result (quotient or remainder) is as above then space can be
9510 // needed for the prefix digit.
9511 // . Long division needs to have a dividend with at least 3 digits
9512 // (after scaling) and a divisor with at least 2. That means that various
9513 // small cases have to be treated specially.
9514 // . An operation as basic as taking the absolute value of an integer
9515 // generally involves allocating memory, and I would like to avoid that
9516 // as much as I can.
// . quotient and remainder operations are very similar indeed, but I ought
//   to be able to save memory allocation in one or the other. Specifically
9519 // if I am computing a remainder I can discard quotient digits as I go
9520 // rather than having anywhere to put them.
9521 // . On many useful platforms I will have an integer type that is 128 bits
9522 // wide and I can use that for a 128-by-64 division operation that is
9523 // really helpful when working with 64-bit digits. It is possible that
9524 // if I do not have 128-bit arithmetic available it would be best to
9525 // treat my numbers as being in base 2^32 so that 64-by-32 division is
9526 // the order of the day as a basic primitive.
9527 // . For Lisp purposes I will have "fixnums" as well as "bignums" so special
9528 // paths for arithmetic that involves values -2^59 to 2^59-1 will be
9529 // required.
9530 //
9531 // Well perhaps I am fussing about all the above. But my first drafts of this
9532 // code has not thought through all the cases carefully enough!
9533
9534
// Divide the bignum a by b, returning a quotient or a remainder or
// both. Note that at this stage a may still be negative! The value b is
// passed in sign and magnitude form as {b, b_negative}
9538
// Divide the bignum a (which may still be negative) by a one-word
// divisor given in sign-and-magnitude form as {b, b_negative}. The
// quotient is produced only if want_q is set and the remainder only if
// want_r is set; each result comes back through (ptr, olen, len) triples
// where olen is the allocated length and len the in-use length.

inline void unsigned_short_division(std::uint64_t *a,
                                    std::size_t lena,
                                    std::uint64_t b, bool b_negative,
                                    bool want_q, std::uint64_t *&q,
                                    std::size_t &olenq, std::size_t &lenq,
                                    bool want_r, std::uint64_t *&r,
                                    std::size_t &olenr, std::size_t &lenr)
{   std::uint64_t hi = 0;
    bool a_negative = false;
    std::uint64_t *aa;
    if (negative(a[lena-1]))
    {   a_negative = true;
// Take absolute value of a if necessary. The push/pop pair protects the
// input vector across the allocation performed by reserve().
        push(a);
        aa = reserve(lena);
        pop(a);
        internal_negate(a, lena, aa);
        a = aa;
    }
// Now both a and b are positive so I can do the division fairly simply.
// Allocate space for the quotient if I need that, and then do standard
// short division.
    std::size_t i=lena-1;
    if (want_q)
    {   olenq = lena;
        push(a);
        q = reserve(olenq);
        pop(a);
    }
// Classical short division from the most significant digit downwards,
// with the running remainder carried in hi.
    for (;;)
    {   std::uint64_t d;
        divide64(hi, a[i], b, d, hi);
        if (want_q) q[i] = d;
        if (i == 0) break;
        i--;
    }
// If the original a had been negative I allocated space to store its
// absolute value, and I can discard that now.
    if (a_negative) abandon(aa);
    if (want_q)
    {   lenq = lena;
// The quotient will be negative if divisor and dividend had different signs.
        if (a_negative != b_negative)
        {   internal_negate(q, lenq, q);
            truncate_negative(q, lenq);
        }
        else truncate_positive(q, lenq);
    }
    if (want_r)
    {
// The remainder is now present as an unsigned value in hi. The sign it
// must end up having must match the sign of a (the dividend). Furthermore
// the remainder will be strictly smaller than b, and the largest possible
// value for b is 0xffffffffffffffff. The remainder may need to be returned
// as a 2-digit bignum.
        if (want_q) push(q);
        if (a_negative)
        {   hi = -hi;
// A negative remainder whose top bit is clear needs a -1 padder digit so
// that the 2s complement representation reads as negative.
            if (positive(hi) && hi!=0)
            {   olenr = lenr = 2;
                r = reserve(olenr);
                r[0] = hi;
                r[1] = -1;
            }
            else
            {   olenr = lenr = 1;
                r = reserve(olenr);
                r[0] = hi;
            }
        }
        else
        {
// A positive remainder whose top bit is set needs a 0 padder digit.
            if (negative(hi))
            {   olenr = lenr = 2;
                r = reserve(olenr);
                r[0] = hi;
                r[1] = 0;
            }
            else
            {   olenr = lenr = 1;
                r = reserve(olenr);
                r[0] = hi;
            }
        }
        if (want_q) pop(q);
    }
}
9625
signed_short_division(std::uint64_t * a,std::size_t lena,std::int64_t b,bool want_q,std::uint64_t * & q,std::size_t & olenq,std::size_t & lenq,bool want_r,std::uint64_t * & r,std::size_t & olenr,std::size_t & lenr)9626 inline void signed_short_division(std::uint64_t *a, std::size_t lena,
9627 std::int64_t b,
9628 bool want_q, std::uint64_t *&q,
9629 std::size_t &olenq, std::size_t &lenq,
9630 bool want_r, std::uint64_t *&r,
9631 std::size_t &olenr, std::size_t &lenr)
9632 { if (b > 0) unsigned_short_division(a, lena,
9633 static_cast<std::uint64_t>(b),
9634 false,
9635 want_q, q, olenq, lenq,
9636 want_r, r, olenr, lenr);
9637 else unsigned_short_division(a, lena, -static_cast<std::uint64_t>(b),
9638 true,
9639 want_q, q, olenq, lenq,
9640 want_r, r, olenr, lenr);
9641 }
9642
// Forward declaration: division() just below needs to call the workhorse
// long-division routine whose definition appears later in this file.

inline void unsigned_long_division(std::uint64_t *a,
                                   std::size_t &lena,
                                   std::uint64_t *b, std::size_t &lenb,
                                   bool want_q, std::uint64_t *q,
                                   std::size_t &olenq, std::size_t &lenq);
9648
9649 // The following is a major entrypoint to the division code. (a) and (b) are
9650 // vectors of digits such that the top digit of a number is treated as signed
9651 // and the lower ones as unsigned. To cope with this there will sometimes
9652 // be a sort of initial padder digit. The two boolean values indicate whether
9653 // either or both of quotient and remainder are required. if want_q is set
9654 // then this creates a new vector for q and return it via q/lenq. Similarly
9655 // for want_r. The inputs a and b can be bignums of any size and are allowed
9656 // to be positive or negative - this sorts everything out.
9657
9658 // Divide a by b to obtain a quotient q and a remainder r.
9659
inline void division(std::uint64_t *a, std::size_t lena,
                     std::uint64_t *b, std::size_t lenb,
                     bool want_q, std::uint64_t *&q, std::size_t &olenq, std::size_t &lenq,
                     bool want_r, std::uint64_t *&r, std::size_t &olenr, std::size_t &lenr)
{   arithlib_assert(want_q || want_r);
// First I will filter out a number of cases where the divisor is "small".
// I only want to proceed into the general case code if it is a "genuine"
// big number with at least two digits. This bit of the code is messier
// than one might have imagined because of the 2s complement representation
// I use and the fact that extreme values that almost fit in a single
// digit can end up as 2-digit values with a degenerate top digit.
//
// The first case is when the single digit of b is a signed value in the
// range -2^63 to 2^63-1.
    if (lenb == 1)
    {
// At present I cause an attempt to divide by zero to crash with an
// arithlib_assert failure if I have built in debug mode or to do who
// knows what (possibly raise an exception) otherwise. This maybe needs
// review. I wonder if I should throw a "division by zero" exception?
        arithlib_assert(b[0] != 0); // would be division by zero
        signed_short_division(a, lena, static_cast<std::int64_t>(b[0]),
                              want_q, q, olenq, lenq,
                              want_r, r, olenr, lenr);
        return;
    }
// Next I have b in the range 2^63 to 2^64-1. Such values can be represented
// in uint64_t.
    else if (lenb == 2 && b[1]==0)
    {   unsigned_short_division(a, lena, b[0], false,
                                want_q, q, olenq, lenq,
                                want_r, r, olenr, lenr);
        return;
    }
// Now for b in -2^64 to -2^63-1. The 2s complement representation will be
// of the form (-1,nnn) with nnn an unsigned 64-bit value.
    else if (lenb == 2 && b[1]==static_cast<std::uint64_t>(-1))
    {
// -b(0) is an unsigned representation of the absolute value of b. There is
// one special case when -b(0) is zero, and that corresponds to division
// by -2^64, so I will need to detect that and turn the division into a
// combination of shift and negate operations.
        if (b[0] == 0)
        {   if (want_q)
            {   lenq = lena;
                olenq = lena;
                push(a);
                q = reserve(lena);
                pop(a);
// The next line took me some while to arrive at!
                std::uint64_t carry = !negative(a[lena-1]) || a[0]==0 ? 1 : 0;
                for (std::size_t i=1; i<lena; i++)
                    carry = add_with_carry(~a[i], carry, q[i-1]);
                q[lena-1] = negative(a[lena-1]) ? 0 : -static_cast<std::uint64_t>(1);
                truncate_positive(q, lenq);
                truncate_negative(q, lenq);
            }
            if (want_r)
            {
// The remainder is the bottom digit of a, but may need a padder digit so
// that its sign (which must match the sign of a) shows up correctly in
// the 2s complement representation.
                std::uint64_t rr = a[0], padr = 0;
                lenr = 1;
                if (negative(a[lena-1]) && positive(rr))
                {   padr = -1;
                    lenr++;
                }
                else if (positive(a[lena-1]) && negative(rr))
                {   padr = 0;
                    lenr++;
                }
                if (want_q) push(q);
                r = reserve(lenr);
                if (want_q) pop(q);
                olenr = lenr;
                r[0] = rr;
                if (lenr != 1) r[1] = padr;
            }
            return;
        }
        unsigned_short_division(a, lena, -b[0], true,
                                want_q, q, olenq, lenq,
                                want_r, r, olenr, lenr);
        return;
    }
// Now the absolute value of b will be at least 2 digits of 64-bits with the
// high digit non-zero. I need to make a copy of it because I will scale
// it during long division.
    std::uint64_t *bb = NULL;
    std::size_t lenbb = lenb;
    bool b_negative = negative(b[lenb-1]);
    if (b_negative)
    {
// In the case that b is negative I will want its absolute value. Especially
// in a multi-thread world I must not disturb or overwrite the input vector,
// so I create a temporary copy of b to negate. In my full 2s complement
// representation negating -2^(64*n-1) would lead to what was supposed to be
// a positive value but it would have its top bit set so it would require
// an extra leading 0. Because the value I generate here is to be treated
// as unsigned this leading top bit does not matter and so the absolute value
// of b fits in the same amount of space that b did with no risk of overflow.
        push(a); push(b);
        bb = reserve(lenb);
        pop(b); pop(a);
        olenr = lenb;
        internal_negate(b, lenb, bb);
        if (bb[lenbb-1] == 0) lenbb--;
    }
    else if (b[lenb-1] == 0) lenbb--;
    arithlib_assert(lenbb >= 2);
// Now I should look at the dividend. If it is shorter than the divisor
// then I know that the quotient will be zero and the dividend will be the
// remainder. If I had made this test before normalizing the divisor I could
// have needed to worry about the case of (-2^(64n-1))/(2^(64n-1)) where the
// divisor would have had an initial padding zero so would have shown up
// as longer than the dividend but the quotient would have needed to come out
// as 1. But here with the divisor made tidy this test is safe!
    if (lena < lenbb)
    {   if (want_q)
        {   q = reserve(1);
            olenq = 1;
            q[0] = 0;
            lenq = 1;
        }
        if (want_r)
        {   push(a);
            r = reserve(lena);
            pop(a);
            olenr = lena;
            internal_copy(a, lena, r);
            lenr = lena;
        }
        if (b_negative) abandon(bb);
        return;
    }
// Now lena >= lenb >= 2 and I will need to do a genuine long division. This
// will need me to allocate some workspace.
//
// Because I will scale the divisor I need that to be a copy of the
// original data even if that has been positive and so I had not copied
// it yet. I delay creation of that copy until now because that lets me
// avoid a spurious allocation in the various small cases.
    if (!b_negative)
    {   push(a); push(b);
        bb = reserve(lenb);
        pop(b); pop(a);
        olenr = lenb;
        internal_copy(b, lenbb, bb);
    }
#ifdef DEBUG_OVERRUN
    if (debug_arith) arithlib_assert(bb[olenr] == 0xaaaaaaaaaaaaaaaa);
#endif
// If I actually return the quotient I may need to add a leading 0 or -1 to
// make its 2s complement representation valid. Hence the "+2" rather than
// the more obvious "+1" here.
    if (want_q)
    {   lenq = lena - lenb + 2;
        push(a); push(b); push(bb);
        q = reserve(lenq);
        pop(bb); pop(b); pop(a);
        olenq = lenq;
    }
// I will need space where I store something that starts off as a scaled
// copy of the dividend and gradually have values subtracted from it until
// it ends up as the remainder.
    lenr = lena;
    push(a); push(b); push(bb);
    if (want_q) push(q);
    r = reserve(lenr+1);
    if (want_q) pop(q);
    pop(bb); pop(b); pop(a);
    bool a_negative = negative(a[lena-1]);
// The long division is performed on absolute values, with signs fixed up
// afterwards.
    if (a_negative) internal_negate(a, lena, r);
    else internal_copy(a, lena, r);
    unsigned_long_division(r, lenr, bb, lenbb, want_q, q, olenq, lenq);
#ifdef DEBUG_OVERRUN
    if (debug_arith) arithlib_assert(r[lena+1] == 0xaaaaaaaaaaaaaaaa);
#endif
// While performing the long division I will have had three vectors that
// were newly allocated. r starts off containing a copy of a but ends up
// holding the remainder. It is rather probable that this remainder will
// often be a distinctly shorter vector than a was. The vector q is only
// created and used if want_q was set, and it ends up holding the quotient.
// Finally bb holds the absolute value of the divisor but scaled up by a
// power of 2 so that its leading digit has its top bit set. Well the actual
// remainder is smaller than the divisor and so it will be a closer fit into
// bb than r. So copy it into there so that the allocate/abandon and
// size confirmation code is given less extreme things to cope with.
    arithlib_assert(lenr<=lenb);
    if (want_r) internal_copy(r, lenr, bb);
    abandon(r);
    if (want_q)
    {   if (negative(q[lenq-1]))
        {   arithlib_assert(lenq < olenq);
            q[lenq++] = 0;
        }
// The quotient is negative exactly when the operand signs differed.
        if (a_negative != b_negative)
        {   internal_negate(q, lenq, q);
            truncate_negative(q, lenq);
        }
        else truncate_positive(q, lenq);
    }
// else abandon(q);
    if (want_r)
    {   r = bb;
// The remainder takes the sign of the dividend, and may need an extra
// padder digit so the 2s complement representation shows that sign.
        if (negative(r[lenr-1]))
        {   arithlib_assert(lenr < olenr);
            r[lenr++] = 0;
        }
        if (a_negative)
        {   internal_negate(r, lenr, r);
            truncate_negative(r, lenr);
        }
        else truncate_positive(r, lenr);
    }
    else abandon(bb);
}
9874
9875 // During long division I will scale my numbers by shifting left by an
9876 // amount s. I do that in place. The shift amount will be such that
9877 // the divisor ends up with the top bit of its top digit set. The
9878 // dividend will need to extend into an extra digit, and I deal with that
9879 // by returning the overflow word as a result of the scaling function. Note
9880 // that the shift amount will be in the range 0-63.
9881
9882
// Shift the lenr-digit number in r left, in place, by s bits (with s in
// the range 0-63), returning the bits that overflow out of the top digit.
inline std::uint64_t scale_for_division(std::uint64_t *r,
                                        std::size_t lenr,
                                        int s)
{
// A shift by zero is handled specially - partly because it involves no
// data movement at all, but mainly because the general code below would
// perform a right shift by 64, which is outside the range C++ permits.
    if (s == 0) return 0;
    std::uint64_t overflow = 0;
    for (std::size_t k=0; k<lenr; k++)
    {   std::uint64_t word = r[k];
        r[k] = (word << s) | overflow;
        overflow = word >> (64-s);
    }
    return overflow;
}
9901
9902 // r = r - b*q*base^(lena-lenb-1).
9903
// Subtract q0*b from the top digits of r, ie r -= b*q0*base^(lenr-lenb-1).
// The subtraction is implemented as addition of the ones-complement of
// the product digits with an initial carry-in of 1, hence carry starting
// at 1 and the ~lo / ~hi below.

inline void multiply_and_subtract(std::uint64_t *r, std::size_t lenr,
                                  std::uint64_t q0,
                                  std::uint64_t *b, std::size_t lenb)
{   arithlib_assert(lenr > lenb);
    std::uint64_t hi = 0, lo, carry = 1;
    for (std::size_t i=0; i<lenb; i++)
    {
// This 5-argument form of multiply64 folds the previous high word into
// the product as it goes.
        multiply64(b[i], q0, hi, hi, lo);
// lo is now the next digit of b*q, and hi needs to be carried up to the
// next one.
        carry = add_with_carry(r[i+lenr-lenb-1], ~lo, carry,
                               r[i+lenr-lenb-1]);
    }
// Fold the final high word of the product (and pending carry) into the
// top digit of r.
    r[lenr-1] = r[lenr-1] + ~hi + carry;
}
9918
9919 // add_back_correction() is used when a quotient digit was mis-predicted by
9920 // 1 and I detect that when I calculate r = r - b*q and end up with r negative
9921 // result. I fix things up by decrementing q and going
9922 // r = r + (b<<(lenr-lenb-1))
9923
add_back_correction(std::uint64_t * r,std::size_t lenr,std::uint64_t * b,std::size_t lenb)9924 inline void add_back_correction(std::uint64_t *r, std::size_t lenr,
9925 std::uint64_t *b, std::size_t lenb)
9926 { arithlib_assert(lenr > lenb);
9927 std::uint64_t carry = 0;
9928 for (std::size_t i=0; i<lenb; i++)
9929 carry = add_with_carry(r[i+lenr-lenb-1], b[i], carry,
9930 r[i+lenr-lenb-1]);
9931 r[lenr-1] += carry;
9932 }
9933
// Produce the next (most significant first) digit of the quotient and
// subtract q0*b from the top of r, shortening r by one digit. This is
// the quotient-estimation step of classical long division (cf. Knuth's
// Algorithm D): the trial digit from a 128-by-64 division can be high by
// at most 2 and is corrected below.

inline std::uint64_t next_quotient_digit(std::uint64_t *r,
                                         std::size_t &lenr,
                                         std::uint64_t *b, std::size_t lenb)
{   arithlib_assert(lenr > lenb);
    arithlib_assert(lenb >= 2);
    arithlib_assert(b[lenb-1] != 0);
    std::uint64_t q0, r0;
// If the two leading digits are equal the true trial quotient would not
// fit in 64 bits, so clamp the estimate to the largest digit value.
    if (r[lenr-1] == b[lenb-1])
    {   q0 = static_cast<std::uint64_t>(-1);
        r0 = r[lenr-2] + b[lenb-1];
// Here perhaps q0 is still an over-estimate by 1?
    }
    else
    {   divide64(r[lenr-1], r[lenr-2], b[lenb-1], q0, r0);
// At this stage q0 may be correct or it may be an over-estimate by 1 or 2,
// but never any worse than that.
//
// The tests on the next lines should detect all cases where q0 was in error
// by 2 and most when it was in error by 1.
        std::uint64_t hi, lo;
        multiply64(q0, b[lenb-2], hi, lo);
        if (hi > r0 ||
            (hi == r0 && lo > r[lenr-3])) q0--;
    }
//
// Now I want to go "r = r - b*q0*2^(64*(lenr-lenb));" so that r
// is set to an accurate remainder after using q0 as (part of) the
// quotient. This may carry an overshoot into atop and if so I will need
// to reduce q0 again and compensate.
//
    multiply_and_subtract(r, lenr, q0, b, lenb);
// If the subtraction went negative the estimate was still one too high:
// decrement it and add b back in.
    if (negative(r[lenr-1]))
    {   q0--;
        add_back_correction(r, lenr, b, lenb);
    }
    lenr--; // r is now one digit shorter.
    return q0;
}
9972
9973 // r is an unsigned number. Shift right (in place) by s bits, where s
9974 // is in the range 0 - 63. The bits shifted out to the right should all
9975 // be zero.
9976
unscale_for_division(std::uint64_t * r,std::size_t & lenr,int s)9977 inline void unscale_for_division(std::uint64_t *r, std::size_t &lenr,
9978 int s)
9979 { if (s != 0)
9980 { std::uint64_t carry = 0;
9981 std::size_t i = lenr-1;
9982 for (;;)
9983 { std::uint64_t w = r[i];
9984 r[i] = (w >> s) | carry;
9985 carry = w << (64-s);
9986 if (i == 0) break;
9987 i--;
9988 }
9989 arithlib_assert(carry==0);
9990 }
9991 truncate_unsigned(r, lenr);
9992 }
9993
9994 // This function does long division on unsigned values, computing the
9995 // quotient (a/b). In doing so it updates (a) so that at the end it holds
9996 // the remainder. It only fills in a value for the quotient q if want_q is
9997 // true. Note also that this code will scale (b) so that the top bit of its
9998 // highest digit is a "1", so b must be an array that can be overwritten
9999 // without disturbing code elsewhere.
10000
inline void unsigned_long_division(std::uint64_t *a,
                                   std::size_t &lena,
                                   std::uint64_t *b, std::size_t &lenb,
                                   bool want_q, std::uint64_t *q,
                                   std::size_t &olenq, std::size_t &lenq)
{   arithlib_assert(lenb >= 2);
    arithlib_assert(lena >= lenb);
// I will multiply a and b by a scale factor that gets the top digit of "b"
// reasonably large. The value stored in "a" can become one digit longer,
// but there is space to store that.
//
// The scaling is done here using a shift, which seems cheaper to sort out
// than multiplication by a single-digit value.
    arithlib_assert(b[lenb-1] != 0);
    int ss = nlz(b[lenb-1]);
// When I scale the dividend expands into an extra digit but the scale
// factor has been chosen so that the divisor does not.
    a[lena] = scale_for_division(a, lena, ss);
    lena++;
    arithlib_assert(scale_for_division(b, lenb, ss) == 0);
    lenq = lena-lenb; // potential length of quotient.
    std::size_t m = lenq-1;
// Generate quotient digits from the most significant end downwards; each
// call also subtracts the relevant multiple of b from a and shortens a.
    for (;;)
    {   std::uint64_t qd = next_quotient_digit(a, lena, b, lenb);
// If I am only computing the remainder I do not need to store the quotient
// digit that I have just found.
        if (want_q) q[m] = qd;
        if (m == 0) break;
        m--;
    }
// Undo the scaling on what is left of a: that is the remainder.
    unscale_for_division(a, lena, ss);
// The quotient is correct now but has been computed as an unsigned value
// so if its top digit has its top bit set I need to prepend a zero;
    if (want_q)
    {   if (negative(q[lenq-1])) q[lenq++] = 0;
        else truncate_unsigned(q, lenq);
    }
// The same fix-up applies to the remainder left in a.
    if (negative(a[lena-1])) a[lena++] = 0;
    else truncate_unsigned(a, lena);
}
10041
10042 // Use unsigned_long_division when all that is required is the remainder.
10043 // Here a>b and b is at least 2 words. The code corrupts b and replaces
10044 // a with remainder(a, b).
10045
unsigned_long_remainder(std::uint64_t * a,std::size_t & lena,std::uint64_t * b,std::size_t & lenb)10046 inline void unsigned_long_remainder(std::uint64_t *a,
10047 std::size_t &lena,
10048 std::uint64_t *b, std::size_t &lenb)
10049 { std::size_t w;
10050 unsigned_long_division(a, lena, b, lenb,
10051 false, NULL, w, w);
10052 }
10053
op(std::uint64_t * a,std::uint64_t * b)10054 inline std::intptr_t Quotient::op(std::uint64_t *a, std::uint64_t *b)
10055 { std::size_t lena = number_size(a);
10056 std::size_t lenb = number_size(b);
10057 std::uint64_t *q, *r;
10058 std::size_t olenq, olenr, lenq, lenr;
10059 division(a, lena, b, lenb,
10060 true, q, olenq, lenq,
10061 false, r, olenr, lenr);
10062 return confirm_size(q, olenq, lenq);
10063 }
10064
op(std::uint64_t * a,std::int64_t b)10065 inline std::intptr_t Quotient::op(std::uint64_t *a, std::int64_t b)
10066 { std::size_t lena = number_size(a);
10067 std::uint64_t *q, *r;
10068 std::size_t olenq, olenr, lenq, lenr;
10069 std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
10070 division(a, lena, bb, 1,
10071 true, q, olenq, lenq,
10072 false, r, olenr, lenr);
10073 return confirm_size(q, olenq, lenq);
10074 }
10075
10076 // A fixnum divided by a bignum ought always to yield 0, except that
10077 // maybe -0x8000000000000000} / {0,0x8000000000000000) => -1
10078
op(std::int64_t a,std::uint64_t * b)10079 inline std::intptr_t Quotient::op(std::int64_t a, std::uint64_t *b)
10080 { if (number_size(b)==1 &&
10081 b[0]==-static_cast<std::uint64_t>(a)) return int_to_handle(-1);
10082 return int_to_handle(0);
10083 }
10084
10085 // unpleasantly -0x8000000000000000 / -1 => a bignum
10086
op(std::int64_t a,std::int64_t b)10087 inline std::intptr_t Quotient::op(std::int64_t a, std::int64_t b)
10088 { if (b==-1 && a == MIN_FIXNUM) return int_to_bignum(-a);
10089 else return int_to_handle(a / b);
10090 }
10091
op(std::uint64_t * a,std::uint64_t * b)10092 inline std::intptr_t Remainder::op(std::uint64_t *a, std::uint64_t *b)
10093 { std::size_t lena = number_size(a);
10094 std::size_t lenb = number_size(b);
10095 std::uint64_t *q, *r;
10096 std::size_t olenq, olenr, lenq, lenr;
10097 division(a, lena, b, lenb,
10098 false, q, olenq, lenq,
10099 true, r, olenr, lenr);
10100 return confirm_size(r, olenr, lenr);
10101 }
10102
op(std::uint64_t * a,std::int64_t b)10103 inline std::intptr_t Remainder::op(std::uint64_t *a, std::int64_t b)
10104 { std::size_t lena = number_size(a);
10105 std::uint64_t *q, *r;
10106 std::size_t olenq, olenr, lenq, lenr;
10107 std::uint64_t bb[1] = {static_cast<std::uint64_t>(b)};
10108 division(a, lena, bb, 1,
10109 false, q, olenq, lenq,
10110 true, r, olenr, lenr);
10111 return confirm_size(r, olenr, lenr);
10112 }
10113
op(std::int64_t a,std::uint64_t * b)10114 inline std::intptr_t Remainder::op(std::int64_t a, std::uint64_t *b)
10115 { if (number_size(b)==1 &&
10116 b[0]==-static_cast<std::uint64_t>(a)) return int_to_handle(0);
10117 return int_to_handle(a);
10118 }
10119
op(std::int64_t a,std::int64_t b)10120 inline std::intptr_t Remainder::op(std::int64_t a, std::int64_t b)
10121 { return int_to_handle(a % b);
10122 }
10123
10124
10125
10126 #ifdef LISP
10127
10128 // In LISP mode I provide a function that returns both quotient and
10129 // remainder. In the other two modes I support the same idea but
10130 // as a function that delivers the quotient as its result and saves
10131 // the remainder via an additional argument.
10132
10133 }
10134
10135 namespace arithlib_implementation
10136 {
10137
op(std::uint64_t * a,std::uint64_t * b)10138 inline std::intptr_t Divide::op(std::uint64_t *a, std::uint64_t *b)
10139 { std::size_t lena = number_size(a);
10140 std::size_t lenb = number_size(b);
10141 std::uint64_t *q, *r;
10142 std::size_t olenq, olenr, lenq, lenr;
10143 division(a, lena, b, lenb,
10144 true, q, olenq, lenq,
10145 true, r, olenr, lenr);
10146 std::intptr_t rr = confirm_size(r, olenr, lenr);
10147 std::intptr_t qq = confirm_size_x(q, olenq, lenq);
10148 #ifdef ZAPPA
10149 return cons(qq, rr).v;
10150 #else
10151 return cons(qq, rr);
10152 #endif
10153 }
10154
op(std::uint64_t * a,std::int64_t bb)10155 inline std::intptr_t Divide::op(std::uint64_t *a, std::int64_t bb)
10156 { std::size_t lena = number_size(a);
10157 std::uint64_t *q, *r;
10158 std::size_t olenq, olenr, lenq, lenr;
10159 std::uint64_t b[1] = {static_cast<std::uint64_t>(bb)};
10160 division(a, lena, b, 1,
10161 true, q, olenq, lenq,
10162 true, r, olenr, lenr);
10163 std::intptr_t rr = confirm_size(r, olenr, lenr);
10164 std::intptr_t qq = confirm_size_x(q, olenq, lenq);
10165 #ifdef ZAPPA
10166 return cons(qq, rr).v;
10167 #else
10168 return cons(qq, rr);
10169 #endif
10170 }
10171
op(std::int64_t aa,std::uint64_t * b)10172 inline std::intptr_t Divide::op(std::int64_t aa, std::uint64_t *b)
10173 { std::size_t lenb = number_size(b);
10174 std::uint64_t *q, *r;
10175 std::size_t olenq, olenr, lenq, lenr;
10176 std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
10177 division(a, 1, b, lenb,
10178 true, q, olenq, lenq,
10179 true, r, olenr, lenr);
10180 std::intptr_t rr = confirm_size(r, olenr, lenr);
10181 std::intptr_t qq = confirm_size_x(q, olenq, lenq);
10182 #ifdef ZAPPA
10183 return cons(qq, rr).v;
10184 #else
10185 return cons(qq, rr);
10186 #endif
10187 }
10188
op(std::int64_t aa,std::int64_t bb)10189 inline std::intptr_t Divide::op(std::int64_t aa, std::int64_t bb)
10190 { std::uint64_t *q, *r;
10191 std::size_t olenq, olenr, lenq, lenr;
10192 std::uint64_t a[1] = {static_cast<std::uint64_t>(aa)};
10193 std::uint64_t b[1] = {static_cast<std::uint64_t>(bb)};
10194 division(a, 1, b, 1,
10195 true, q, olenq, lenq,
10196 true, r, olenr, lenr);
10197 std::intptr_t rr = confirm_size(r, olenr, lenr);
10198 std::intptr_t qq = confirm_size_x(q, olenq, lenq);
10199 #ifdef ZAPPA
10200 return cons(qq, rr).v;
10201 #else
10202 return cons(qq, rr);
10203 #endif
10204 }
10205
10206 #else
10207
op(std::uint64_t * a,std::uint64_t * b,std::intptr_t & rem)10208 inline std::intptr_t Divide::op(std::uint64_t *a, std::uint64_t *b,
10209 std::intptr_t &rem)
10210 { std::size_t lena = number_size(a);
10211 std::size_t lenb = number_size(b);
10212 std::uint64_t *q, *r;
10213 std::size_t olenq, olenr, lenq, lenr;
10214 division(a, lena, b, lenb,
10215 true, q, olenq, lenq,
10216 true, r, olenr, lenr);
10217 rem = confirm_size(r, olenr, lenr);
10218 return confirm_size_x(q, olenq, lenq);
10219 }
10220
10221 #endif
10222
10223 // a = a - b*q.
10224
// Compute a = a - q*b in place, where q is a single digit, returning
// true if the result went negative (ie q was an overshoot). Used by the
// gcd code; the assertion records that a is at most one digit longer
// than b here.
inline bool reduce_for_gcd(std::uint64_t *a, std::size_t lena,
                           std::uint64_t q,
                           std::uint64_t *b, std::size_t lenb)
{   arithlib_assert(lena == lenb || lena == lenb+1);
    std::uint64_t hi = 0, hi1, lo, borrow = 0;
    for (std::size_t i=0; i<lenb; i++)
    {   multiply64(b[i], q, hi1, lo);
// hi carries the high half of the previous digit product down the chain
// while borrow propagates the subtraction itself.
        hi1 += subtract_with_borrow(a[i], hi, a[i]);
        borrow = subtract_with_borrow(a[i], lo, borrow, a[i]);
        hi = hi1;
    }
// In the cases where this is used the difference |a - q*b| should be
// less than a. Well if q was computed accurately then it will be less
// than b. And if q is large it will at least be much less than a. So I
// am confident that testing the top bit of a[lena-1] after the subtraction
// will be a reliable test for overshoot. I might want to formalize this
// argument a bit better!
    if (lena > lenb) a[lena-1] = a[lena-1] - hi - borrow;
    return negative(a[lena-1]);
}
10245
// The next function performs a = a - b*(q<<shift), but
// it computes it more as a = a - (b<<shift)*q.
// It will be used with 0 < shift < 64, ie only when a genuine shift
// between digits is required.
inline bool shifted_reduce_for_gcd(std::uint64_t *a, std::size_t lena,
                                   std::uint64_t q,
                                   std::uint64_t *b, std::size_t lenb,
                                   int shift)
{   arithlib_assert(lena == lenb+1 || lena == lenb+2);
    std::uint64_t hi = 0, hi1, lo, borrow = 0;
// shifted_digit() reads digit i of the shifted divisor, so this loop runs
// one digit further (i <= lenb) than plain reduce_for_gcd() does.
    for (std::size_t i=0; i<=lenb; i++)
    {   multiply64(shifted_digit(b, lenb, shift, i), q, hi1, lo);
        hi1 += subtract_with_borrow(a[i], hi, a[i]);
        borrow = subtract_with_borrow(a[i], lo, borrow, a[i]);
        hi = hi1;
    }
// In the cases where this is used the difference |a - q*b| should be
// less than a. Well if q was computed accurately then it will be less
// than b. And if q is large it will at least be much less than a. So I
// am confident that testing the top bit of a[lena-1] after the subtraction
// will be a reliable test for overshoot. I might want to formalize this
// argument a bit better!
    if (lena > lenb+1) a[lena-1] = a[lena-1] - hi - borrow;
    return negative(a[lena-1]);
}
10271
10272 // Here we compute r = u*a - v*b, where lenr >= min(lena, lenb). This
10273 // is for use in Lehmer reductions.
10274 // In general this will be used as in
10275 // ua_minus_vb(a, u1, b, v1, temp);
10276 // ua_minus_vb(a, u2, b, v2, a);
10277 // copy from temp to b
10278 // so note that the destination may be the same vector as one of the inputs.
10279 // This will only be used when a and b are almost the same length. I leave
10280 // a result of length lena even though I very much expect that in at least
10281 // almost all cases the result will be almost 128 bits smaller!
10282
// Compute r = u*a - v*b, for use in Lehmer reductions. a must have the
// same length as b or be one digit longer; the result is given lena (or
// lenb) digits as appropriate and lenr is set to match. Returns true if
// u*a - v*b was negative, in which case r holds its twos-complement.
// Note that r may be the same vector as one of the inputs.
inline bool ua_minus_vb(std::uint64_t *a, std::size_t lena,
                        std::uint64_t u,
                        std::uint64_t *b, std::size_t lenb,
                        std::uint64_t v,
                        std::uint64_t *r, std::size_t &lenr)
{   arithlib_assert(lena == lenb || lena == lenb+1);
    std::uint64_t hia, loa, ca = 0, hib, lob, cb = 0, borrow = 0;
    for (std::size_t i=0; i<lenb; i++)
    {   multiply64(a[i], u, hia, loa);
// hia is the high part of a product so carrying 1 into it can not cause it
// to overflow. Just!
        hia += add_with_carry(loa, ca, loa);
        multiply64(b[i], v, hib, lob);
        hib += add_with_carry(lob, cb, lob);
        borrow = subtract_with_borrow(loa, lob, borrow, r[i]);
        ca = hia;
        cb = hib;
    }
    lenr = lenb;
// I want to report whether u*a-v*b was negative. To do that I will
// first note that the result that I am computing should be less than the
// value of a, so I do not get too much messy overflow. I will look at the
// borrow out from the top word of the result.
    if (lena > lenb)
    {   // One extra digit of a (with no matching digit of b) remains.
        multiply64(a[lena-1], u, hia, loa);
        hia += add_with_carry(loa, ca, loa);
        borrow = subtract_with_borrow(loa, cb, borrow, r[lena-1]);
        lenr = lena;
        return negative(hia - borrow);
    }
    return negative(ca - cb - borrow);
}
10315
10316 // Since the code here is quite short I will also provide a version
10317 // for r = -u*a + b*v;
10318 // Again this supposes that a is at least as long as b.
10319
// Companion to ua_minus_vb: compute r = -u*a + v*b. Again a must have
// the same length as b or be one digit longer. Returns true if the
// result was negative, in which case r holds its twos-complement.
inline bool minus_ua_plus_vb(std::uint64_t *a, std::size_t lena,
                             std::uint64_t u,
                             std::uint64_t *b, std::size_t lenb,
                             std::uint64_t v,
                             std::uint64_t *r, std::size_t &lenr)
{   arithlib_assert(lena == lenb || lena == lenb+1);
    std::uint64_t hia, loa, ca = 0, hib, lob, cb = 0, borrow = 0;
    for (std::size_t i=0; i<lenb; i++)
    {   multiply64(a[i], u, hia, loa);
// hia is the top half of a product, so adding in a carry bit can not
// overflow it.
        hia += add_with_carry(loa, ca, loa);
        multiply64(b[i], v, hib, lob);
        hib += add_with_carry(lob, cb, lob);
        borrow = subtract_with_borrow(lob, loa, borrow, r[i]);
        ca = hia;
        cb = hib;
    }
    lenr = lenb;
    if (lena > lenb)
    {   // Process the final digit of a, which has no counterpart in b.
        multiply64(a[lena-1], u, hia, loa);
        hia += add_with_carry(loa, ca, loa);
        borrow = subtract_with_borrow(cb, loa, borrow, r[lena-1]);
        lenr = lena;
// It will be perfectly reasonable for hia to be zero and borrow to be zero
// and hence the overall result positive.
        return negative(- hia - borrow);
    }
    return negative(cb - ca - borrow);
}
10348
10349 // gcd_reduction starts with a > b and |b| >=2. It must reset a and
10350 // b (and their lengths) to be smaller. The basic Euclidean algorithm
10351 // would go
10352 // a = a % b; // otherwise a = a-q*b; for some useful value of q
10353 // // and then if q was "too large" go a = |a|;
10354 // swap(a, b);
10355 // but a Lehmer-style scheme can go distinctly faster overall.
10356
// Perform one reduction step on the pair (a, b), which on entry satisfies
// a > b with b at least 2 digits long. Either a Lehmer-style step (working
// on 128-bit surrogates for a and b) or a single "a = a - q*b" style
// reduction is applied, leaving the pair strictly smaller. The vectors and
// all the length/size reference parameters are updated in place; temp is a
// workspace vector allocated lazily on first need.
inline void gcd_reduction(std::uint64_t *&a, std::size_t &lena,
                          std::uint64_t *&b, std::size_t &lenb,
                          std::size_t &olena, std::size_t &olenb,
                          std::uint64_t *&temp, std::size_t &lentemp)
{
// I will start by collecting high bits from a and b. If I collect the
// contents of the top 3 words (ie 192 bits in all) then I will be able
// to normalize that to get 128 bits to work with however the top bits
// of a and b lie within the words.
    std::uint64_t a0=a[lena-1], a1=a[lena-2], a2=(lena>2 ? a[lena-3] : 0);
    int lza = nlz(a0);
    std::uint64_t b0=b[lenb-1], b1=b[lenb-2], b2=(lenb>2 ? b[lenb-3] : 0);
    int lzb = nlz(b0);
// I will sort out how many more bits are involved in a than in b. If
// this number is large I will invent a number q of the form q=q0*2^q1
// with q0 using almost all of 64 bits and go "a = a - q*b;". This
// will involve "virtually shifting" b left by q1 bits so what I actually
// do is "a = a - q0*(b<<q1);". It will be obvious that the idea is that
// q should be chosen so that the new value of a is as small as possible.
// Given that q will be an estimate for the correct quotient and so may
// occasionally be incorrect I will allow that it might in fact be
// too large. In such a case the value of a computed will end up negative,
// in which case I do a final step that goes "a = -a;" to fix that. If I
// manage to make q a round-to-nearest approximation to the quotient this
// might happen a significant fraction of the time, ideally getting me
// 1 extra bit in reduction in the size of a for each step.
// If the estimated quotient is accurate enough then this will leave
// a < b and by swapping a and b we have a new pair ready to continue from.
    std::int64_t diff = 64*(lena-lenb) + lzb - lza;
// If however the length-difference between a and b is small a subtraction
// "a = a - q0*(b<<0);" would often find q0 rather small and completing
// the remainder sequence would take many steps. So in such cases I take
// the top 128 bits of a and (128-diff) bits from b and start forming
// a remainder sequence using 128-bit arithmetic until a term in it
// fits in 64-bits. If the last 2 terms in that remainder sequence are
// p and q (with p having >64 bits and q <= 64 bits) I can have
// p = |Ua - Vb|, q = |-Wa + Xb|.
// where U, V, W and X should all fit in 64-bits. That gives me a new
// pair of values - expected to be up to 128-bits shorter - to continue
// my remainder sequence. Because my stopping condition for the
// approximate remainder sequence is not guaranteed perfect I can not
// be certain that q < p, so I will need to compare the values and
// swap as appropriate.
//
    if (diff < 60)
    {
// Try for Lehmer. The pair of values that will be 2-word surrogates
// for a and b here will be the top 128 bits of a and however many bits of
// b align with that. However if a has only 2 digits then I must NOT shift it
// left, because that would make it seem to have a power of 2 as a factor
// beyond any real such factors.
// It could be that lenb < lena, but because a and b differ in length
// by at most 60 bits in that case lenb==lena-1. So adjust values so as to
// align.
        if (lena != lenb)
        {   b2 = b1;
            b1 = b0;
            b0 = 0;
        }
        if (lena > 2)
        {   // Normalize both surrogates by a's leading zero count.
            a0 = a0<<lza;
            if (lza!=0) a0 |= (a1>>(64-lza));
            a1 = a1<<lza;
            if (lza!=0) a1 |= (a2>>(64-lza));
            b0 = b0<<lza;
            if (lza!=0) b0 |= (b1>>(64-lza));
            b1 = b1<<lza;
            if (lza!=0) b1 |= (b2>>(64-lza));
        }
// I will maintain an identity
//       a = ua*A + va*B
//       b = ub*A + vb*B
// where A and B are the initial values in my remainder sequence and a and b
// are working ones calculated along the way. Note horribly well here that
// I am keeping these values as signed... but the code I have above that
// calculates u*a-b*v will take unsigned inputs!
        std::int64_t ua = 1, va = 0, ub = 0, vb = 1;
        while (b0!=0 || b1!=0)
        {   std::uint64_t q;
// Here I want to set q = {a0,a1}/{b0,b1}, and I expect that the answer
// is a reasonably small integer. But it could potentially be huge.
// At least I have filtered away the possibility {b0,b1}={0,0}.
// I will grab the top 64 bits of a and the top corresponding bits of b,
// because then I can do a (cheap) 64-by-64 division.
            int lza1 = a0==0 ? 64+nlz(a1) : nlz(a0);
            int lzb1 = b0==0 ? 64+nlz(b1) : nlz(b0);
            if (lzb1 > lza1+60) break; // quotient will be too big
            std::uint64_t ahi, bhi;
            if (lza1 == 0) ahi = a0;
            else if (lza1 < 64) ahi = (a0<<lza1) | (a1>>(64-lza1));
            else if (lza1 == 64) ahi = a1;
            else ahi = a1<<(lza1-64);
            if (lza1 == 0) bhi = b0;
            else if (lza1 < 64) bhi = (b0<<lza1) | (b1>>(64-lza1));
            else if (lza1 == 64) bhi = b1;
            else bhi = b1<<(lza1-64);
            if (bhi == 0) break;
// q could end up an over-estimate for the true quotient because bhi has
// been truncated and so under-represents b. If that happens then a-q*b will
// end up negative.
            q = ahi/bhi;
            if (negative(q)) break;
            arithlib_assert(q != 0);
// Now I need to go
//     ua -= q*va;
//     ub -= q*vb;
//     {a0,a1} -= q*{b0,b1}
// Then if a is negative I will negate a and ua and ub.
// Finally, if (as I mostly expect) now a<b I swap a<=>b, ua<=>ub and va<=>vb
// If I would get an overflow in updating ua or ub I will break out of the
// loop.
            std::int64_t h;
            std::uint64_t l1, l2;
            signed_multiply64(q, va, h, l1);
            if (static_cast<std::uint64_t>(h) + (l1>>63) != 0) break;
// There could be overflow in the following subtraction... So I check
// if that was about to happen and break out of the loop if so.
            if (ua >= 0)
            {   if (ua - INT64_MAX > static_cast<std::int64_t>(l1)) break;
            }
            else if (ua - INT64_MIN < static_cast<std::int64_t>(l1)) break;
            signed_multiply64(q, vb, h, l2);
            if (static_cast<std::uint64_t>(h) + (l2>>63) != 0) break;
            if (ub >= 0)
            {   if (ub - INT64_MAX > static_cast<std::int64_t>(l2)) break;
            }
            else if (ub - INT64_MIN < static_cast<std::int64_t>(l2)) break;
// I must either update both or neither of ua, ub.
            ua -= l1;
            ub -= l2;
            std::uint64_t hi, lo;
            multiply64(q, b1, hi, lo);
            hi += subtract_with_borrow(a1, lo, a1);
            std::uint64_t borrow = subtract_with_borrow(a0, hi, a0);
            borrow += subtract_with_borrow(a0, q*b0, a0);
// Now borrow!=0 if a had become negative
            if (borrow != 0)
            {   // Negate the 128-bit value {a0,a1} and flip ua, ub to match.
                if ((a1 = -a1) == 0) a0 = -a0;
                else a0 = ~a0;
                ua = -ua;
                ub = -ub;
            }
            if (b0 > a0 ||
                (b0 == a0 && b1 > a1))
            {   std::swap(a0, b0);
                std::swap(a1, b1);
                std::swap(ua, va);
                std::swap(ub, vb);
            }
        }
// Ahah now I am almost done. I want to go
//    a' = |ua*a + ub*b|;
//    b' = |va*a + vb*b|;
//    if (a' > b') [a, b] = [a', b'];
//    else [a, b] = [b', a'];
// and in the first two lines I need to be aware that one or the other
// (but not both) of ua and ub will be negative so I really have a subtraction,
// and similarly for va, vb.
        if (temp == NULL)
        {   push(a);
            push(b);
            temp = reserve(lena>lenb ? lena : lenb);
            pop(b);
            pop(a);
        }
        if (ub < 0)
        {   arithlib_assert(ua >= 0);
            if (ua_minus_vb(a, lena, ua, b, lenb, -ub, temp, lentemp))
                internal_negate(temp, lentemp, temp);
        }
        else
        {   arithlib_assert(ua <= 0);
            if (minus_ua_plus_vb(a, lena, -ua, b, lenb, ub, temp, lentemp))
                internal_negate(temp, lentemp, temp);
        }
        truncate_unsigned(temp, lentemp);
        if (vb < 0)
        {   arithlib_assert(va >= 0);
            if (ua_minus_vb(a, lena, va, b, lenb, -vb, a, lena))
                internal_negate(a, lena, a);
        }
        else
        {   arithlib_assert(va <= 0);
            if (minus_ua_plus_vb(a, lena, -va, b, lenb, vb, a, lena))
                internal_negate(a, lena, a);
        }
        truncate_unsigned(a, lena);
        internal_copy(temp, lentemp, b);
        lenb = lentemp;
        return;
    }
// If I drop through to here I will do a simple reduction. This happens
// either if the initial quotient a/b is huge (over 2^60) or if as I start
// setting up for the Lehmer step I find I can not make enough progress
// with that to be useful. For instance if the next two steps would have
// q=1 and then q=<huge> I can not combine in the huge step to make Lehmer
// style progress and I should drop down and do the "q=1" reduction first
// (followed by the next huge one).
//
// This is the "a = a - q*b;" case.
// Collect the top 128 bits of both a and b.
    b0 = b0<<lzb;
    if (lzb!=0) b0 |= (b1>>(64-lzb));
    b1 = b1<<lzb;
    if (lzb!=0) b1 |= (b2>>(64-lzb));
    a0 = a0<<lza;
    if (lza!=0) a0 |= (a1>>(64-lza));
    a1 = a1<<lza;
    if (lza!=0) a1 |= (a2>>(64-lza));
    a2 = a2<<lza;
// When I have done this b0 will have its top bit set and I will
// want to have a0<b0 because I will be dividing {a0,a1}/b0 and I want the
// quotient to fit within a single 64-bit word.
    if (a0 >= b0)
    {   a2 = (a2>>1) | (a1<<63);
        a1 = (a1>>1) | (a0<<63);
        a0 = a0>>1;
        lza = lza-1;
        diff = diff+1;
    }
    std::uint64_t q, r;
// I want to get as close an approximation to the full quotient as I can,
// and a "correction" of the form {a0,a1} -= a0*b1/b0 should do the trick.
    multiply64(a0, b1, q, r);
    divide64(q, r, b0, q, r);
    r = a1 - q;
    if (r > a1) a0--;
    a1 = r;
    divide64(a0, a1, b0, q, r);
// Now I want to go "a = a - q*b*2^(diff-64);". The "-64" there is because
// the quotient I computed in q is essentially to be viewed as a fraction.
// So if diff<=64 I will need to do something special.
    if (diff <= 64)
    {   std::size_t bits_to_lose = 64 - diff;
// I will shift q right, but doing so in such a way that I try to round to
// nearest.
        if (bits_to_lose != 0)
        {   q = q >> (bits_to_lose-1);
            q = (q >> 1) + (q & 1);
        }
// Now just do "a = a-q*b;", then ensure that the result is positive
// and clear away any leading zeros left in its representation.
        if (reduce_for_gcd(a, lena, q, b, lenb))
            internal_negate(a, lena, a);
        truncate_unsigned(a, lena);
    }
    else
    {
// Here I need to do a reduction but the quotient in the step is very large
// so I will use the value of q I have as basically the top 60+ bits of the
// quotient I need but with "diff" bits stuck on the end. If diff is a
// multiple of 64 then this is merely a shift by some whole number of words.
        if ((diff%64) == 0)
        {   std::size_t diffw = diff/64;
            if (reduce_for_gcd(a+diffw-1, lena+1-diffw, q, b, lenb))
                internal_negate(a, lena, a);
            truncate_unsigned(a, lena);
        }
        else
        {   std::size_t diffw = diff/64;
            diff = diff%64;
            if (shifted_reduce_for_gcd(a+diffw-1, lena+1-diffw,
                                       q, b, lenb, diff))
                internal_negate(a, lena, a);
            truncate_unsigned(a, lena);
        }
    }
}
10625
10626 // A bit of stray commentary here:
10627 // The simplest GCD scheme is direct Euclidean with the central loop
10628 // being
10629 // q = a/b;
10630 // {a, b} = {b, a - q*b};
10631 // There are those who observe that on average the quotient q will be
10632 // small, so they replace this with
10633 // {a, b} = {a, a - b};
10634 // swap a and b if necessary so that a>=b.
10635 // This takes more steps but each is a subtraction not a division/remainder
10636 // operation and so might sometimes be a win.
10637 // A "least-remainder" scheme is
10638 // q = (a + b/2)/b;
10639 // {a, b} = {b, |a - q*b|};
10640 // where the calculation of q just means round the quotient to nearest
10641 // rather than truncate it towards zero. At the cost of the extra absolute
10642 // value calculation this will reduce the number of steps. I believe that
10643 // using the Euclidean scheme each step shrinks the inputs by an average of
10644 // about 1.7 bits, while the least remainder scheme shrinks values by
10645 // 2.4 or 2.5 bits per step, ie it saves around 30% of the steps, albeit at
10646 // the cost of some absolute value calculations, which could go some way to
10647 // balance out the savings.
10648 // The quotient q will in general be small. In the case where it is very large
10649 // then calculating it becomes tedious. So in such cases it will make sense
10650 // to calculate a leading-digit approximation to it and reduce using that. A
// step of that nature would be essentially what was happening in long division
10652 // anyway, but now if the guessed quotient is not perfect all will be well
10653 // because subsequent reduction steps will correct for it automatically.
// A Lehmer-style scheme will be useful when the first several quotients in a
10655 // sequence will all be small - it consolidates big-number arithmetic over
10656 // what are logically multiple individual reduction steps.
10657
10658
// Greatest common divisor of two bignums (raw digit vectors). The result
// is a freshly allocated handle for |gcd(a, b)|, which is never negative.
inline std::intptr_t Gcd::op(std::uint64_t *a, std::uint64_t *b)
{   // A one-word b reduces to the long-by-short case.
    if (number_size(b) == 1) return Gcd::op(a,
                                                static_cast<std::int64_t>(b[0]));
// I will start by making copies of |a| and |b| that I can overwrite
// during the calculation and use part of in my result.
    std::size_t lena = number_size(a), lenb = number_size(b);
    std::size_t olena = lena, olenb = lenb;
    if (olena == olenb &&
// See comments later for an explanation of this!
        negative(a[lena-1]) && negative(b[lenb-1]) &&
        a[lena-1] == b[lenb-1]) olena++;
    push(a); push(b);
    std::uint64_t *av = reserve(olena);
    pop(b); pop(a);
    if (negative(a[lena-1])) internal_negate(a, lena, av);
    else internal_copy(a, lena, av);
    push(av); push(b);
    std::uint64_t *bv = reserve(olenb);
    pop(b); pop(av);
    if (negative(b[lenb-1])) internal_negate(b, lenb, bv);
    else internal_copy(b, lenb, bv);
    a = av;
    b = bv;
    if (big_unsigned_greaterp(b, lenb, a, lena))
    {   std::swap(a, b);
        std::swap(lena, lenb);
        std::swap(olena, olenb);
    }
// Now a >= b and both numbers are in freshly allocated memory. I will
// remember the sizes of these two arrays.
// Remove any leading zero digits, and if that reduces the situation to
// a 1-word case handle that specially..
    if (b[lenb-1] == 0) lenb--;
    if (a[lena-1] == 0) lena--;
    if (lenb == 1)
    {   std::uint64_t bb = b[0];
        std::uint64_t hi = 0, q;
        // Long-by-short remainder: hi ends up as a mod bb.
        for (std::size_t i=lena-1;; i--)
        {   divide64(hi, a[i], bb, q, hi);
            if (i == 0) break;
        }
        // Finish with a word-sized Euclidean loop.
        while (hi != 0)
        {   std::uint64_t cc = bb % hi;
            bb = hi;
            hi = cc;
        }
        abandon(a);
        abandon(b);
        return unsigned_int_to_bignum(bb);
    }
// In some cases performing a reduction will require a workspace vector.
// I will only allocate this as and when first needed.
    std::uint64_t *temp = NULL;
    std::size_t lentemp = lena;
// Now at last a and b are genuine unsigned vectors without leading digits
// and with a > b. The next line is the key iteration in this whole procedure.
    while (lenb != 1)
    {   gcd_reduction(a, lena, b, lenb, olena, olenb, temp, lentemp);
        if (big_unsigned_greaterp(b, lenb, a, lena))
        {   std::swap(a, b);
            std::swap(lena, lenb);
            std::swap(olena, olenb);
        }
    }
    if (temp != NULL) abandon(temp);
// One possibility is that b==0 and then a holds the GCD. There is a
// pathological case where an input was -2^(64*n-1), which fits within n
// words, and the GCD ends up as +2^(64*n-1) which needs an extra word.
// If the other input had been bigger I can copy my result into it and
// survive.. the very messy situation would be if both inputs were
// -2^(64*n-1) so had I worked in the obvious way I would not have enough
// space for the result. To allow for this I arrange that if both inputs
// start off the same size (and ideally I would check if both had a value
// of the form -2^(64*n-1), but doing that check is probably more expensive
// than occasionally over-allocating memory!) I enlarge one of the inputs by
// one word.
    if (b[0] == 0)
    {   if (negative(a[lena-1]))
        {   if (lena == olena)
            {   internal_copy(a, lena, b);
                abandon(a);
                a = b;
                olena = olenb;
            }
            else abandon(b);
#ifdef DEBUG_OVERRUN
            if (debug_arith)
            {   arithlib_assert(a[olena] == 0xaaaaaaaaaaaaaaaaU);
            }
#endif
            // Append a zero digit so the top bit no longer reads as a sign.
            a[lena++] = 0;
#ifdef DEBUG_OVERRUN
            if (debug_arith)
            {   arithlib_assert(a[olena] == 0xaaaaaaaaaaaaaaaaU);
            }
#endif
        }
        else abandon(b);
        return confirm_size(a, olena, lena);
    }
// If b is not zero here then it represents a value up to 2^64-1, and I can
// complete the GCD by doing a long-by-short remainder and then a short-num
// GCD...
    std::uint64_t bb = b[0];
    abandon(b);
    std::uint64_t hi = 0, q;
    for (std::size_t i=lena-1;; i--)
    {   divide64(hi, a[i], bb, q, hi);
        if (i == 0) break;
    }
    abandon(a);
    while (hi != 0)
    {   std::uint64_t cc = bb % hi;
        bb = hi;
        hi = cc;
    }
    return unsigned_int_to_bignum(bb);
}
10777
op(std::uint64_t * a,std::int64_t bb)10778 inline std::intptr_t Gcd::op(std::uint64_t *a, std::int64_t bb)
10779 {
10780 // This case involved doing a long-by-short remainder operation and then
10781 // it reduces to the small-small case. The main problem is the handling of
10782 // negative inputs.
10783 std::uint64_t b = bb < 0 ? -bb : bb;
10784 std::size_t lena = number_size(a);
10785 bool signa = negative(a[lena-1]);
10786 std::uint64_t hi = 0, q;
10787 for (std::size_t i=lena-1;; i--)
10788 { divide64(hi, (signa ? ~a[i] : a[i]), b, q, hi);
10789 if (i == 0) break;
10790 }
10791 // Now if a had been positive we have hi=a%b. If a had been negative we
10792 // have (~a)%b == (-a-1)%b which is about |a|%b -1
10793 if (signa) hi = (hi+1)%b;
10794 return Gcd::op(b, hi);
10795 }
10796
// gcd(small, big): gcd is symmetric, so just commute the arguments.
inline std::intptr_t Gcd::op(std::int64_t a, std::uint64_t *b)
{   return Gcd::op(b, a);
}
10800
op(std::int64_t a,std::int64_t b)10801 inline std::intptr_t Gcd::op(std::int64_t a, std::int64_t b)
10802 {
10803 // Take absolute values of both arguments.
10804 std::uint64_t aa = a < 0 ? -static_cast<std::uint64_t>(a) : a;
10805 std::uint64_t bb = b < 0 ? -static_cast<std::uint64_t>(b) : b;
10806 // Ensure that aa >= bb
10807 if (bb > aa) std::swap(aa, bb);
10808 // Do simple Euclidean algorithm
10809 while (bb != 0)
10810 { std::uint64_t cc = aa % bb;
10811 aa = bb;
10812 bb = cc;
10813 }
10814 // A messy case is gcd(-MIX_FIXNUM, MIN_FIXNUM) which yields -MIN_FIXNUM
10815 // which is liable to be MAX_FIXNUM+1 and so has to be returned as a bignum.
10816 return unsigned_int_to_bignum(aa);
10817 }
10818
10819 // I think I have space-leaks within my code for LCM. For use in
10820 // a system with garbage collection that will not matter (hoorah) but at
10821 // some stage I need to come back here and look harder and tidy things up.
10822
// lcm of two bignums, computed as |a * (b / gcd(a, b))|. The push/pop
// pairs protect the raw vectors across the nested operations, which may
// allocate (the same discipline used around reserve() elsewhere).
inline std::intptr_t Lcm::op(std::uint64_t *a, std::uint64_t *b)
{   push(a); push(b);
    std::intptr_t g = Gcd::op(a, b);
    pop(b);
    std::intptr_t q = op_dispatch2<Quotient,std::intptr_t>
                      (vector_to_handle(b),
                       g);
    pop(a);
    q = op_dispatch2<Times,std::intptr_t>(vector_to_handle(a), q);
    return op_dispatch1<Abs,std::intptr_t>(q);
}
10834
// lcm of a bignum and a fixnum: divide b by the gcd (exactly) and then
// multiply the quotient back by a, taking the absolute value at the end.
inline std::intptr_t Lcm::op(std::uint64_t *a, std::int64_t b)
{   push(a);
    std::intptr_t g = Gcd::op(a, b);
    std::intptr_t q = op_dispatch2<Quotient,std::intptr_t>(int_to_handle(
                          b), g);
    pop(a);
    q = op_dispatch2<Times,std::intptr_t>(vector_to_handle(a), q);
    return op_dispatch1<Abs,std::intptr_t>(q);
}
10844
// lcm(small, big): lcm is symmetric, so commute to the (big, small) case.
inline std::intptr_t Lcm::op(std::int64_t a, std::uint64_t *b)
{   return Lcm::op(b, a);
}
10848
// lcm of two fixnums, as |a * (b / gcd(a, b))|.
inline std::intptr_t Lcm::op(std::int64_t a, std::int64_t b)
{   std::intptr_t g = Gcd::op(a, b);
// The GCD can only be a bignum if a = b = MIN_FIXNUM.
    if (stored_as_fixnum(g))
    {   b = b / int_of_handle(g);
        std::intptr_t q = Times::op(a, b); // possibly a bignum now
        return op_dispatch1<Abs,std::intptr_t>(q);
    }
// Here a = b = MIN_FIXNUM, so the lcm is -MIN_FIXNUM which overflows the
// fixnum range and must be delivered as a bignum.
    else return unsigned_int_to_bignum(-static_cast<std::uint64_t>
                                           (MIN_FIXNUM));
}
10860
10861 #ifdef CSL
10862 // Support for calculations modulo some integer value...
10863
10864 // While initially developing this bit of code I will assume C++17 and
10865 // hence that inline variables are supported.
10866
// Tags recording how the current modulus is stored: small enough for
// 32 bits, needing 64 bits, or a full bignum.
static const int modulus_32 = 0;
static const int modulus_64 = 1;
static const int modulus_big = 2;
// On Windows these thread-locals may introduce serious overhead. I
// will worry about that later if needbe.
thread_local inline int modulus_size = 0;
thread_local inline std::uint64_t small_modulus = 2;
// Storage for a bignum modulus (its header word followed by its digits).
thread_local inline std::vector<std::uint64_t>
large_modulus_vector;
10876
// Return a pointer to the digits of the stored large modulus. The vector
// holds a header word first (SetModulus copies from &n[-1]), so stepping
// past it yields a pointer laid out like any other bignum digit vector.
inline std::uint64_t *large_modulus()
{   return 1 + reinterpret_cast<std::uint64_t *>
               (large_modulus_vector.data());
}
10881
// Return a handle for the current modulus: a fixnum when the modulus is
// small, otherwise a freshly allocated copy of the stored bignum.
inline std::intptr_t value_of_current_modulus()
{   if (modulus_size == modulus_big)
    {   std::size_t n = number_size(large_modulus());
        std::uint64_t *r = reserve(n);
        std::memcpy(r, large_modulus(), n*sizeof(std::uint64_t));
        return confirm_size(r, n, n);
    }
    else return int_to_handle(small_modulus);
}
10891
op(std::int64_t n)10892 inline std::intptr_t SetModulus::op(std::int64_t n)
10893 { if (n <= 1) return (std::intptr_t)aerror1("Invalid arg to set-modulus",
10894 int_to_handle(n));
10895 std::intptr_t r = value_of_current_modulus();
10896 small_modulus = n;
10897 if (n <= 0xffffffffU) modulus_size = modulus_32;
10898 else modulus_size = modulus_64;
10899 return r;
10900 }
10901
op(std::uint64_t * n)10902 inline std::intptr_t SetModulus::op(std::uint64_t *n)
10903 { if (Minusp::op(n))
10904 return (std::intptr_t)aerror1("Invalid arg to set-modulus", vector_to_handle(n));
10905 std::intptr_t r = value_of_current_modulus();
10906 std::size_t lenn = number_size(n);
10907 std::size_t bytes = (lenn+1)*sizeof(std::uint64_t);
10908 if (bytes > large_modulus_vector.size())
10909 large_modulus_vector.resize(bytes);
10910 std::memcpy(large_modulus_vector.data(), &n[-1], bytes);
10911 modulus_size = modulus_big;
10912 return r;
10913 }
10914
// Reduce a signed fixnum to its residue in [0, modulus).
inline std::intptr_t ModularNumber::op(std::int64_t a)
{   if (a >= 0)
    {   // A non-negative fixnum is already reduced against a big modulus;
        // otherwise take its remainder directly.
        if (modulus_size == modulus_big) return int_to_handle(a);
        else return int_to_handle(a % small_modulus);
    }
// Negative input: fold the value up into range by adding the modulus.
    if (modulus_size == modulus_big) return Plus::op(large_modulus(), a);
    else
    {   a = a % small_modulus;
        if (a < 0) a += small_modulus;
        return int_to_handle(a);
    }
}
10927
// Reduce a bignum to its residue modulo the (large) current modulus.
inline std::intptr_t ModularNumber::op(std::uint64_t *a)
{   if (Minusp::op(a))
    {   std::intptr_t r = Remainder::op(a, large_modulus());
        // A negative remainder is moved into range by adding the modulus.
        if (Minusp::op(r))
        {   std::intptr_t r1 = op_dispatch1<Plus,std::intptr_t>(r,
                               large_modulus());
            abandon(r);
            return r1;
        }
        else return r;
    }
    else return Remainder::op(a, large_modulus());
}
10941
op(std::int64_t a,std::int64_t b)10942 inline std::intptr_t ModularPlus::op(std::int64_t a, std::int64_t b)
10943 { std::uint64_t ua = a, ub = b;
10944 // Because a and b are 64-bit signed values and they should be positive,
10945 // their sum will fit within a 64-bit unsigned integer, but if the modulus
10946 // is large it could be just a 1-word bignum...
10947 if (modulus_size == modulus_big)
10948 { std::uint64_t r = ua + ub;
10949 if (number_size(large_modulus()) == 1 &&
10950 r >= large_modulus()[0]) r -= large_modulus()[0];
10951 return unsigned_int_to_bignum(r);
10952 }
10953 std::uint64_t r = ua + ub;
10954 if (r >= small_modulus) r -= small_modulus;
10955 return int_to_handle(static_cast<std::int64_t>(r));
10956 }
10957
// Modular sum where one operand is a bignum.
inline std::intptr_t ModularPlus::op(std::int64_t a, std::uint64_t *b)
{
// One of the inputs here is a bignum, and that can only be valid if we
// have a large modulus.
    if (modulus_size != modulus_big) return (std::intptr_t)aerror1("bad arg for modular-plus",
                vector_to_handle(b));
    std::intptr_t r = Plus::op(a, b);
// The sum of two reduced values is below twice the modulus, so at most
// one subtraction brings it back into range.
    if (op_dispatch1<Geq,bool>(r, large_modulus()))
    {   std::intptr_t r1 = op_dispatch1<Difference,std::intptr_t>(r,
                           large_modulus());
        abandon(r);
        return r1;
    }
    else return r;
}
10973
// Addition commutes, so delegate to the (small, big) case.
inline std::intptr_t ModularPlus::op(std::uint64_t *a, std::int64_t b)
{   return ModularPlus::op(b, a);
}
10977
// Modular sum of two bignums, valid only when the modulus is big.
inline std::intptr_t ModularPlus::op(std::uint64_t *a,
                                     std::uint64_t *b)
{   if (modulus_size != modulus_big)
        return (std::intptr_t)aerror1("bad arg for modular-plus",
                                      vector_to_handle(a));
    std::intptr_t r = Plus::op(a, b);
// One conditional subtraction suffices since both inputs were reduced.
    if (op_dispatch1<Geq, bool>(r, large_modulus()))
    {   std::intptr_t r1 = op_dispatch1<Difference,std::intptr_t>(r,
                           large_modulus());
        abandon(r);
        return r1;
    }
    else return r;
}
10992
op(std::int64_t a,std::int64_t b)10993 inline std::intptr_t ModularDifference::op(std::int64_t a,
10994 std::int64_t b)
10995 { if (a >= b) return int_to_handle(a - b);
10996 if (modulus_size == modulus_big) return Plus::op(large_modulus(),
10997 a - b);
10998 else return int_to_handle(small_modulus - b + a);
10999 }
11000
op(std::int64_t a,std::uint64_t * b)11001 inline std::intptr_t ModularDifference::op(std::int64_t a,
11002 std::uint64_t *b)
11003 { if (modulus_size != modulus_big)
11004 return (std::intptr_t)aerror1("bad arg for modular-plus",
11005 vector_to_handle(b));
11006 std::intptr_t r = Difference::op(b, a);
11007 std::intptr_t r1 = op_dispatch1<RevDifference,std::intptr_t>(r,
11008 large_modulus());
11009 abandon(r);
11010 return r1;
11011 }
11012
op(std::uint64_t * a,std::int64_t b)11013 inline std::intptr_t ModularDifference::op(std::uint64_t *a,
11014 std::int64_t b)
11015 { if (modulus_size != modulus_big)
11016 return (std::intptr_t)aerror1("bad arg for modular-plus",
11017 vector_to_handle(a));
11018 return Difference::op(a, b);
11019 }
11020
op(std::uint64_t * a,std::uint64_t * b)11021 inline std::intptr_t ModularDifference::op(std::uint64_t *a,
11022 std::uint64_t *b)
11023 { if (modulus_size != modulus_big)
11024 return (std::intptr_t)aerror1("bad arg for modular-plus",
11025 vector_to_handle(a));
11026 if (Geq::op(a, b)) return Difference::op(a, b);
11027 std::intptr_t r = Difference::op(b, a);
11028 std::intptr_t r1 = op_dispatch1<RevDifference,std::intptr_t>(r,
11029 large_modulus());
11030 abandon(r);
11031 return r1;
11032 }
11033
11034
// Modular product of two already-reduced fixnums.
inline std::intptr_t ModularTimes::op(std::int64_t a, std::int64_t b)
{   switch (modulus_size)
    {   case modulus_32:
// Both operands are below 2^32 so the product fits in 64 bits.
            return int_to_handle(static_cast<std::uint64_t>(a)
                                 *static_cast<std::uint64_t>
                                 (b) % small_modulus);
        case modulus_64:
// Here the product may need up to 128 bits, so use the double-length
// multiply/divide primitives.
        {   std::uint64_t hi, lo, q, r;
            multiply64(static_cast<std::uint64_t>(a),
                       static_cast<std::uint64_t>(b), hi,
                       lo);
            divide64(hi, lo, small_modulus, q, r);
            return int_to_handle(r);
        }
        default:
        case modulus_big:
// With a big modulus fall back to full bignum multiply and remainder.
        {   std::intptr_t w = Times::op(a, b);
            std::intptr_t r = op_dispatch1<Remainder,std::intptr_t>(w,
                              large_modulus());
            abandon(w);
            return r;
        }
    }
}
11059
op(std::int64_t a,std::uint64_t * b)11060 inline std::intptr_t ModularTimes::op(std::int64_t a,
11061 std::uint64_t *b)
11062 { std::intptr_t w = Times::op(a, b);
11063 std::intptr_t r = op_dispatch1<Remainder,std::intptr_t>(w,
11064 large_modulus());
11065 abandon(w);
11066 return r;
11067 }
11068
op(std::uint64_t * a,std::int64_t b)11069 inline std::intptr_t ModularTimes::op(std::uint64_t *a,
11070 std::int64_t b)
11071 { return ModularTimes::op(b, a);
11072 }
11073
op(std::uint64_t * a,std::uint64_t * b)11074 inline std::intptr_t ModularTimes::op(std::uint64_t *a,
11075 std::uint64_t *b)
11076 { std::intptr_t w = Times::op(a, b);
11077 std::intptr_t r = op_dispatch1<Remainder,std::intptr_t>(w,
11078 large_modulus());
11079 abandon(w);
11080 return r;
11081 }
11082
11083
// Modular exponentiation (fixnum base, fixnum exponent).
// Not yet implemented: always reports an error.
inline std::intptr_t ModularExpt::op(std::int64_t a, std::int64_t b)
{ return (std::intptr_t)aerror("incomplete");
}
11087
// Modular exponentiation (fixnum base, bignum exponent).
// Not yet implemented: always reports an error.
inline std::intptr_t ModularExpt::op(std::int64_t a, std::uint64_t *b)
{ return (std::intptr_t)aerror("incomplete");
}
11091
// Modular exponentiation (bignum base, fixnum exponent).
// Not yet implemented: always reports an error.
inline std::intptr_t ModularExpt::op(std::uint64_t *a, std::int64_t b)
{ return (std::intptr_t)aerror("incomplete");
}
11095
// Modular exponentiation (bignum base, bignum exponent).
// Not yet implemented: always reports an error.
inline std::intptr_t ModularExpt::op(std::uint64_t *a,
                                     std::uint64_t *b)
{ return (std::intptr_t)aerror("incomplete");
}
11100
11101
// Modular quotient (fixnum / fixnum), i.e. a times the modular inverse
// of b. Not yet implemented: always reports an error.
inline std::intptr_t ModularQuotient::op(std::int64_t a,
                                         std::int64_t b)
{ return (std::intptr_t)aerror("incomplete");
}
11106
// Modular quotient (fixnum / bignum).
// Not yet implemented: always reports an error.
inline std::intptr_t ModularQuotient::op(std::int64_t a,
                                         std::uint64_t *b)
{ return (std::intptr_t)aerror("incomplete");
}
11111
// Modular quotient (bignum / fixnum).
// Not yet implemented: always reports an error.
inline std::intptr_t ModularQuotient::op(std::uint64_t *a,
                                         std::int64_t b)
{ return (std::intptr_t)aerror("incomplete");
}
11116
// Modular quotient (bignum / bignum).
// Not yet implemented: always reports an error.
inline std::intptr_t ModularQuotient::op(std::uint64_t *a,
                                         std::uint64_t *b)
{ return (std::intptr_t)aerror("incomplete");
}
11121
11122
op(std::int64_t a)11123 inline std::intptr_t ModularMinus::op(std::int64_t a)
11124 { if (a == 0) return int_to_handle(a);
11125 if (modulus_size == modulus_big)
11126 return Difference::op(large_modulus(), a);
11127 else return int_to_handle(small_modulus - a);
11128 }
11129
op(std::uint64_t * a)11130 inline std::intptr_t ModularMinus::op(std::uint64_t *a)
11131 { if (modulus_size != modulus_big)
11132 return (std::intptr_t)aerror1("bad argument for modular-minus", vector_to_handle(a));
11133 return Difference::op(large_modulus(), a);
11134 }
11135
// Modular inverse for the big-modulus case. Placeholder only: always
// reports an error.
inline std::intptr_t general_modular_reciprocal(std::intptr_t a)
{ return (std::intptr_t)aerror("not coded yet");
}
11139
// Modular inverse of aa with respect to small_modulus, computed by the
// extended Euclidean algorithm. Big moduli are handed off to
// general_modular_reciprocal (currently a stub). Raises an error for
// non-positive input or when no inverse exists (gcd(aa, m) != 1).
inline std::intptr_t ModularReciprocal::op(std::int64_t aa)
{ // Zero and negatives have no inverse; reduced values are non-negative.
  if (aa <= 0) return (std::intptr_t)aerror1("bad argument to modular-reciprocal",
                                             int_to_handle(aa));
  else if (modulus_size == modulus_big)
      return general_modular_reciprocal(int_to_handle(aa));
  // Extended Euclid: (a, b) hold the remainder sequence while (x, y)
  // track the corresponding Bezout coefficients of aa.
  std::int64_t a = small_modulus,
               b = aa,
               x = 0,
               y = 1;
  while (b != 1)
  { std::uint64_t w, t;
    // b reaching 0 means gcd(aa, small_modulus) != 1: no inverse, so the
    // modulus was not prime (or aa shares a factor with it).
    if (b == 0)
        return (std::intptr_t)aerror2("non-prime modulus in modular-reciprocal",
                                      int_to_handle(small_modulus),
                                      int_to_handle(aa));
    w = a / b;
    // NOTE(review): the updates below mix int64 and uint64 operands, so
    // they are evaluated in unsigned arithmetic wrapping mod 2^64 and
    // converted back; this appears intended but is worth confirming.
    t = b;
    b = a - b*w;
    a = t;
    t = y;
    y = x - y*w;
    x = t;
  }
  // Normalise the coefficient into the range [0, small_modulus).
  if (y < 0) y += small_modulus;
  return int_to_handle(y);
}
11166
// Bignum argument: this can only arise with a big modulus, so delegate
// straight to the general (currently unimplemented) routine.
inline std::intptr_t ModularReciprocal::op(std::uint64_t *a)
{ return general_modular_reciprocal(vector_to_handle(a));
}
11170
11171 #endif // CSL
11172
11173 } // end of namespace arithlib_implementation
11174
11175 // I want a namespace that the user can activate via "using" that only
11176 // gives access to things that ought to be exported by this library. So
11177 // arithlib_implementation is to be thought of as somewhat low level and
11178 // private, while just plain arithlib may be enough for the typical C++
11179 // user who is just going to be using the "Bignum" type.
11180 //
11181 // [The issue of whether I have everything I need included in this list
11182 // remains uncertain, however a user can either add to the section here
11183 // or use the arithlib_implementation namespace directly in case of upset]
11184
// The curated public namespace: the names a typical C++ client of the
// library should need, re-exported from arithlib_implementation.
namespace arithlib
{
// The user-defined literal (e.g. 12345_Z) and the main bignum type.
using arithlib_implementation::operator"" _Z;
using arithlib_implementation::Bignum;

// Random-number support.
using arithlib_implementation::mersenne_twister;
using arithlib_implementation::reseed;
using arithlib_implementation::uniform_uint64;
using arithlib_implementation::uniform_positive;
using arithlib_implementation::uniform_signed;
using arithlib_implementation::random_upto_bits_bignum;

// Debug/utility helpers.
using arithlib_implementation::display;
using arithlib_implementation::fix_bignum;
}
11200
11201 // I am putting in names that CSL uses here...
11202
// Low-level namespace re-exporting the operation classes and helpers in
// the form that CSL uses.
namespace arithlib_lowlevel
{
// Arithmetic operations.
using arithlib_implementation::Plus;
using arithlib_implementation::Difference;
using arithlib_implementation::RevDifference;
using arithlib_implementation::Times;
using arithlib_implementation::Quotient;
using arithlib_implementation::Remainder;
using arithlib_implementation::Divide;
using arithlib_implementation::Gcd;
using arithlib_implementation::Lcm;
// Bitwise operations.
using arithlib_implementation::Logand;
using arithlib_implementation::Logor;
using arithlib_implementation::Logxor;
using arithlib_implementation::Logeqv;
// Predicates and comparisons.
using arithlib_implementation::Zerop;
using arithlib_implementation::Onep;
using arithlib_implementation::Minusp;
using arithlib_implementation::Evenp;
using arithlib_implementation::Oddp;
using arithlib_implementation::Eqn;
using arithlib_implementation::Neqn;
using arithlib_implementation::Geq;
using arithlib_implementation::Greaterp;
using arithlib_implementation::Leq;
using arithlib_implementation::Lessp;
// Unary and miscellaneous operations.
using arithlib_implementation::Add1;
using arithlib_implementation::Sub1;
using arithlib_implementation::Minus;
using arithlib_implementation::Abs;
using arithlib_implementation::Square;
using arithlib_implementation::Isqrt;
using arithlib_implementation::Lognot;
using arithlib_implementation::Pow;
using arithlib_implementation::LeftShift;
using arithlib_implementation::RightShift;
using arithlib_implementation::IntegerLength;
using arithlib_implementation::Low_bit;
using arithlib_implementation::Logbitp;
using arithlib_implementation::Logcount;
using arithlib_implementation::Float;
using arithlib_implementation::Double;
using arithlib_implementation::Frexp;
#ifdef CSL
// Modular arithmetic (only provided in CSL builds).
using arithlib_implementation::ModularPlus;
using arithlib_implementation::ModularDifference;
using arithlib_implementation::ModularTimes;
using arithlib_implementation::ModularExpt;
using arithlib_implementation::ModularQuotient;
using arithlib_implementation::ModularMinus;
using arithlib_implementation::ModularReciprocal;
using arithlib_implementation::ModularNumber;
using arithlib_implementation::SetModulus;
#endif // CSL

// Radix conversion to strings (decimal, hex, octal, binary) together
// with the corresponding length predictions.
using arithlib_implementation::bignum_to_string;
using arithlib_implementation::bignum_to_string_length;
using arithlib_implementation::bignum_to_string_hex;
using arithlib_implementation::bignum_to_string_hex_length;
using arithlib_implementation::bignum_to_string_octal;
using arithlib_implementation::bignum_to_string_octal_length;
using arithlib_implementation::bignum_to_string_binary;
using arithlib_implementation::bignum_to_string_binary_length;

// Conversions from double to integer.
using arithlib_implementation::round_double_to_int;
using arithlib_implementation::trunc_double_to_int;
using arithlib_implementation::floor_double_to_int;
using arithlib_implementation::ceiling_double_to_int;

#ifdef softfloat_h
// 128-bit float support (only when the softfloat library is present).
using arithlib_implementation::Float128;
using arithlib_implementation::Frexp128;
using arithlib_implementation::round_float128_to_int;
using arithlib_implementation::trunc_float128_to_int;
using arithlib_implementation::floor_float128_to_int;
using arithlib_implementation::ceiling_float128_to_int;
// These next few are just raw float128_t operations.
using arithlib_implementation::f128_0;
using arithlib_implementation::f128_half;
using arithlib_implementation::f128_mhalf;
using arithlib_implementation::f128_1;
using arithlib_implementation::f128_m1;
using arithlib_implementation::f128_zero;
using arithlib_implementation::f128_infinite;
using arithlib_implementation::f128_nan;
using arithlib_implementation::frexp;
using arithlib_implementation::ldexp;
using arithlib_implementation::modf;
#endif // softfloat_h

//using arithlib_implementation::negative;
//using arithlib_implementation::number_size;

using arithlib_implementation::cast_to_float;
}
11298
11299 #endif // __arithlib_hpp
11300
11301 // end of arithlib.hpp
11302