1 // newallocate.cpp                         Copyright (C) 2018-2021 Codemist
2 //
3 // Code to deal with storage allocation, both grabbing memory at the start
// of a run and significant aspects of garbage collection.
5 //
6 
7 // I have planned all this and even started work on it several times, with
// later rounds of work reflecting gradually clearing understanding of some
9 // of the challenges and trade-offs. However I am concerned that the
10 // iteration may have left some blocks of comments or fragments of code from
11 // earlier versions still in place. I will try to go through and tidy up
12 // on that front, but to some extent that will best happen if and when I get
13 // a fairly fully working version so that there starts to be real stability.
14 
15 
16 
17 // Development game-plan for this stuff - showing the steps or stages
18 // to go through:
19 // [Steps I have achieved are deleted from this list!]
20 //
21 // (3) Arrange that every garbage collection will be a major one and
22 //     re-work the allocation and re-allocation of memory blocks for that.
23 //     By keeping the code with precise list-bases for everything that
24 //     matters the ambiguous pointers should never be the only reason
25 //     for saving any data - they just pin things and hence mess up memory
26 //     layout. Get that version of GC working. Note that write barriers
27 //     may collect information but it is never used!
28 // (4) Make some collections minor ones, thus needing to cope with the
29 //     consequences of the write barrier.
30 // (5) Put in explicit test cases for data that is only preserved via an
31 //     ambiguous list-base.
32 // (6) Thread-synchronization for GC entry.
33 // (7) Thread-local support for fluid bindings, and simple code in the Lisp
34 //     for creating threads, even though almost everything is not thread-safe.
// (8) [in various orders] get rid of push/pop stuff in the main code in
36 //     favour of just letting conservative memory management cope. And
37 //     migrate more status to thread-local and/or protect it with critical
38 //     regions.
39 // (9) Fix ccomp.red and bytes2.cpp regarding new treatment of fluids and
40 //     for thread safety.
// (10) Protection of blocking calls so that GC can still happen.
42 
43 
44 // Well by the time I have got started at all on that list the issue of
45 // the exact sequence towards the end will become clearer! Really it is the
46 // first 3 that are my initial plan.
47 
48 
49 /**************************************************************************
50  * Copyright (C) 2021, Codemist.                         A C Norman       *
51  *                                                                        *
52  * Redistribution and use in source and binary forms, with or without     *
53  * modification, are permitted provided that the following conditions are *
54  * met:                                                                   *
55  *                                                                        *
56  *     * Redistributions of source code must retain the relevant          *
57  *       copyright notice, this list of conditions and the following      *
58  *       disclaimer.                                                      *
59  *     * Redistributions in binary form must reproduce the above          *
60  *       copyright notice, this list of conditions and the following      *
61  *       disclaimer in the documentation and/or other materials provided  *
62  *       with the distribution.                                           *
63  *                                                                        *
64  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    *
65  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      *
66  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS      *
67  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE         *
68  * COPYRIGHT OWNERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,   *
69  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,   *
70  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS  *
71  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
72  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR  *
73  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF     *
74  * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH   *
75  * DAMAGE.                                                                *
76  *************************************************************************/
77 
78 // $Id: newallocate.cpp 5757 2021-04-08 16:36:14Z arthurcnorman $
79 
80 /*!!! csl
81 */
82 
83 #include "headers.h"
84 
85 #ifdef HAVE_CXX17
86 
87 // In code that at present (February 2021) is disabled with "#if 0"
88 // I use the Mersenne Twister generator as set up in arithlib.hpp
// to set up some randomized testing for this code. That will only
// compile if you have a "modern" C++ compiler. For use here arithlib.hpp
// needs to know it is being used from within CSL...
92 
93 #ifndef LISP
94 #define LISP 1
95 #endif
96 #ifndef CSL
97 #define CSL 1
98 #endif // CSL
99 #include "arithlib.hpp"  // For random number support
100 #endif
101 
102 //
103 // This is a place for my latest round of thinking about a new storage
104 // management scheme. The ideal I set now is that garbage collection should
105 // be both conservative and generational and that it should be able to
106 // support multiple mutator threads. I have some ideas about using multiple
107 // threads during garbage collection but serious worries about possible
108 // synchronization overheads so that will not be an early target. And I have
109 // no plans to try and garbage collect and compute concurrently - so every
110 // garbage collection will need to synchronize and stop all worker threads.
111 
112 // The design here has to mingle plans for global storage allocation,
113 // the CONS operation as well as operations involving Lisp atoms, the
114 // way that Lisp threads can access the values of variables, synchronization
115 // matters and so on. In general it seems that almost all aspects from the
116 // lowest to the most global level interact. Ugh!
117 
118 // So here is an overview. Well I fear I may have written bits of this already
119 // in several places, but writing it out helps me to plan. All of the numeric
// values given here are indicative rather than guaranteed final.
121 
122 // Memory is grabbed from the operating system in large chunks, with a few
123 // hundred megabytes for the first chunk and larger ones potentially sized
124 // in geometric progression, so that with a maximum of 16 chunks I can end up
125 // with as much memory as can ever be useful. The limit on the total number
// of chunks used is so that given a value that might be a pointer it will
127 // be possible to identify the chunk it might point within using a small
128 // number of comparison operations. The mega-chunk allocation is such that the
129 // system can start with fairly sane amounts of memory but expand on need.
130 //
131 // Each chunk is divided into 8Mbyte pages. At any one moment one page will
132 // be "Current", a second will be "Previous", some others will be "Busy" and
133 // the rest "Free". All mutator allocation happens within the Current page.
134 // When that becomes full live data is evacuated from the Previous page into
135 // Busy memory and what was Current becomes Previous. The old Previous page
136 // becomes Free and a Free page is chosen to be the new Current. Note this
137 // might be done by swapping the sense of Current and Previous, but if I
// did that then if those two pages get severely fragmented that situation
139 // would persist, so I will want to arrange that pages get moved to be
140 // extensions to the Busy pool from time to time. Which will tend to happen
// naturally but may need forcing in pathological cases!
142 //
143 // Because garbage collection is conservative there can be ambiguous pointers
144 // that refer into Previous. Data at the locations so addressed must not be
145 // moved: I will refer to it as "pinned". The Current and each Free page may
146 // have some pinned items present. When a page becomes Current it will have
147 // two pointers - gFringe and gLimit. gFringe identifies the first
148 // point at which data could be allocated, gLimit points either one beyond
149 // the end of the page or to the start address of the next pinned chunk.
150 // If there is a pinned chunk then it will contain one field that indicates
151 // its length and a second one that points to any pinned chunk beyond it.
152 // In Previous and Busy each page will be kept full by placement of padder
153 // pseudo-objects anywhere where there could be gaps: that is so that it is
154 // possible to perform a sequential scan of the page with each item within
155 // it identifiable by inspection of its first (header) word. This linear scan
156 // is needed because pinning works by pinning significant-size chunks of
// memory where the chunk always has an object starting at its start address
// and is always neatly filled. The garbage collector has to treat all
159 // pointer-containing fields in such a chunk as roots. I argue that if some
160 // data is pinned then pinning other data that is close to it in memory will
161 // probably not lead to TOO much additional waste! But until I have it
162 // coded and tested I might not know.
163 //
164 // Allocation within Current is by incrementing gFringe and if necessary
165 // skipping on to gNext. By making gFringe atomic this allocation can be
166 // thread-safe.
167 //
// Within a page, allocation (via gFringe) hands out 16 Kbyte chunks to each
// thread. The thread holds variables fringe and limit to allow it to perform
170 // simple sequential allocation within the chunk with low overhead. During
171 // garbage collection it is necessary to access and update fringe and limit
172 // values for every thread, and to support that I have data such as the
173 // array fringeBis[] that is indexed by a thread-number. Especially when
174 // garbage collection is being triggered data can be moved from the simple
175 // (albeit thread-local) variable into this array. Because each thread has
// its own chunk most allocation within a chunk does not involve any locks.
// When a chunk overflows a fresh one needs to be allocated. If the request
// that triggered this was for n bytes of memory then the next chunk that is
// grabbed is of size 16K+n so that 16Kbytes are left available after the
// potentially large allocation. As regards space use the worst case for this
// is if there are many sequential requests each needing just over 16Kbytes
// apiece. In that case around 50% of memory in Current ends up unused. This
183 // space will remain unused in Previous, but when data is evacuated to Busy
184 // it will be packed properly closely, so across the whole system the worst
185 // possible "waste" that way will be 8Mbytes. Of course most allocations will
186 // be of much smaller objects and so I am not worried in this area.
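//
// The fast path just described can be sketched as follows. This is only an
// illustrative sketch and not the real get_n_bytes(): fringe is the
// thread-local variable defined later in this file, while limitForThisThread
// and slowPath() are hypothetical names standing in for the real per-thread
// limit handling, pinned-chunk skipping and GC polling.
#if 0
inline uintptr_t sketchAllocate(size_t n)
{   uintptr_t r = static_cast<uintptr_t>(fringe); // next free byte in my chunk
    if (r + n <= limitForThisThread)  // enough room left in the 16K chunk?
    {   fringe = r + n;               // plain bump allocation, no locking
        return r;
    }
    return slowPath(n);               // fresh chunk needed, or GC triggered
}
#endif // 0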
187 //
188 // The trigger for garbage collection is when at least one thread attempts
// an allocation and moves gFringe on beyond gLimit. As a C pedant I ought to
190 // worry if that led to a pointer beyond the range of the mega-chunk that
191 // had been grabbed from the OS, or (worse) if it led to overflow in address
192 // arithmetic. If I was super-cautious I would leave the top of each huge
193 // chunk as unused buffer against just such concerns. The amount I would
194 // need to reserve would be VECTOR_CHUNK_BYTES (2Mb) times the maximum number
// of threads (64), and that may be way more than I am happy with! So just
// for now I will duck that concern. But I will want the highest-addressed
197 // huge chunk to end at least 128M before the end of memory address space!
198 //
199 // When a thread (or several threads) overflows a page it must trigger garbage
200 // collection. In the process it must synchronize with all the other threads.
201 // To achieve that it sets the limit value in every (active) thread to zero.
// The consequence will be that when the thread next tries to allocate memory
// it will get the initial impression that its chunk is full and can go into
// a more elaborate path which detects the actual situation. There are two
205 // cases when a memory allocation request might not arise promptly. One is
206 // code loops that do not allocate memory: each of those must contain a
207 // polling request which will be implemented as a request for zero bytes of
// memory - which in turn will merely amount to a comparison between fringe
// and limit. The other would be when the thread was performing some operation
// that could or did block. That will obviously include when it uses a range of
// synchronization primitives. In such cases before blocking it must remove
212 // itself from the pool of active threads, and it can reinstate itself later.
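//
// A sketch of the polling idea from the previous paragraph: a request for
// zero bytes is nothing more than the fringe/limit comparison, so once some
// other thread has zeroed this thread's limit the slow path is entered
// promptly and GC synchronization can go ahead. limitForThisThread is again
// a hypothetical name; difficult_n_bytes() is the real slow path defined
// later in this file.
#if 0
inline void sketchPoll()
{   if (static_cast<uintptr_t>(fringe) >= limitForThisThread)
        difficult_n_bytes();          // join in with any pending GC
}
#endif // 0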
213 //
214 // As garbage collection starts each thread must record its stack pointer and
215 // ensure that all pointer-objects that it is using are recorded on the
216 // stack (as well as potentially being in machine registers). The (C++) stack
217 // will then be scanned with all values on it treated as potential or
218 // ambiguous pointers.
219 //
220 // Given a potential pointer the system must be able to determine if it in
221 // fact points within a properly allocated object. This process starts by
222 // checking which mega-block is involved. From that information simple
223 // arithmetic can identify a page. Pointers that are into the header sections
224 // of a page or beyond its gLimit are certainly not references to live
225 // objects. I will then have a table of all the chunks within the page, and
226 // after sorting that a simple binary search can narrow the target of the
227 // ambiguous pointer down to a single 16K chunk. I tag that entire chunk as
228 // pinned.
229 // During a minor GC the only page that needs pinned information collected
230 // is the Previous one, since material in all other pages will stay put anyway.
231 // My expectation is that the vast majority of (valid) potential pointers
232 // will be into Current, with the next higher number into Previous. That is
233 // because fairly recently allocated objects are most likely to have
234 // references to them on the stack. For older material I think I expect a
// tendency for clustering and with a large memory configuration only a
236 // small fraction of the total number of pages will be involved.
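//
// The classification just described might look roughly like this. It is a
// sketch only: it uses the heapSegment[] tables and Page fields defined
// later in this file, and the binary search of the per-page chunk table is
// only indicated in comments.
#if 0
static bool sketchClassifyAmbiguousPointer(uintptr_t a)
{   for (size_t i=0; i<heapSegmentCount; i++)     // a handful of comparisons
    {   uintptr_t base = reinterpret_cast<uintptr_t>(heapSegment[i]);
        if (a < base || a >= base + heapSegmentSize[i]) continue;
        Page *p = reinterpret_cast<Page *>(       // the 8 Mbyte page involved
            base + CSL_PAGE_SIZE*((a - base)/CSL_PAGE_SIZE));
        // Reject pointers into the page header or beyond its fringe; anything
        // else would go on to a binary search of the page's sorted chunk
        // table so that the single 16K chunk involved can be pinned.
        return a >= reinterpret_cast<uintptr_t>(&p->data) && a < p->fringe;
    }
    return false;        // not a pointer into any heap segment at all
}
#endif // 0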
237 //
238 // A consequence of using a generational system is that I need to be able
239 // to respond to updates to old data that lead to references to new data.
// When I perform a major GC I will be scavenging everything and so no special
// treatment is called for, but to support minor GCs I will arrange that any
// use of RPLACA, RPLACD or PUTV (or derived operations such as PUTHASH, PUT
243 // and NCONC) set a mark indicating a region of memory that must be treated
244 // as roots. I record "dirty" information in bitmaps so that I can identify
// updated objects to a resolution of individual cells. During a minor GC
246 // I need to visit each such cell and that basically involves scanning the
247 // bitmap looking for nonzero bits. Since I expect it to be sparse I will have
248 // multiple levels of bitmap so I can in general avoid inspecting areas
249 // of memory that will not be interesting. When I do this during a minor
// GC I only need to concern myself with words that now point into
251 // Current or Previous, and anything that points to older material can have
252 // its dirty bit reset to zero.
253 // Note that dirty bits may end up set on pinned items in pages that are
254 // (mostly) Free.
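//
// A sketch of the write barrier implied by the above, assuming that pages
// are aligned at CSL_PAGE_SIZE boundaries (as described later) and using
// the dirtyMap/hasDirty fields of Page that appear further down this file.
// The real barrier also has to maintain the coarser summary maps so that
// sparse dirty bits can be found cheaply; that is not shown here.
#if 0
inline void sketchWriteBarrier(uintptr_t a)
{   Page *p = reinterpret_cast<Page *>(
        a & -static_cast<uintptr_t>(CSL_PAGE_SIZE));  // page containing a
    size_t offset = a - reinterpret_cast<uintptr_t>(p);
    p->dirtyMap[offset/64] = 1;       // one byte per 64-byte block of the page
    p->hasDirty = true;               // so the minor GC will scan this map
}
#endif // 0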
255 
256 
257 size_t pagesCount;
258 size_t activePagesCount;
259 
// Ha ha, potentially clever idea. Have activeThreads be an atomic<uint32_t> and use
261 // the bottom 10 bits for the number of threads that are busy and the next
262 // 10 bits for the number potentially busy and finally 10 bits for the total
263 // number of threads in play. That way I can eg subtract (1<<10)+1 when a
264 // thread temporarily removes itself from the pool because it
265 // may be about to block! Etc etc. The second idea is that there is potential
266 // misery about needing to get every thread to exit when STOP is called. Well
// maybe I can avoid using mutexes at all at places where code can block and
// instead use condition variables with a condition of the form
// (<sensible-condition> || need_to_exit) and then when the barrier is passed
// I immediately check need_to_exit and tidy up a bit more. To make that work
// I suspect I will need a table of every mutex/condition variable anywhere
// so that when I set need_to_exit I can notify all of them!
// Well for mutexes and condition variables visible at the Lisp level I will
274 // need underlying C++ ones which will sort of need garbage collection but
275 // must never move. I think that the best bet is to have a vector pool of
276 // synchronization objects with the Lisp objects that encapsulate them holding
277 // an integer index that will need to be treated as a weak pointer to
// keep the object "alive". When not alive the object is just available for
279 // re-use... That is all going to be a bit messy.
280 
// Another thought to be left as comments here until I implement it and comment
282 // it where the code ends up:
283 // The copying GC can probably be coded so as to use multiple threads to do
// the copying! I have bits of that sketched in my mind, but until I can sit
285 // down quietly with a full sized keyboard and code some of it things are a
286 // bit uncertain. In particular there will be issues about the cost of
287 // the test-and-set operations I would need to use to keep threads from
288 // entanglement.
// A thought is that when I am about to evacuate the item at address p I
// start with a = atomic_store(p, TAG_FORWARD) where that returns the
291 // previous value at p. Then if that was TAG_FORWARD I will know that
292 // somebody else had been evacuating that location - I just re-try in a busy-
293 // waiting style. If what was there was a forwarding address then I do an
294 // (atomic) store to put it back. Otherwise I have the original contents and
295 // I have marked the word with TAG_FORWARD so that no other thread will mess
296 // with it (much). As quickly as I can I work out where the data will need
297 // to end up. This is going to be an atomic_add operation on the fringe of
298 // the new heap. Then I can store a proper forwarding address in place.
299 // I HOPE that it will be rare that two threads try to evacuate the same
300 // location at once, so the spin-wait will be uncommon, and I hope that
301 // working out where something moves to will be quick so that when it does
302 // happen it will not need to run for long.
303 // I can batch the identification of locations to evacuate so I can use a
304 // mutex to protect that code that grabs them, and that will be ok provided
// identifying locations is (much) cheaper than actually altering stuff.
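//
// A sketch of the claim-by-exchange idea above, where p addresses the first
// word of the object being evacuated. In C++ the operation that stores a
// value and hands back the previous contents is exchange().
// isForwardingAddress() and copyToNewSpace() are hypothetical stand-ins for
// the real GC machinery.
#if 0
LispObject sketchEvacuate(std::atomic<LispObject> *p)
{   LispObject w = p->exchange(TAG_FORWARD);  // claim this object
    while (w == TAG_FORWARD)          // another thread claimed it first:
        w = p->load();                // spin until it publishes an address
    if (isForwardingAddress(w))       // object had already been moved
    {   p->store(w);                  // ensure the address is back in place
        return w;
    }
    // w is the original first word. Find space in the new region (an atomic
    // add on its fringe), copy the object, then publish where it went.
    LispObject newAddress = copyToNewSpace(p, w);
    p->store(newAddress);             // serves as the forwarding address
    return newAddress;
}
#endif // 0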
306 
307 // BEWARE:
//   consider the implementation of v = cons(a,b) where v is some value that may
309 // be shared across threads. Eg rplacd(V, cons(a, b)) is almost certainly
310 // going to count. The natural implementation will be along the lines
311 //    w = allocate();
312 //    w[0] = a;
313 //    w[1] = b;
314 //    v = w;
315 // but as a first issue the compiler might use v in place of the temporary w,
316 // and then we have
317 //    v = allocate();
318 //    some other thread accesses uninitialized car v here!
319 //    v[0] = a; v[1] = b;
320 // Things are even worse because with the code written in vanilla form the CPU
321 // may re-order all the memory writes, again leaving v referencing a chunk of
322 // memory not fully initialized. The two issues must be dealt with using one
323 // of two ideas. Synchronization primitives such as mutexes could be used to
324 // enclose the operation as a critical region, and potential accesses would
325 // do likewise. The performance and ugliness costs are horrendous! Or memory
326 // fences can be used. So that shows I will need to study thread-fence methods
327 // and all the options that provide. Note that without a fence after the update
// of v it could be that the change made would reside locally so that other
329 // threads would not see it, so a fence may be needed to "publish" it.
330 // Making almost every value atomic<T> might also do the job, but that
331 // would then imply fences everywhere and could hurt performance and it would
332 // also be horribly ponderous and clumsy.
333 //
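// The following sketch shows the "write the fields, then publish" discipline
// using a release store, so that a thread which reads v with acquire
// semantics and sees the new cell is guaranteed to see its car and cdr as
// well. The names get_n_bytes, TAG_CONS, CELL, setcar and setcdr are the
// ones CSL uses elsewhere, but treat this as an illustration rather than
// the real cons().
#if 0
LispObject sketchSharedCons(LispObject a, LispObject b,
                            std::atomic<LispObject> &v)
{   LispObject w = get_n_bytes(2*CELL) + TAG_CONS; // fresh, uninitialised cell
    setcar(w, a);                     // fill in both fields first...
    setcdr(w, b);
    v.store(w, std::memory_order_release); // ...and only then make w visible
    return w;
}
#endif // 0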
334 
335 
336 uintptr_t *C_stackbase;   // base of the main thread
337 atomic<uint32_t> activeThreads;
338 //  0x00 : total_threads : lisp_threads : still_busy_threads
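// Purely illustrative helpers showing how the packed counter above can be
// unpicked; the byte layout follows the comment on the previous line and
// the masks match the uses of activeThreads later in this file.
#if 0
inline unsigned busyThreads(uint32_t n)  { return n & 0xff; }         // still busy
inline unsigned lispThreads(uint32_t n)  { return (n >> 8) & 0xff; }  // in the Lisp pool
inline unsigned totalThreads(uint32_t n) { return (n >> 16) & 0xff; } // total in play
// So eg activeThreads.fetch_sub(0x000001) marks the current thread as no
// longer busy, while fetch_sub(0x000101) would also remove it from the pool
// of Lisp threads when it is about to block.
#endif // 0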
339 
340 // The variables defined as thread_local here MAY be just rendered as
341 // (eg)    "thread_local uintptr_t threadId;"   but on Windows they end
342 // up as instances of a slightly strange class that supports assignment
// from the specified type and static casts to the specified type, but
344 // where more complicated casts (explicit or implicit) may not be tolerated.
345 // So there are places where I am obliged to write odd-looking code like
346 // ... static_cast<uintptr_t>(threadId) ... or "threadId = static_cast<..."
347 // to make the type conversion process especially explicit and simple.
348 // I could avoid that if I made the wrapper class rather a lot more
// complicated, but I think that transparency there and a modest amount of
350 // redundancy here is the path that leaves me happiest (at present).
351 
352 DEFINE_THREAD_LOCAL(uintptr_t,    threadId);
353 DEFINE_THREAD_LOCAL(uintptr_t,    fringe);
354 DEFINE_THREAD_LOCAL(Page *,       borrowPages);
355 DEFINE_THREAD_LOCAL(uintptr_t,    borrowFringe);
356 DEFINE_THREAD_LOCAL(uintptr_t,    borrowLimit);
357 DEFINE_THREAD_LOCAL(uintptr_t,    borrowNext);
358 
359 
360 
361 // All the heap memory I use here will be allocated within (up to) 32
362 // segments that are grabbed using "new" etc...
363 // Each segment of memory will start aligned at a multiple of CSL_PAGE_SIZE
364 // which is (at present) 8 Mbytes. The idea here is that I grab seriously
365 // large chunks of memory from the system and these chunks may not be
366 // contiguous. But with at most 32 of them I can take any bit-pattern and
367 // with around 5 comparisons I can identify which (if any!) of these
368 // chunks it points within. Then division by 8M can get me to the start of
369 // the 8 Mbyte segment that is involved. That bit of search will be heavily
370 // used when processing ambiguous pointers.
371 //
372 
373 
374 // Provide the kernel of the allocation code...
375 
376 // This is an updated and fuller explanation of memory layout on each
377 // page.
// A page may either be one where allocation is currently happening within
379 // it or one that is full of data, or it may be one that is currently
380 // free. A messy complication is that in each situation its general pattern
381 // of use can be disrupted by virtue of it having some data within it that
382 // was pinned during a previous garbage collection and so had to be left
383 // in place. That means my idea of having separate pages for cons cells and
384 // for vectors seems to fail.
385 
386 // A page will have as its layout:
387 // (1) A smallish header that contains its fringe, arrangements so it can
388 //     be chained along with other pages etc. Details will emerge as I code
389 //     all this and discover what I need!
390 // (2) A region that acts as a "dirty map". This region will be present in
391 //     every page and is an array of atomic<uint8_t>, where each byte maps
392 //     a 64-byte block within the page and will get set non-zero if a RPLAC
393 //     or PUTV (style) operation updates anything within that small block.
394 //     for 8 Mbyte pages this map will be 128 Kbytes large. It only gets
395 //     written to when Lisp performs a valid RPLAC/PUTV and so addresses
396 //     such as those in the bitmap itself will never become dirty. So if
397 //     this map is aliased to start at the beginning of the page the first
398 //     2 Kbytes may be used (eg for the header) without risk.
399 // (3) A bitmap that can be used to tag "pinned" objects. To have one bit for
400 //     each 8-byte item in the heap (and every valid LispObject will be
401 //     positioned so as to be 8-byte aligned, whether on a 32 or 64-bit
402 //     platform) will require 128 Kbytes, and this will be set up as an
403 //     array of uint64_t values which will mean that any linear scan of it
404 //     only has 16 K words to check. Pinned status is established for memory
405 //     in the scavengable region as a first step in garbage collection. Two
406 //     cases arise. In Minor garbage collection pinning only happens in the
407 //     single page of scavengable material, and all of that will be processed
408 //     so issues as to whether there are up-pointers present in it do not
409 //     arise. Pinned bits are not set in the Stable part of the heap so
410 //     dirty bits there are unaffected. On the other hand during a Major
//     garbage collection all memory is Scavengable and will need pin
412 //     information, but dirty bits are not relevant. So provided pinned bits
413 //     are ONLY used or inspected during garbage collection the bitmap here
414 //     can overlay the dirty map.
415 //     If at the end of garbage collection there had been pinned items in the
416 //     Scavengable region and if the contents of any of those are up-pointers
417 //     then they must be marked as dirty when the page is put in the Free area.
418 // (4) A further 128 Kbyte bitmap that can mark which addresses within a page
419 //     are at the start of a LispObject. These get set in the Scavengable
420 //     region as a prelude and prerequisite for setting pinned bits. Well to
421 //     be more careful about that statement, when an ambiguous address is
422 //     inspected to use it for pinning the page it refers into will need its
423 //     object-start map set up. So pages that contain no pinned items at all
424 //     avoid this. So during a minor GC only the scavengable region gets
425 //     scanned to identify object starts, while during a major GC only pages
//     that are the targets of ambiguous pointers will be (see the sketch
//     following this list).
427 //
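// As promised above, here is a sketch of the bitmap indexing implied by
// items (3) and (4): one bit for each 8-byte doubleword of the page, held
// as an array of uint64_t. The field name objstart comes from the layout
// description below; the real code may differ in detail.
#if 0
inline void sketchSetObjectStart(Page *p, uintptr_t a)
{   size_t d = (a - reinterpret_cast<uintptr_t>(p))/8;  // doubleword index
    p->objstart[d/64] |= UINT64_C(1)<<(d%64);
}
inline bool sketchIsObjectStart(Page *p, uintptr_t a)
{   size_t d = (a - reinterpret_cast<uintptr_t>(p))/8;
    return ((p->objstart[d/64]>>(d%64)) & 1) != 0;
}
#endif // 0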
428 
429 // Here is a layout for an 8 Mbyte page, specifying the various
430 // ways in which data can be accessed. This uses a union so that the page
431 // header will overlap the beginning of the objstart[] bitmap, but because
432 // objects only reside in the data[] part the first couple of kilobytes
433 // of objstart[] will never be used.
434 
// The initialization here is intended to make the code more fragile
436 // so that unless I initialize elsewhere I stand a good chance of seeing
437 // a prompt sigsegv.
438 static Page *px = reinterpret_cast<Page *>(-0x5a5a5a5aU);
439 
440 Page *currentPage = px;     // Where allocation is happening. The "nursery".
441 Page *previousPage = px;    // A page that was recently the current one.
442 Page *busyPages = px;       // Chained list of pages that contain live data.
443 Page *oldPages = px;        // Page from which live stuff is being evacuated.
444 Page *mostlyFreePages = px; // Chained list of pages that the GC has mostly
445                             // cleared but that have some pinned data left
446                             // in them.
447 Page *freePages = px;       // Chained list of pages that are not currently
448                             // in use and that contain no useful information.
449 
450 size_t busyPagesCount = -1, mostlyFreePagesCount = -1,
451        freePagesCount = -1, oldPagesCount = -1;
452 
453 void *heapSegment[16];
454 char *heapSegmentBase[16];
455 size_t heapSegmentSize[16];
456 size_t heapSegmentCount;
457 
458 // I make some assumptions about the variations on atomic<> that I
459 // use, but then would like to use static_assert to confirm them or to
460 // cause CSL to fail to compile. However the test has to be dynamic, so I can
461 // at best cause things to fail at system startup. Boo Hiss!
462 // I believe that my assumptions have a good chance of being satisfied
463 // on almost all machines, even though I can imagine architectures
464 // where there may be trouble. But what matters most to me will be x86,
465 // x86_64 and both 32 and 64-bit ARM when using g++ or clang, and those will
466 // get checked the first time I compile this code on each.
467 //
// Note that the C++ standard explicitly says that there is no guarantee that
469 // the sizes of atomic specializations match those of the underlying raw
470 // types, but that implementations are encouraged to make that the situation
471 // where they can.
472 // Here I verify the HOPES that I have by checking them in the constructor
473 // for an otherwise worthless class that I then define an instance of.
474 
475 class MakeAssertions
476 {
477 public:
    MakeAssertions()
479     {   if (sizeof(atomic<std::uint8_t>) != 1)
        {   cout << "atomic<uint8_t> is not the expected size" << "\r" << endl;
481             my_abort();
482         }
483         if (!atomic<std::uint8_t>().is_lock_free())
484         {   cout << "atomic<uint8_t> not lock-free" << "\r" << endl;
485             my_abort();
486         }
487         if (sizeof(atomic<std::uintptr_t>) != sizeof(intptr_t))
488         {   cout << "atomic<uintptr_t> is not the expected size" << "\r" << endl;
489             my_abort();
490         }
491         if (!atomic<uintptr_t>().is_lock_free())
492         {   cout << "Atomic<uintptr_t> not lock-free" << "\r" << endl;
493             my_abort();
494         }
495         if (sizeof(atomic<std::uint32_t>) != 4)
496         {   cout << "atomic<uint32_t> is not the expected size" << "\r" << endl;
497             my_abort();
498         }
499         if (!atomic<std::uint32_t>().is_lock_free())
500         {   cout << "atomic<uint32_t> not lock-free" << "\r" << endl;
501             my_abort();
502         }
503         if (SIXTY_FOUR_BIT)
504         {   if (sizeof(atomic<std::uint64_t>) != 8)
505             {   cout << "atomic<uint64_t> is not the expected size" << "\r" << endl;
506                 my_abort();
507             }
508             if (!atomic<std::uint64_t>().is_lock_free())
509             {   cout << "atomic<uint64_t> not lock-free" << "\r" << endl;
510                 my_abort();
511             }
512         }
513         cout << "is_standard_layout(Chunk) = "
514              << std::is_standard_layout<Chunk>::value << "\r" << endl;
515     }
516 };
517 
518 static MakeAssertions test_for_lockfree;
519 
LispObject get_symbol(bool gensymp)
521 {   return get_basic_vector(TAG_SYMBOL, TYPE_SYMBOL, symhdr_length);
522 }
523 
LispObject get_basic_vector(int tag, int type, size_t size)
525 {
526 // tag is the value (e.g. TAG_VECTOR) that will go in the low order
527 // 3 bits of the pointer result.
528 // type is the code (e.g. TYPE_STRING) that gets packed, together with
529 // the size, into a header word.
530 // size is measured in bytes and must allow space for the header word.
531 // [Note that this last issue - size including the header - was probably
532 // a mistake since the header size depends on whether I am using a
533 // 32-bit or 64-bit representation. However it would be hard to unwind
534 // that now!]
535 //
536     size_t allocSize = (size_t)doubleword_align_up(size);
// Basic vectors must be smaller than the CSL page size.
538     if (allocSize > (CSL_PAGE_SIZE - 32))
539         return aerror1("request for basic vector too big",
540                        fixnum_of_int(allocSize/CELL-1));
541     LispObject r = get_n_bytes(allocSize);
542     *(reinterpret_cast<Header *>(r)) = type + (size <<
543                                        (Tw+5)) + TAG_HDR_IMMED;
544 //
545 // DANGER: the vector allocated here is left uninitialised at this stage.
546 // This is OK if the vector will contain binary information, but if it
547 // will hold any LispObjects it needs safe values put in PDQ.
548 //
549 // All vectors are allocated so as to be 8-byte aligned. On a 64-bit system
550 // a vector that will not end up being a multiple of 8 bytes long naturally
551 // gets padded out. Here I arrange to zero out any such padder word. This
552 // should not be very important since nobody should ever try to use that
553 // word. When the garbage collector copies material around it transcribes
554 // the whole vector (including the padder), but it should never try to trace
// through it. By tidying this up here I can feel that I do not have any
556 // need to worry about it elsewhere.
557     if (!SIXTY_FOUR_BIT && allocSize != size)
558         *reinterpret_cast<LispObject *>(r+allocSize-CELL) = 0;
559     return static_cast<LispObject>(r + tag);
560 }
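// An illustrative use of get_basic_vector(): make a small simple vector and
// give every element a safe value straight away, since the body is left
// uninitialised by the allocator. TYPE_SIMPLE_VEC, CELL, basic_elt and nil
// are the usual CSL names; this is a sketch rather than part of the kernel.
#if 0
LispObject sketchMakeVector(size_t nElements)
{   LispObject v = get_basic_vector(TAG_VECTOR, TYPE_SIMPLE_VEC,
                                    CELL*(nElements+1));  // +1 for the header
    for (size_t i=0; i<nElements; i++) basic_elt(v, i) = nil;
    return v;
}
#endif // 0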
561 
562 // This takes a vector (which can be one represented using an INDEXVEC)
563 // and reduces its size to a total value len. It returns the shorter
564 // vector. Only used on simple vectors. This is ONLY used when a hash table
565 // finds that the number of items in it has decreased dramatically and it
566 // wants to shrink. For big tables the index vector will decrease in size
567 // but each sub-vector stored in it will remain as it is. For smaller tables
568 // it can be the table itself that shrinks. When a vector shrinks I should
569 // put a padder in the vacated space so that it will still be possible to
570 // do linear scans of memory.
571 
LispObject reduce_basic_vector_size(LispObject v, size_t len)
573 {   size_t oldlen = doubleword_align_up(length_of_header(vechdr(v)));
574     setvechdr(v, TYPE_SIMPLE_VEC + (len << (Tw+5)) + TAG_HDR_IMMED);
575     len = doubleword_align_up(len);
576     if (len != oldlen) setvechdr(v + len, makeHeader(oldlen-len, TYPE_PADDER));
577     return v;
578 }
579 
void newRegionNeeded()
581 {
// This is where a page is full up. Or at least where the region within a page
583 // up to the next pinned region was full up! Or if the user asked for a GC.
584 // I will set padders everywhere even if I might think I have done so
585 // already, just so I am certain.
586     for (unsigned int i=0; i<maxThreads; i++)
587     {   if (myChunkBase[i] != nullptr)
588             myChunkBase[i]->chunkFringe = fringeBis[i];
589     }
590 // Here I will put in a padder that may lie between Chunks. I think this
591 // should not be necessary!
592     size_t gap = gLimit - gFringe;
593     if (gap != 0) setHeaderWord(gFringe, gap);
594 // Next if I will be building up to a full GC I can usually just allocate
595 // another Page from the list of free Pages. Note that if userGcRequest has
596 // the value GcStyleMajor I need to force a full collection even if there
597 // is plenty of space available and if it is GcStyleMinor I must force
598 // a minor GC.
599     if ((!generationalGarbageCollection ||
600          !garbage_collection_permitted ||
601          previousPage == nullptr) &&
602         userGcRequest != GcStyleMinor)
603     {   if ((busyPagesCount >= freePagesCount+mostlyFreePagesCount ||
604              userGcRequest == GcStyleMajor) &&
605             !withinMajorGarbageCollection)
606         {   cout << "@@ full GC needed\r\n";
607             userGcRequest = GcStyleNone;
608             {   std::lock_guard<std::mutex> lock(mutexForGc);
609                 activeHelpers = -activeHelpers;
610             }
611             cvForCopying.notify_all();
612             fullGarbageCollect();
613         }
614         else
615         {
616 // Here I can just allocate a next page to use... because memory is less
617 // than half full. At some stage I will want to worry quite hard about
618 // fragmentation and the "half full" issue!
            if (previousPage == nullptr) busyPagesCount++;
620             previousPage = currentPage;
621 // I must talk through an interaction between pinned data and my write
// barrier. Suppose some data is pinned and that, in addition to the ambiguous
// references to it, it is pointed to from ancient structures. During a
624 // garbage collection in which it is pinned the data there will not be moved
625 // and so the pointers from ancient areas remain safe.
626 // Now suppose that during the next garbage collection there are no ambiguous
627 // pointers to this data. Further suppose that it could be in a previous page.
628 // That could happen either because I used the partly pinned page as
629 // a "current" one and allocated more new material within in, or if I tried
630 // to dispose of pinning as soon as possible by treating all partly pinned
631 // pages as previous.
632 // Because the data is no longer pinned it gets evacuated. So all references
633 // to it will need updating. Those in the stable part of the heap and
634 // those within pinned regions in partly-free data may risk being missed
635 // out. So I will need to apply the write barrier to every location where
636 // there is a precise pointer to a pinned item.
637 // So I should worry about the sequence
638 //   (1)  create pointer to Y from location X
639 //   (2)  location X ends up in stable heap region, typically because
640 //        X was evacuated.
641 //   (3)  Y is in a previous page. It gets pinned and then a minor GC leaves
642 //        it in a mostly-free page
643 //   (4)  at a subsequent minor GC Y is no longer pinned and gets evacuated.
644 //        but we then need to be certain that X gets updated.
645 // This is all OK provided X is tagged using the write barrier and that it
646 // never gets tagged as "clean" again until it is seen pointing to something
// that is in the stable part of the heap.
648 //
649 // Now I have written out the above I believe that my game-plan should be
// that (a) unless desperate I never make a page that has some pinned stuff
651 // on it "current" - after garbage collection pages that are free apart from
652 // some pinned chunks are treated as being in quarantine for so long as the
653 // pinning persists.
654 // (b) Whenever I do anything that notices a (precise) pointer to a pinned
655 // item then the location holding that pointer is tagged as dirty. That
656 // issue becomes critical when the dirty location ends up in the stable
657 // region or in a partly-pinned page.
658 // (c) all "mostly free" pages are considered to be previous pages so that
659 // if items in them are NOT (now) pinned those items can be evacuated
660 // out from them even on a minor GC. And if a chunk ends up with no
661 // current pins and if all live data is evacuated from it then its space
662 // is available for re-use, and if the page ends up with no pinned chunks
663 // it becomes a fully free page.
664 // In the light of the above discussion I will use a fully free page next
665 // if there is one. In pathological situations I may need to drop back
666 // and use a mostly-free one.
667             {   std::lock_guard<std::mutex> guard(mutexForFreePages);
668                 grabNewCurrentPage(withinMajorGarbageCollection);
669             }
670         }
671     }
672     else
673     {   cout << "@@ minor GC needed\r\n";
674         userGcRequest = GcStyleNone;
675         generationalGarbageCollect();
676     }
677 }
678 
void releaseOtherThreads()
680 {
681 //  cout << "@@ unlock any other threads at end of page allocation\r\n";
682 // Now I need to be confident that the other threads have all accessed
683 // gc_started. When they have they will increment activeThreads and when the
684 // last one does that it will notify me.
685     {   std::unique_lock<std::mutex> lock(mutexForGc);
686 // Note that if my entire system had only one thread then the condition
687 // tested here would always be true and the computation would not pause at
688 // all.
689         cv_for_gc_busy.wait(lock,
690                             []{   uint32_t n = activeThreads.load();
691                                   return (n & 0xff) == ((n>>8) & 0xff) - 1;
692                               });
693     }
// OK, so now I know that all the other threads are ready to wait on
// gc_complete, so I ensure that useful variables are set ready for next
696 // time and release all the threads that have been idling.
697     {   std::lock_guard<std::mutex> guard(mutexForGc);
698         gc_started = false;
699         activeThreads.fetch_add(0x0000001);
700         gc_complete = true;
701     }
702     cv_for_gc_complete.notify_all();
703 }
704 
void ensureOtherThreadsAreIdle()
706 {
// When I get here I know that activeThreads==0 and that all other threads
// have just decremented that variable and are ready to wait on gc_started.
// Before I release them from that I will ensure that gc_complete is false so
710 // that they do not get over-enthusiastic!
711     {   std::lock_guard<std::mutex> guard(mutexForGc);
712         activeHelpers--;
713         gc_complete = false;
714         gc_started = true;
715     }
716     cv_for_gc_idling.notify_all();
717 }
718 
void garbageCollectOnBehalfOfAll()
720 {   ensureOtherThreadsAreIdle();
721 //    cout << "@@ garbageCollectOnBehalfOfAll called\r\n";
722 // Now while the other threads are idle I can perform some garbage
723 // collection and fill in results via result[] based on request[].
724 // I will also put gFringe back to the value it had before any thread
725 // had done anything with it.
726     restoreGfringe();
727 // When I get here it is as if every thread had known to pause right at the
// very start of a call to get_n_bytes() with request[threadId] showing
729 // how much space it was trying to allocate.
730 //
731 // The "for" loop here is to keep going until I have managed to satisfy all
732 // the pending allocation requests.
733     for (;;)
734     {   unsigned int pendingCount = 0;
735 // Here each thread can have a request that it is making. At least one of
736 // these was such that I had to trigger this GC step. However it may be
737 // possible to satisfy some of them either because they are for more modest
738 // amounts of memory than the GC triggering one or by moving past a pinned
739 // region in the current page.
740         tryToSatisfyAtLeastOneThread(pendingCount);
741 // If there are no pending requests left that may be because using space
742 // beyond pinnings allowed me to cope. But if there are still some left
743 // over here I will need to allocate a fresh page of space. If I am adopting
744 // a major GC strategy I just allocate another page until my whole memory is
745 // about half full, but with a generational GC I will run some GC activity
// each time I get here, evacuating a single page. If the user has
747 // explicitly requested a GC then I had better do one.
748         if (pendingCount == 0 &&
749             userGcRequest == GcStyleNone) break;
750         newRegionNeeded();
751     }
752     releaseOtherThreads();
753 }
754 
755 extern void gcHelper();
756 
void waitWhileAnotherThreadGarbageCollects()
758 {
759 // If I am a thread that will not myself perform garbage collection I need
760 // to wait for the one that does. This needs to be done in two phases. During
761 // the first I know that I have decremented activeThreads and determined that
762 // I was not the last, but I can not be certain that all other threads have
763 // done that. I wait until gc_started gets set, and that happens when some
764 // thread has found itself to be the last one. And so by construction that
765 // means that all other threads will have reached here.
766 //
767 // I need every idle thread to be woken up here. So when the master GC thread
768 // starts  it can set gc_started and notify everybody through the condition
769 // variable. It can not know how long each will take to notice that so it
770 // must not clear gc_started until it has had positive confirmation that
771 // all the idle threads have responded.
772     {   std::unique_lock<std::mutex> lock(mutexForGc);
773         activeHelpers--;
774         cv_for_gc_idling.wait(lock, [] { return gc_started; });
775     }
776 // To record that threads have paused they can then increment activeThreads
777 // again. When one of them increments it to the value threadcount-1 I will
// know that all the idle threads have got here, and I notify the master GC
779 // thread.
780     bool inform = false;
781     {   std::lock_guard<std::mutex> guard(mutexForGc);
782         uint32_t n = activeThreads.fetch_add(0x000001);
783         if ((n & 0xff) == ((n>>8) & 0xff) - 2) inform = true;
784     }
785     if (inform) cv_for_gc_busy.notify_one();
786 // This is where the thread is paused and so is available to be used as
787 // a helper for the GC. Hmmm I will need to arrange that the fringe and
788 // limit pointers associated with this thread get set to GC new space after
789 // the thread has got around to participating in GC but before the helper
790 // here does very much. However about the first thing that this Helper will
791 // do will be to wait for chunkStack to become non-empty, so provided I set
792 // everything up before I push any Chunks (and provided I keep chunkStack
793 // empty between GCs) I will be OK! There is one more potential issue.
// If a thread had been about to block (perhaps on an IO request or perhaps
795 // just because it is a user thread needing to block on a user-managed mutex)
796 // that thread will not get here. So I probably need to adjust my scheme that
797 // allows for blocking actions within Lisp so that the threads involved
798 // can nevertheless participate in GC. I think that might be nasty because
799 // then a blocked thread must be prepared to be woken up to participate in
800 // the garbage collector and so must respond either when the GC asks for
// help or when whatever it was being blocked by tells it to. Hmmm there
// are going to be two sorts of cases.
// (1) The blocking action is a use of a Lisp synchronization primitive. Well
804 //     in such cases the action can not be released while the GC is active
805 //     because no Lisp threads run then! That may simplify matters at least
806 //     a little.
807 // (2) Things such as awaiting user input from keyboard or mouse. For such
808 //     cases I suspect that the actual blocking call to the keyboard or
809 //     mouse handler will need to be in a separate thread so then the
810 //     main one can wait for both that and for any GC request.
811 // There is a bit of a nuisance in that it is only possible to wait on
812 // one condition variable at a time. So I think that all of these may end up
813 // signalling and waiting on the condition variable(s) used to moderate entry
// to the GC, and so every lisp-level synchronization event is liable to
815 // notify same. Well the nature of condition variables is that spurious
816 // wake-ups should be tolerated, and the GC use of them should not be that
817 // frequent... But still "Ugh!".
818     int helpers;
819     {   std::unique_lock<std::mutex> lock(mutexForGc);
820         cvForCopying.wait(lock,
821             [] { return activeHelpers >= 0; });
822         helpers = activeHelpers;
823     }
824     if (helpers != 0) gcHelper();
825 // The gcHelper() function must terminate once it has completed its task.
826 // Note that if the thread that actually processes things manages to avoid
827 // any genuine GC activity (for instance it just allocates a fresh Page)
828 // gcHelper must not do anything!
829 //
830 // Once the master thread has been notified as above it can go forward and
831 // in due course notify gc_complete. Before it does that it must ensure that
832 // it has filled in results for everybody, incremented activeThreads to
833 // reflect that it is busy again and made certain that gc_started is false
834 // again so that everything is tidily ready for next time.
835     {   std::unique_lock<std::mutex> lock(mutexForGc);
836         cv_for_gc_complete.wait(lock, [] { return gc_complete; });
837     }
838     fringe = fringeBis[threadId];
839 //    cout << "At " << __WHERE__ << " fringe set to fringeBis = " << fringe << "\r" << endl;
840 }
841 
842 // Here I have just attempted to allocate n bytes but the attempt failed
843 // because it left fringe>=limit. I must synchronize with all other
844 // threads and one of the threads (it may not be me!) must garbage collect.
845 // When they synchronize with me here the other threads will also have tried
846 // an allocation, but the largest request any is allowed to make is
847 // VECTOR_CHUNK_BYTES (at present 2 megabyte). If all the maxThreads do
848 // this they can have caused fringe to overshoot by about an amount
849 // maxThreads*VECTOR_CHUNK_BYTES and if that caused uintptr_t arithmetic to
850 // overflow and wrap round then there could be big trouble. So when I
851 // allocate chunks of memory I ought to ensure that none has an end-address
852 // that close to UINTPTR_MAX! I think that on all realistic systems that is
853 // a problem that will not actually arise.
854 //
855 // Note that I get here not just at the end of a full Page but also each time
856 // I need to skip past a pinned chunk within a page, so having too many
857 // pinned chunks would lead to quite a lot of synchronization overhead.
// Well I think that is just how things are! To reduce costs if after GC
859 // I find that a page has pinned blocks in I may want to delay re-using
860 // it for allocation, and I may want the generational collector to be
861 // ready to evacuate stuff from such pages almost as soon as it ceases to
862 // be pinned.
863 //
864 
865 std::condition_variable cvForCopying;
866 
uintptr_t difficult_n_bytes()
868 {
869 // Every thread that comes here will need to record the value of its
870 // stack pointer so that the thread that ends up performing garbage
871 // collection can identify the regions of stack it must scan. For that to
872 // be proper the code must not be hiding values in register variables. The
873 // function "withRecordedStack()" tries to arrange that, so that when the
874 // body of code is executed stackFringes[] has that information nicely
875 // set up.
876 // First I need to ensure that all other threads will notice that something
877 // has to be done!
878     for (unsigned int i=0; i<maxThreads; i++) limit[i].store(0);
879     withRecordedStack([&]
880     {
881 // The next line will count down the number of threads that have entered
882 // this synchronization block. The last one to do so can serve as the
883 // thread that performs garbage collection, the other will just need to wait.
884 // The fetch_sub() operation here may cost much more than simple access to
// a variable, but I am in a context where I am about to do things costing a
886 // lot more than that.
887         int32_t a = activeThreads.fetch_sub(0x000001);
888 // The low byte of activeThreads counts the number of Lisp threads properly
889 // busy in the mutator. When it returns a value > 1 it means that at least
890 // one other thread has not yet joined in with this synchronization. It will
891 // be that last thread that actually performs the GC, so the current one
892 // has nothing to do - it must just sit and wait! When the final thread
893 // performs the fetch_sub() it will know that every other thread is now
894 // quiescent and it can perform as the master garbage collection thread.
895         if ((a & 0xff) > 1) waitWhileAnotherThreadGarbageCollects();
896         else garbageCollectOnBehalfOfAll();
897 // I must arrange that threads continue after idling only when the master
898 // thread has completed its work.
899 //
900     });
901 // At the end the GC can have updated the fringe for each thread,
902 // so I need to put its updated value in the correct place.
903     fringe = fringeBis[threadId];
904 //    cout << "At " << __WHERE__ << " fringe set to fringeBis = " << hex << fringe << dec << "\r" << endl;
905 #ifdef DEBUG
906     testLayout();
907 #endif
908     return result[threadId] - TAG_VECTOR;
909 }
910 
911 // As well as shrinking vectors the hash table code can want to "borrow"
912 // space by allocating vectors (never lists) from the half of memory that
913 // the copying garbage collector is not keeping live data in at present.
914 // The protocol I have is that it goes
915 //      {   Borrowing borrowObject;
916 //          ... get_vector() ... get_vector() ...
917 //      }
918 // and it MUST be coded so that it can not trigger garbage collection while
919 // the "borrowed" vectors are in use. It does not have any specific way of
920 // indicating when the space is no longer needed, save that a subsequent
921 // call to prepare_for_borrowing() must not happen while borrowed space is
922 // still needed. The key use for this is when a hash table needs to be
923 // re-hashed - the code borrows space and copies the existing table contents
924 // into it. It then re-hashes everything back into the existing vector.
925 // Doing things that way really simplifies the hash table code, and avoids
926 // having the temporary space anything other than rather temporary and
927 // transient. However the scheme is not very thread-friendly! My current
928 // plan is that only one thread may be re-hashing (and hence borrowing) at
929 // once, and while that is happening any other thread that needs to trigger
930 // garbage collection will have to wait. I am putting in stubs of code here
931 // but the code to borrow memory will in fact be very similar to that used
932 // during garbage collection to allocate space in the new half-space when
933 // a vector needs to be evacuated to there.
934 // Note that other threads might be busy allocating memory during the
935 // time that one is borrowing, so they can need garbage collection. It will
936 // have to wait until the use of borrowed memory is over.
937 
938 // Perhaps I could invent and then use an alternative protocol so that each
939 // thread could do its own borrowing without messy clashes. Perhaps the main
940 // issue there is recovering memory when a thread has finished. To think
941 // about how vital that might be I would need to consider whether hash tables
942 // might need rehashing multiple times between garbage collections - if the
943 // answer is "no" then each hash table could have its own associated
944 // borrowed shadow ... that starts to sound sensible to me! I will get a LOT
945 // more of this code working before I worry about that detail.
946 
947 //thread_local Page *borrowPages;
948 //thread_local uintptr_t borrowFringe;
949 //thread_local uintptr_t borrowLimit;
950 //thread_local uintptr_t borrowNext;
951 
952 // Here I need to arrange that if several threads each try to borrow memory
953 // at the same (or overlapping) times that they end up with separate
954 // chunks. I do this by letting each grab memory from mostlyFreePages and
955 // freePages, but with a mutex to protect the allocation. Then when borrowing
956 // is complete I push stuff back. I do not change the recorded counts of
957 // free pages.
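// A sketch of what an RAII guard for the borrowing protocol might look like.
// The real Borrowing class lives elsewhere in CSL; this just illustrates the
// "set up on entry, hand every borrowed page back on exit" shape using the
// thread-local borrow variables and page chains defined in this file.
#if 0
class SketchBorrowing
{public:
    SketchBorrowing()
    {   borrowPages = nullptr;       // no pages borrowed yet, so the first
        borrowFringe = 0;            // request will grab one from the free
        borrowLimit = 0;             // or mostly-free chains
        borrowNext = 0;
    }
    ~SketchBorrowing()
    {   std::lock_guard<std::mutex> lock(mutexForFreePages);
        while (borrowPages != nullptr)   // return every borrowed page
        {   Page *w = static_cast<Page *>(borrowPages);
            borrowPages = w->chain;
            w->chain = mostlyFreePages;  // counts deliberately left alone
            mostlyFreePages = w;
        }
    }
};
#endif // 0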
958 
LispObject borrow_n_bytes(size_t n)
960 {   for (;;)
961     {   size_t gap=borrowLimit - borrowFringe;
962         if (n <= gap)
963         {   uintptr_t r = borrowFringe;
964             borrowFringe = borrowFringe + n;
965             return static_cast<LispObject>(r);
966         }
967         if (borrowNext != 0)
968         {   borrowFringe = static_cast<uintptr_t>(borrowNext);
969             borrowLimit = reinterpret_cast<uintptr_t *>(
970                 static_cast<uintptr_t>(borrowFringe))[0];
971             borrowNext = reinterpret_cast<uintptr_t *>(
972                 static_cast<uintptr_t>(borrowFringe))[1];
973             continue;
974         }
975 // here I need to allocate a new page....
976         std::lock_guard<std::mutex> lock(mutexForFreePages);
977         Page *w;
978         if (mostlyFreePages != nullptr)
979         {   w = mostlyFreePages;
980             mostlyFreePages = mostlyFreePages->chain;
981         }
982         else
983         {   w = freePages;
984             freePages = freePages->chain;
985         }
986         w->chain = static_cast<Page *>(borrowPages);
987         borrowPages = w;
988         borrowFringe = w->fringe;
989         borrowLimit = w->limit;
990         borrowNext = 0;    // BAD....
991     }
992 }
993 
LispObject borrow_basic_vector(int tag, int type, size_t size)
{   size_t allocSize = (size_t)doubleword_align_up(size);
    if (allocSize > (CSL_PAGE_SIZE - 32))
        return aerror1("request for basic vector too big",
                       fixnum_of_int(allocSize/CELL-1));
    LispObject r = borrow_n_bytes(allocSize);
    *(reinterpret_cast<Header *>(r)) = type + (size <<
                                       (Tw+5)) + TAG_HDR_IMMED;
    if (!SIXTY_FOUR_BIT && allocSize != size)
        *reinterpret_cast<LispObject *>(r+allocSize-CELL) = 0;
    return static_cast<LispObject>(r + tag);
}

LispObject borrow_vector(int tag, int type, size_t n)
{   if (n-CELL > VECTOR_CHUNK_BYTES)
    {   size_t chunks = (n - CELL + VECTOR_CHUNK_BYTES -
                         1)/VECTOR_CHUNK_BYTES;
        size_t lastSize = (n - CELL) % VECTOR_CHUNK_BYTES;
        if (lastSize == 0) lastSize = VECTOR_CHUNK_BYTES;
        LispObject v =
            borrow_basic_vector(TAG_VECTOR, TYPE_INDEXVEC, CELL*(chunks+1));
        for (size_t i=0; i<chunks; i++)
        {   size_t k = i==chunks-1 ? lastSize : VECTOR_CHUNK_BYTES;
            basic_elt(v, i) = borrow_basic_vector(tag, type, k+CELL);
        }
        return v;
    }
    else return borrow_basic_vector(tag, type, n);
}
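
// A worked example of the splitting arithmetic above: suppose (purely for
// illustration) that VECTOR_CHUNK_BYTES were 1 Mbyte and a vector with
// 2.5 Mbytes of data (plus its header CELL) were requested. Then
// chunks = ceiling(2.5M/1M) = 3 and lastSize = 0.5M, so an index vector with
// three elements is borrowed, and its elements become basic vectors holding
// 1M, 1M and 0.5M bytes of data respectively.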

// This code sets up an empty page - it is ONLY intended for use at the
// start of a run when there can not be any pinned items present anywhere.
// I put the code here adjacent to the code that allocates from the list of
// pages so that the setup and use can be compared.

void setUpEmptyPage(Page *p)
{   p->chain = nullptr;
    p->chunkCount = 0;
    p->chunkMapSorted = false;
    for (size_t i=0; i<sizeof(p->dirtyMap)/sizeof(p->dirtyMap[0]); i++)
        p->dirtyMap[i] = 0;
    for (size_t i=0; i<sizeof(p->dirtyMap1)/sizeof(p->dirtyMap1[0]); i++)
        p->dirtyMap1[i] = 0;
    for (size_t i=0; i<sizeof(p->dirtyMap2)/sizeof(p->dirtyMap2[0]); i++)
        p->dirtyMap2[i] = 0;
    p->hasDirty = false;
    p->dirtyChain = nullptr;
    p->hasPinned = false;
    p->pinChain = nullptr;
    p->pinnedChunks = nullptr;
    p->chain = freePages;
    freePages = p;
    p->pageClass = freePageTag;
    p->fringe = reinterpret_cast<uintptr_t>(&p->data);
    p->limit = reinterpret_cast<uintptr_t>(p) + sizeof(Page);
}

// Now something that takes a page where it must be left free apart from
// any pinned Chunks within it. The fringe and limit fields must be set up
// to reflect them. Note that this does NOT alter the "chain" field or set
// pageClass - those must be dealt with otherwise.
// The page concerned MUST have some clear space on it. If it was
// entirely full of pinned chunks this code would fail.

void setUpUsedPage(Page *p)
{   p->chunkCount = 0;
    p->chunkMapSorted = false;
    for (size_t i=0; i<sizeof(p->dirtyMap)/sizeof(p->dirtyMap[0]); i++)
        p->dirtyMap[i] = 0;
    for (size_t i=0; i<sizeof(p->dirtyMap1)/sizeof(p->dirtyMap1[0]); i++)
        p->dirtyMap1[i] = 0;
    for (size_t i=0; i<sizeof(p->dirtyMap2)/sizeof(p->dirtyMap2[0]); i++)
        p->dirtyMap2[i] = 0;
    p->hasDirty = false;
    p->dirtyChain = nullptr;
// Those Chunks that are on the pinChain need to be put into chunkMap.
// Other chunks will get added as they are allocated, but the pinned ones
// are there right from the start.
    for (Chunk *c = p->pinnedChunks; c!=nullptr; c=c->pinChain)
        p->chunkMap[p->chunkCount++] = c;
// I want the pinned chunks sorted so that the lowest address one comes
// first. That will ensure that when one comes to skip past a pinned
// chunk that the next chunk on the chain will be the next one up in
// memory.
    std::qsort(p->chunkMap, p->chunkCount, sizeof(p->chunkMap[0]),
               [](const void *a, const void *b)
               {   const Chunk *aa =
                       static_cast<const atomic<Chunk *>*>(a)->load();
                   const Chunk *bb =
                       static_cast<const atomic<Chunk *>*>(b)->load();
                   uintptr_t aaa = reinterpret_cast<uintptr_t>(aa);
                   uintptr_t bbb = reinterpret_cast<uintptr_t>(bb);
                   return aaa < bbb ? -1 :
                          aaa > bbb ? 1 : 0;
               });
    p->pinnedChunks = nullptr;
// Re-build the pinChain from the sorted table, working from the highest
// address downwards so that the lowest-address chunk ends up at the head.
// Note the i-1 indexing: the valid entries run from 0 to chunkCount-1.
    for (size_t i=p->chunkCount; i!=0; i--)
    {   p->chunkMap[i-1].load()->pinChain = p->pinnedChunks.load();
        p->pinnedChunks = p->chunkMap[i-1].load();
    }
// Start as if the page is utterly empty.
    p->fringe = reinterpret_cast<uintptr_t>(&p->data);
// Now it MAY be that the first part of memory is consumed by one (or a
// succession) of pinned chunks, or that the start of the page has a small
// vacant region terminated by a pinned chunk. I cope with this by setting
// the limit to either the start of the first pinned chunk or to the end of
// the whole page. In an extreme case this will leave fringe==limit. But
// that is not going to be a problem because the first time I try to
// perform allocation I will find that my chunk is empty and scan to grab
// another.
    if (p->pinnedChunks.load()!=nullptr)
        p->limit = reinterpret_cast<uintptr_t>(p->pinnedChunks.load());
    else p->limit = reinterpret_cast<uintptr_t>(p) + sizeof(Page);
}
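
// A small worked example (with invented addresses): if a page starts at
// 0x7f0000000000 and its only pinned chunk begins at 0x7f0000030000, then
// after setUpUsedPage() the fringe is the address of p->data near the start
// of the page and the limit is 0x7f0000030000. The space beyond that pinned
// chunk only comes into play later, when an allocation attempt finds
// fringe==limit and moves on to the next gap.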

void setVariablesFromPage(Page *p)
{
// Set the variables that are used when allocating within the active page.
// Here I require that when a page is set up the fringe and limit values
// stored within it reflect any pinned chunks. When I do this I set the
// limit for the current page equal to the fringe, and that will mean
// that the very first time I try to allocate I will arrange to set up
// a fresh Chunk. That seems nicer to me than creating that chunk here.
    uintptr_t thr = threadId;
    fringe = limit[thr] = limitBis[thr] = gFringe = p->fringe.load();
    myChunkBase[thr] = nullptr;
    gLimit = p->limit;
//    cout << "setVariablesFromPage\r\n";
//    cout << "At " << __WHERE__ << " gFringe = " << std::hex << gFringe << "\r" << endl;
//    cout << "At " << __WHERE__ << " gLimit = " << std::hex << gLimit << "\r" << endl;
//    cout << std::dec;
}

// This code allocates a segment by asking the operating system.
// It grabs a block that is aligned to sizeof(Page).
// The collection of segments must be stored in heapSegments[]
// such that their addresses are in ascending order, and in consequence of
// that allocating a new segment may shuffle existing ones in the tables.
// So the index of a segment in the tables may not be viewed as permanent.
//
// Returns false and does nothing if it can not grab the memory.

bool allocateSegment(size_t n)
{
// I will round up the size to allocate so it is a multiple of the
// page size. Note that this will be quite a large value!
    n = (n + pageSize - 1) & -pageSize;
    Page *r;
// If I have C++17 I can rely on alignment constraints and just allocate
// using new[]
#ifdef MACINTOSH
// I would like to use aligned allocation via "new" in the C++17 style, but
// on the Macintosh that is only supported if your operating system is at
// least 10.14. That is your operating system not a constraint on the release
// of the C++ compiler and library! For backwards compatibility at present I
// set a deployment target of 10.13 so I have to do something different here!
    {   size_t sz = n+pageSize-1;
        char *tr = new (std::nothrow) char[sz];
        heapSegmentBase[heapSegmentCount] = tr;
        void *trv = reinterpret_cast<void *>(tr);
// The size passed to std::align is the number of bytes that must fit in
// the aligned region, ie n.
        std::align(pageSize, n, trv, sz);
        r = reinterpret_cast<Page *>(trv);
    }
#else // MACINTOSH
    r = new (std::nothrow) Page[n/pageSize];
#endif // MACINTOSH
    if (r == nullptr) return false;
    heapSegment[heapSegmentCount] = r;
    heapSegmentSize[heapSegmentCount] = n;
    heapSegmentCount++;
// Now I need to arrange that the segments are sorted in the tables
// that record them.
    for (size_t i=heapSegmentCount-1; i!=0; i--)
    {   int j = i-1;
        void *h1 = heapSegment[i], *h2 = heapSegment[j];
        if (reinterpret_cast<uintptr_t>(h2) < reinterpret_cast<uintptr_t>(h1))
            break; // Ordering is OK
// The segment must sink a place in the tables.
        std::swap(heapSegment[i], heapSegment[j]);
        std::swap(heapSegmentSize[i], heapSegmentSize[j]);
    }
// r now refers to a new segment of size n, I want to structure it into
// pages.
//
//  for (size_t k=0; k<n; k+=CSL_PAGE_SIZE)
// Go forwards or backwards!
    for (size_t k=n; k!=0; k-=CSL_PAGE_SIZE)
    {   Page *p =
            reinterpret_cast<Page *>(
                reinterpret_cast<char *>(r) + k - CSL_PAGE_SIZE);
// Keep a chain of all the pages.
        setUpEmptyPage(p);
        freePagesCount++;
    }
    cout << freePagesCount << " pages available\r\n";
    return true; // Success!
}
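
// One benefit of keeping heapSegment[] sorted by address is that a test of
// whether an arbitrary bit-pattern points within the heap (as needed when
// treating stack contents as ambiguous pointers) can use binary search.
// The following is just an illustrative sketch - inHeap is a hypothetical
// name, not something defined elsewhere in this file:
//
// static bool inHeap(uintptr_t a)
// {   size_t lo = 0, hi = heapSegmentCount;
//     while (lo < hi)
//     {   size_t mid = lo + (hi-lo)/2;
//         uintptr_t base = reinterpret_cast<uintptr_t>(heapSegment[mid]);
//         if (a < base) hi = mid;
//         else if (a >= base + heapSegmentSize[mid]) lo = mid + 1;
//         else return true;
//     }
//     return false;
// }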

size_t pages_count = 0;
size_t heap_pages_count = 0;
size_t vheap_pages_count = 0;
bool garbage_collection_permitted = true;
bool force_verbos = false;
atomic<Page *> dirtyPages;
Page *globalPinChain;

// gc-forcer(a, b) should arrange that a garbage collection is triggered
// after at most A cons-sized units of consing or after at most B units of
// space has been used for vectors (where vectors include bignums and
// boxed floats). This is intended to be used to trigger garbage collection
// with rather fine control over when it happens, to help with debugging
// storage management issues.

bool next_gc_is_hard = false;
uint64_t force_cons=0, force_vec = 0;

LispObject Lgc_forcer(LispObject env, LispObject a, LispObject b)
{   if (force_cons != 0 || force_vec != 0)
        trace_printf("Remaining CONS : %" PRIu64 " VEC : %" PRIu64 "\r\n",
                     force_cons, force_vec);
// If you pass a non-fixnum then that leaves the trigger-point unchanged.
    if (is_fixnum(a)) force_cons = (uint64_t)sixty_four_bits(a);
    if (is_fixnum(b)) force_vec = (uint64_t)sixty_four_bits(b);
    return onevalue(nil);
}

LispObject Lgc_forcer1(LispObject env, LispObject a)
{   return Lgc_forcer(env, a, a);
}
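
// The force_cons and force_vec counters are intended to be consulted on the
// allocation paths, which live elsewhere. Purely as an indicative sketch
// (not the real code), the check in a CONS-like allocator would have
// roughly this shape:
//
//     if (force_cons != 0 && --force_cons == 0)
//     {   // ... arrange that a garbage collection is triggered soon,
//         // perhaps also setting next_gc_is_hard ...
//     }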

// When a thread exhausts memory (to be more specific, when it fills up
// the nursery page) it must arrange that all but one of the threads are
// stopped with information about their stacks visible somewhere central
// and all their active values on the stack rather than in machine registers.
// Then garbage collection can occur - or sometimes in simpler cases just
// the allocation of a fresh nursery page.
// To organize this threads need to be suspended. The following are the
// techniques that could potentially apply:
// (1) Busy-waiting on a suitable atomic flag. Hmm, busy waiting is generally
//     not a good strategy except for very short periods.
// (2) Arranging that the thread receives a signal that takes it into
//     a signal handler that sleeps, in such a manner that it can be woken
//     from the sleep by a further notification. Well the rules about
//     proper portable use of signals and their handlers make this hard to
//     arrange in a way that can be counted on across architectures, and I
//     do not really want to get down to that level of grungy detail and
//     verify it on Windows and with various Linux releases.
// (3) A thread can wait when it attempts to claim a semaphore, so for
//     each thread to be paused each will have to claim its own particular
//     semaphore. There are then two follow-on challenges - one is to
//     detect when every thread has become inactive and hence garbage
//     collection can proceed. The second arises when the threads release
//     their semaphores - somebody else must then lock them all with certainty
//     that that has happened before anybody runs out of memory again. There
//     are potential race conditions there and so further synchronization
//     steps are required. The associated complexity means that the apparent
//     simplicity of each thread "just needing to lock a mutex" is more
//     apparent than real.
// (4) Somewhat in the style of (3) I can use condition variables, and
//     all the threads that are to pause can wait on the same condition
//     variable with a notify_all() operation releasing them all. This is
//     what I have tried to code here and it still feels messier than I
//     would have liked. But this section of comments is to suggest what else
//     I considered and how I ended up with this plan; a schematic sketch of
//     the pattern follows below.
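
// To make option (4) concrete, here is a schematic of the condition-variable
// rendezvous. It is only a sketch of the general pattern, using the variables
// declared further down this file plus a hypothetical activeCount, and it
// leaves out the careful counting and re-arming that the real code needs:
//
//     // A thread that has run out of memory or been asked to pause:
//     {   std::unique_lock<std::mutex> guard(mutexForGc);
//         if (--activeCount == 0) cv_for_gc_busy.notify_one();
//         cv_for_gc_complete.wait(guard, []{ return gc_complete; });
//         ++activeCount;
//     }
//
//     // The thread that is to perform the garbage collection:
//     {   std::unique_lock<std::mutex> guard(mutexForGc);
//         cv_for_gc_busy.wait(guard, []{ return activeCount == 0; });
//         // ... do the collection ...
//         gc_complete = true;
//     }
//     cv_for_gc_complete.notify_all();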

std::jmp_buf *buffer_pointer;

// Each thread will need a thread_number and I need to be able to allocate
// and release such identifying numbers. I will allow for up to 64 threads.

std::mutex threadStartingMutex;
// threadMap will have a zero bit in places that correspond to thread
// numbers that are allocated.

uint64_t threadMap = -1;

uint64_t threadBit(unsigned int n)
{   return (uint64_t)1 << (63-n);
}

unsigned int allocateThreadNumber()
{   my_assert(threadMap != 0); // I need at least one spare.
    unsigned int n = nlz(threadMap);
// Now n is 0 if the top bit is set, 1 for the next etc down to 63 when
// the least bit is the only one set.
    threadMap &= ~threadBit(n);
    return n;
}

void releaseThreadNumber(unsigned int n)
{   my_assert(n <= 63);
    threadMap |= threadBit(n);
}
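
// A small worked example: initially threadMap is all ones, so nlz(threadMap)
// is 0 and the first caller of allocateThreadNumber() gets number 0, which
// clears the top bit. threadMap is then 0x7fffffffffffffff, nlz() of that is
// 1, so the next caller gets number 1, and so on. releaseThreadNumber(0)
// sets the top bit again, making number 0 available for re-use.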

ThreadStartup::ThreadStartup()
{   // cout << "ThreadStartup" << "\r" << endl;
    initThreadLocals();
    std::lock_guard<std::mutex> lock(mutexForGc);
    threadId = allocateThreadNumber();
// The update here is just fine while I am in fact single threaded, but I
// will need to review it when multiple threads can be in play.
    activeThreads.fetch_add(0x00010101);
}

ThreadStartup::~ThreadStartup()
{   // cout << "~ThreadStartup" << "\r" << endl;
    std::lock_guard<std::mutex> lock(mutexForGc);
    releaseThreadNumber(static_cast<uintptr_t>(threadId));
    activeThreads.fetch_sub(0x00010101);
}

LispObject *nilSegment, *stackSegment;

uintptr_t               stackBases[maxThreads];
uintptr_t               stackFringes[maxThreads];
extern atomic<uint32_t> threadCount;
std::mutex              mutexForGc;
std::mutex              mutexForFreePages;
bool                    gc_started;
std::condition_variable cv_for_gc_idling;
std::condition_variable cv_for_gc_busy;
bool                    gc_complete;
std::condition_variable cv_for_gc_complete;
atomic<uintptr_t>       limit[maxThreads];
Chunk*                  myChunkBase[maxThreads];
uintptr_t               limitBis[maxThreads];
uintptr_t               fringeBis[maxThreads];
size_t                  request[maxThreads];
LispObject              result[maxThreads];
size_t                  gIncrement[maxThreads];
atomic<uintptr_t>       gFringe;
uintptr_t               gLimit = 0xaaaaaaaaU*0x80000001U;


void initHeapSegments(double storeSize)
//
// This function just makes nil and the pool of page-frames available.
// The store-size is passed in units of kilobytes, and as a double rather
// than as an integer so that overflow is not an issue.
{
// Most of the arrays initialized here are just set up for the sake of
// being tidy, but myChunkBase[] must be nullptr for safety.
    for (unsigned int i=0; i<maxThreads; i++)
    {   limit[i] = 0U;
        myChunkBase[i] = nullptr;
        limitBis[i] = 0U;
        fringeBis[i] = 0U;
        request[i] = 0U;
        result[i] = 0;
        gIncrement[i] = 0U;
    }
    globalPinChain = nullptr;
// I will make the default initial store size around 64M on a 32-bit
// machine and 2048M on a 64-bit system. If the user specified a "-K" option
// they can override this, and also the system will tend to allocate more
// space (if it can) when its memory starts to get full.
    size_t freeSpace = static_cast<size_t>(SIXTY_FOUR_BIT ? 2048 : 64) *
                       1024*1024;
    size_t req = (size_t)storeSize;
    if (req != 0) freeSpace = 1024*req;
// Now freeSpace is the amount I want to allocate. I will explicitly
// set the variables that are associated with tracking memory allocation
// to keep everything as clear as I can.
    heapSegmentCount = 0;
    for (int i=0; i<16; i++)
        heapSegment[i] = reinterpret_cast<void *>(-1);
    freePages = mostlyFreePages = nullptr;
    cout << "Allocate " << (freeSpace/1024U) << " Kbytes" << "\r" << endl;
    allocateSegment(freeSpace);

    nilSegment = reinterpret_cast<LispObject *>(
        new (std::nothrow) Align8[(NIL_SEGMENT_SIZE)/8]);
    if (nilSegment == nullptr) fatal_error(err_no_store);
    nil = static_cast<LispObject>((uintptr_t)nilSegment + TAG_SYMBOL);
    stackSegment = reinterpret_cast<LispObject *>(
        new (std::nothrow) Align8[CSL_PAGE_SIZE/8]);
    if (stackSegment == nullptr) fatal_error(err_no_store);
    stackBase = reinterpret_cast<LispObject *>(stackSegment);
    previousPage = nullptr;
    currentPage = freePages;
    freePages = freePages->chain;
    freePagesCount--;
    currentPage->chain = nullptr;
    busyPages = currentPage;
    busyPagesCount = 1;
    setVariablesFromPage(currentPage);
    mostlyFreePages = nullptr;
    mostlyFreePagesCount = 0;

#if 0
//- Now as a temporary issue I will try to test my write barrier and
//- pinning scheme. For the write barrier I do not need any data in the
//- pages concerned, but for pinning I need much of the memory to be full -
//- what I do here is make it roughly (2/3) full.
    cout << "Total mem = " << freeSpace << "\r" << endl;
    size_t conses = freeSpace/(2*sizeof(LispObject));
    cout << "conses = " << conses << "\r" << endl;
    size_t which[5];
    for (int j=0; j<5; j++)
        which[j] = arithlib::mersenne_twister() % (conses/3);
    LispObject barriered[5];
    for (int j=0; j<5; j++) barriered[j] = fixnum_of_int(j);
    for (size_t i=0; i<conses/3; i++)
    {   LispObject a = cons(nil, nil);
        for (int j=0; j<5; j++)
            if (i == which[j]) barriered[j] = a;
    }
    for (int j=0; j<5; j++)
    {   uintptr_t n1 = static_cast<uintptr_t>(barriered[j]);
        cout << "Barrier on " << std::hex << n1 << std::dec << "\r" << endl;
        write_barrier(reinterpret_cast<LispObject *>(n1),
                      *reinterpret_cast<LispObject *>(n1));
    }
    cout << "About to scan all the dirty cells\r\n";
    scanDirtyCells(
        [](atomic<LispObject> *a) -> void
        {   cout << std::hex << reinterpret_cast<intptr_t>(a) << std::dec
                 << "\r" << endl;
        });
    cout << "Dirty cells scanned\r\n";
    for (int i=0; i<5; i++)
    {   uint64_t n1;
// I want to conjure up an address that is within the region that is so far
// in use. This may point at page or chunk headers, in which case it ought
// not to mark anything.
        n1 = arithlib::mersenne_twister();
        cout << "Use " << std::hex << n1 << " as ambiguous" << std::dec << "\r" << endl;
        processAmbiguousValue(true, n1);
        n1 = reinterpret_cast<int64_t>(heapSegment[0]) +
             (n1 % heapSegmentSize[0]);
        n1 = n1 & ~UINT64_C(7);
        cout << "Use " << std::hex << n1 << " as ambiguous" << std::dec << "\r" << endl;
        processAmbiguousValue(true, n1);
        n1 = barriered[i];
        cout << "Use " << std::hex << n1 << " as ambiguous" << std::dec << "\r" << endl;
        processAmbiguousValue(true, n1);
    }
    cout << "About to scan all the pinned chunks\r\n";
    scanPinnedChunks(
        [](Chunk *c) -> void
        {   cout << "Chunk at "
                 << std::hex << reinterpret_cast<intptr_t>(c)
                 << " to " << (reinterpret_cast<intptr_t>(c)+c->length)
                 << std::dec << "\r" << endl;
        });
    cout << "Pinned chunks scanned\r\n";

// End of temp testing code
#endif // 0

}

void dropHeapSegments()
{
    for (size_t i=0; i<heapSegmentCount; i++)
    {
#ifdef MACINTOSH
        delete [] heapSegmentBase[i];
#else // MACINTOSH
        delete [] static_cast<Page *>(heapSegment[i]);
#endif // MACINTOSH
    }
    delete [] reinterpret_cast<Align8 *>(nilSegment);
    delete [] reinterpret_cast<Align8 *>(stackSegment);
}

void drop_heap_segments()
{   dropHeapSegments();
}

// This allocates another page of memory if that is allowed and if it is
// possible. It returns true on success.

bool allocate_more_memory()
{   return false;
//  if ((init_flags & INIT_EXPANDABLE) == 0) return false;
//  void *page = (void *)std::aligned_alloc((size_t)CSL_PAGE_SIZE);
//  if (page == nullptr)
//  {   init_flags &= ~INIT_EXPANDABLE;
//      return false;
//  }
//  else
//  {   pages[pagesCount++] = page;
//      return true;
//  }
}

double maxStoreSize = 0.0;

void grab_more_memory(size_t npages)
{
// Here I grab more memory (if I am allowed to).
// An initial version here, and one still suitable on machines that will
// have plenty of real memory, will be to define ok_to_grab_memory(n) as
// 3*n + 2. This expands until the proportion of the heap active at the
// end of garbage collection is less than 1/4.
// If the attempt to grab more memory fails I clear the bit in init_flags
// that allows me to try to expand, so I will not waste time again.  If
// HOLD_BACK_MEMORY was asserted (for machines where grabbing all seemingly
// available memory may cause a crash) I do not try this operation.  The
// aim of keeping the heap less than half full is an heuristic and could be
// adjusted on the basis of experience with this code.
// On systems where it is possible to measure the amount of available
// real memory more sophisticated calculations may be possible.
    if (init_flags & INIT_EXPANDABLE)
    {   size_t ideal = MAX_PAGES; //
//@@@@ ok_to_grab_memory(heap_pagesCount + vheap_pagesCount);
        size_t more;
        if (ideal > MAX_PAGES) ideal = MAX_PAGES;
        if (maxStoreSize != 0.0)
        {   double pageLimit = maxStoreSize*1024*1024/static_cast<double>
                               (CSL_PAGE_SIZE);
// Limit memory to (about) the amount the user indicated with --max-mem
            size_t plim = (size_t)pageLimit;
            if (ideal > plim) ideal = plim;
        }
        if (ideal > pagesCount)
        {   more = ideal - pagesCount;
            while (more > 0)
            {   if (!allocate_more_memory()) break;
                more--;
            }
        }
    }
}
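
// For the record, the arithmetic behind that 1/4 figure (reading
// ok_to_grab_memory(n) as the number of extra pages it is reasonable to
// grab when n pages are in use at the end of a collection): the heap may
// then grow to n + (3*n+2) = 4*n+2 pages, so the live data occupies
// n/(4*n+2) of the whole, which is always a little under one quarter.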

void init_heap_segments(double d)
{   cout << "init_heap_segments " << d << "\r" << endl;
    size_t mem;
#ifdef WIN32
    mem = getMemorySize();
#else // WIN32
// BEWARE: _SC_PHYS_PAGES is a glibc extension and is not mandated by POSIX.
// However it (maybe obviously) should work on all variants of Linux and
// experimentally it works on a Macintosh, and on Windows there is a clear
// and proper alternative... Also a FreeBSD manual page suggests that
// _SC_PHYS_PAGES will (often?) be available there. That covers some of the
// more important platforms so anybody with some alternative that is more
// specialised can patch in their own code here!
    long pageCount = sysconf(_SC_PHYS_PAGES);
    long pageSize = sysconf(_SC_PAGE_SIZE);
    mem = pageCount*static_cast<size_t>(pageSize);
#endif // WIN32
    mem /= (1024*1024);    // Physical memory now in megabytes
    size_t g3 = 3u*1024u;  // 3Gbytes
    if (mem <= 2*g3) mem = g3;
    else mem -= g3;
// I think if my machine has at most 6GB I will default to using 3GB. If it
// has more than that I will use (physmem-3G) up to the stage that I
// use a total of 16GB. All subject to any --maxmem that the user had
// specified.
    static const size_t K = 16384;
    if (K < mem) mem = K;
    if (d == 0.0) d = 1024.0*1024.0*mem; // back to bytes
    if (maxStoreSize != 0.0 && maxStoreSize < d) d = maxStoreSize;
// I have to pass the amount to initHeapSegments in kilobytes. On a 32-bit
// machine I will limit myself to 1.6G here, because trying to use 2G or
// more starts to risk muddle with sign bits and address arithmetic overflow.
    if (!SIXTY_FOUR_BIT) d = 1600.0*1024.0*1024.0;
    initHeapSegments(d/1024.0);
}
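
// A worked example of that sizing rule: on a 64-bit machine that reports
// 8 Gbytes of physical memory, mem starts as 8192 (Mbytes). That is more
// than 2*3072, so 3072 is subtracted leaving 5120, which is below the 16384
// cap. With no --maxmem given, d becomes 5120 Mbytes expressed in bytes and
// initHeapSegments() is then asked for 5120*1024 Kbytes.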

int64_t gc_number = 0;
int64_t reclaim_trap_count = -1;
uintptr_t reclaim_stack_limit = 0;
uint64_t reclaim_trigger_count = 0, reclaim_trigger_target = 0;

//static intptr_t cons_cells, symbol_heads, strings, user_vectors,
//       big_numbers, box_floats, bytestreams, other_mem,
//       litvecs, getvecs;

LispObject Lgc0(LispObject env)
{   return Lgc(env, fixnum_of_int(1));
}

GcStyle userGcRequest = GcStyleNone;

LispObject Lgc(LispObject env, LispObject a)
{
// If GC is called with a non-nil argument the garbage collection
// will be a full one - otherwise it will be incremental and may do hardly
// anything. This distinction will only apply once I have a generational
// collector implemented and so "incremental" collections become possible.
    for (unsigned int i=0; i<maxThreads; i++) limit[i].store(0);
// For now I will make (reclaim t) and (reclaim) force a major GC while
// (reclaim nil) will be a minor GC.
    userGcRequest = a==nil ? GcStyleMinor : GcStyleMajor;
    poll();
    return nil;
}

LispObject Lverbos(LispObject env, LispObject a)
// (verbos 0) or (verbos nil)       silent garbage collection
// (verbos 1) or (verbos t)         standard GC messages
// (verbos 2)                       messages when FASL module loaded
// (verbos 4)                       extra timing info for GC process
// These bits can be added to get combination effects, except that
// "4" has no effect unless "1" is present.
{   int code, old_code = verbos_flag;
    if (a == nil) code = 0;
    else if (is_fixnum(a)) code = static_cast<int>(int_of_fixnum(a));
    else code = 1;
    miscflags = (miscflags & ~GC_MSG_BITS) | (code & GC_MSG_BITS);
    return onevalue(fixnum_of_int(old_code));
}
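
// So, for example, (verbos 5) requests the standard GC messages together
// with the extra timing information (1+4), while (verbos 0) makes
// collections silent again. In each case the previous setting is returned
// as a fixnum so that it can be restored later.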

bool volatile already_in_gc;
bool volatile interrupt_pending;

// static int stop_after_gc = 0;

// bool force_verbos = false;

// bool garbage_collection_permitted = false;

// static void real_garbage_collector()
// {
// // I lift the real garbage collector to a separate function mainly
// // so that I can set breakpoints on it!
//     for (int i=0; i<=LOG2_VECTOR_CHUNK_BYTES; i++)
//         free_vectors[i] = 0;
//
// }

// LispObject reclaim(LispObject p, const char *why, int stg_class, size_t size)
// {   return p;
// }

static unsigned int MEM=2u*1024u*1024u*1024u;
bool pageFull;

LispObject Lgctest_0(LispObject env)
{   LispObject a = nil;
    for (unsigned int i=0; i<MEM/16u; i++)
    {   a = cons(fixnum_of_int(i), a);
        cout << ":" << std::flush;
        if (i % 1000000 == 0)
        {   cout << i;
            LispObject b = a;
            for (unsigned int j=i; j!=static_cast<unsigned int>(-1); j--)
            {   if (!is_cons(b)) my_abort("gc test failure");
                if (car(b) != fixnum_of_int(j)) my_abort("gc test failure");
                b = cdr(b);
            }
            if (b != nil) my_abort("gc test failure");
        }
    }
    return nil;
}

LispObject Lgctest_1(LispObject env, LispObject a1)
{   LispObject a = nil, b;
    size_t n = int_of_fixnum(a1);
    for (unsigned int i=0; i<n; i++)
        a = cons(fixnum_of_int(i), a);
    cout << "list created" << "\r" << endl;
    b = a;
    for (unsigned int j=n-1; j!=static_cast<unsigned int>(-1); j--)
    {   if (!is_cons(b)) goto failing2;
        if (car(b) != fixnum_of_int(j))
        {   cout << "Fail3 case with j = " << std::dec << j << "\r" << endl
                 << " fixnum_of_int(j) = " << std::hex << fixnum_of_int(j) << "\r" << endl
                 << " car(b) = " << car(b) << " which differs" << "\r" << endl
                 << " " << (n-1-j) << " items down the list" << "\r" << endl;
            goto failing3; //<<<<<<<<<
        }
        b = cdr(b);
    }
    if (b != nil) goto failing4;
    return nil;
failing2:
    cout << "Crashed2 " << std::hex << "b = " << b
         << " car(b) = " << car(b) << "\r" << endl;
    cout << "n = " << n << "\r" << endl;
    for (int z=1; z<10; z++)
    {   cout << std::dec << (car(b)/16) << " ";
        b = cdr(b);
    }
    cout << "\r" << endl;
    return nil;
failing3:
    cout << "Crashed3 " << std::hex << "b = " << b
         << " car(b) = " << car(b) << "\r" << endl;
    cout << "n = " << n << "\r" << endl;
    for (int z=1; z<10; z++)
    {   cout << std::dec << (car(b)/16) << " ";
        b = cdr(b);
    }
    cout << "\r" << endl;
    return nil;
failing4:
    cout << "Crashed4 " << std::hex << "b = " << b
         << " car(b) = " << car(b) << "\r" << endl;
    cout << "n = " << n << "\r" << endl;
    for (int z=1; z<10; z++)
    {   cout << std::dec << (car(b)/16) << " ";
        b = cdr(b);
    }
    cout << "\r" << endl;
    return nil;
}

LispObject Lgctest_2(LispObject env, LispObject a1, LispObject a2)
{   return nil;
}

// end of newallocate.cpp