1 /*
2    Copyright (c) 2005, 2021, Oracle and/or its affiliates.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
23 */
24 
25 #ifndef SUPER_POOL_HPP
26 #define SUPER_POOL_HPP
27 
28 #include <ndb_global.h>
29 
30 #include <pc.hpp>
31 #include <ErrorReporter.hpp>
32 
33 #define JAM_FILE_ID 281
34 
35 
36 /*
37  * SuperPool - super pool for record pools (abstract class)
38  *
39  * Documents: SuperPool GroupPool RecordPool<T>
40  *
41  * SUPER POOL
42  *
43  * A "super pool" is a shared pool of pages of fixed size.  A "record
44  * pool" is a pool of records of fixed size.  One super pool instance is
45  * used by a number of record pools to allocate their memory.  A special
46  * case is a "page pool" where a record is a simple page whose size
47  * divides super pool page size.
48  *
49  * A record pool allocates memory in pages.  Thus each used page is
50  * associated with one record pool and one record type.  The records on
51  * a page form an array starting at start of page.  Thus each record has
52  * an index within the page.  Any last partial record which does not fit
53  * on the page is disregarded.
54  *
55  * I-VALUE
56  *
57  * The old "i-p" principle is kept.  A reference to a super pool page or
58  * record is stored as an "i-value" from which the record pointer "p" is
59  * computed.  In super pool the i-value is a Uint32 with two parts:
60  *
61  * - "ip" index of page within super pool (high "pageBits")
62  * - "ir" index of record within page (low "recBits")
63  *
64  * At most 16 recBits are used, the rest are zero.
65  *
66  * The translation between "ip" and page address is described in next
67  * section.  Once page address is known, the record address is found
68  * from "ir" in the obvious way.
69  *
70  * One advantage of i-value is that it can be verified.  The level of
71  * verification can depend on compile options.
72  *
73  * - "v1" check i-value specifies valid page
74  * - "v2" check record type matches page type, see below
75  * - "v3" check record is in use
76  * - "v4" check unused record is unmodified
77  *
78  * Another advantage of a 32-bit i-value is that it extends the space of
79  * 32-bit addressable records on a 64-bit platform.
80  *
81  * MEMORY ROOT
82  *
83  * This super pool requires a "memory root" i.e. a memory address such
84  * that the index of a page "ip" satisfies
85  *
86  *   page address = memory root + (signed)ip * page size
87  *
88  * This is possible on all platforms, provided that the memory root and
89  * all pages are either on the heap or on the stack, in order to keep
90  * the size of "ip" reasonably small.
91  *
92  * The cast (signed)ip is done as integer of pageBits bits.  "ip" has
93  * same sign bit as i-value "i" so (signed)ip = (Int32)i >> recBits.
94  *
95  * RESERVED I-VALUES
96  *
97  * RNIL is 0xffffff00 (signed -256).  It is used everywhere in NDB as
98  * "null pointer" i.e. as an i-value which does not point to a record.
99  * In addition the signed values -255 to -1 are reserved for use by the
100  * application.
101  *
102  * An i-value with all "ir" bits set is used as terminator in free
103  * record list.  Unlike RNIL, it still has valid page bits "ip".
104  *
105  * Following restrictions avoid hitting the reserved values:
106  *
107  * - pageBits is <= 30
108  * - the maximum "ip" value 2^pageBits-1 (signed -1) is not used
109  * - the maximum "ir" value 2^recBits-1 is not used
110  *
111  * PAGE ENTRIES
112  *
113  * Each super pool page has a "page entry".  It contains:
114  *
115  * - page type
116  * - i-value of first free record on page
117  * - page use count, to see if page can be freed
118  * - pointers (as i-values) to next and previous page in list
119  *
120  * Page entry cannot be stored on the page itself since this prevents
121  * aligning pages to OS block size and the use of BATs for page pools in
122  * NDB.  For now the implementation provides an array of page entries
123  * with place for all potential (2^pageBits) entries.
124  *
125  * PAGE TYPE
126  *
127  * Page type is unique to the record pool using the super pool.  It is
128  * assigned in record pool constructor.  Page type zero means that the
129  * page is free i.e. not allocated to a record pool.
130  *
131  * Each "i-p" conversion checks ("v2") that the record belongs to same
132  * pool as the page.  This check is much more common than page or record
133  * allocation.  To make it cache effective, there is a separate page
134  * type array.  It truncates type to one non-zero byte.
135  *
136  * GROUP POOL
137  *
138  * Each record pool belongs to a group.  The group specifies minimum
139  * size or memory percentage the group must be able to allocate.  The
140  * sum of the minimum sizes of group pools is normally smaller than
141  * super pool size.  This provides unclaimed memory which a group can
142  * use temporarily to allocate more than its minimum.
143  *
144  * The record pools within a group compete freely for the available
145  * memory within the group.
146  *
147  * Typical exmaple is group of all metadata pools.  The group allows
148  * specifying the memory to reserve for metadata, without having to
149  * specify number of tables, attributes, indexes, triggers, etc.
150  *
151  * PAGE LISTS
152  *
153  * Super pool has free page list.  Each group pool uses it to allocate
154  * its own free page list.  And each record pool within the group uses
155  * the group's free list to allocate its pages.
156  *
157  * A page allocated to a record pool has a use count i.e. number of used
158  * records.  When use count drops to zero the page can be returned to
159  * the group.  This is not necessarily done at once.
160  *
161  * The list of free records in a record pool has two levels.  There are
162  * available pages (some free) and a singly linked free list within the
163  * page.  A page allocated to record pool is on one of 4 lists:
164  *
165  * - free page (all free, available, could be returned to group)
166  * - busy page (some free, some used, available)
167  * - full page (none free)
168  * - current page (list of one), see below
169  *
170  * Some usage types (temporary pools) may never free records.  They pay
171  * a small penalty for the extra overhead.
172  *
173  * RECORD POOL
174  *
175  * A pool of records which allocates its memory from a super pool
176  * instance via a group pool.  There are 3 basic operations:
177  *
178  * - getPtr - translate i-value to pointer-to-record p
179  * - seize - allocate record
180  * - release - free record
181  *
182  * CURRENT PAGE
183  *
184  * getPtr is a fast computation which does not touch the page entry.
185  * For seize (and release) there is a small optimization.
186  *
187  * The "current page" is the page of latest seize.  It is unlinked from
188  * its normal list and the free record pointer is stored under record
189  * pool instance.
190  *
191  * The page remains current until there is a seize and the page is full.
192  * Then the real page entry and its list membership are updated, and
193  * a new page is made current.
194  *
195  * This implies that each (active) record pool allocates at least one
196  * page which is never returned to the group.
197  *
198  * PAGE POLICY
199  *
200  * A group pool returns its "excess" (above minimum) free pages to the
201  * super pool immediately.
202  *
203  * Allocating a new page to a record pool is expensive due to free list
204  * setup.  Therefore a record pool should not always return empty pages
205  * to the group.  Policies:
206  *
207  * - "pp1" never return empty page to the group
208  * - "pp2" always return empty (non-current) page to the group
209  * - "pp3" simple hysteresis
210  *
211  * Last one "pp3" is used.  It works as follows:
212  *
213  * When a page becomes free, check if number of free records exceeds
214  * some fixed fraction of all records.  If it does, move all free pages
215  * to the group.  Current page is ignored in the check.
216  *
217  * TODO
218  *
219  * Define abstract class SuperAlloc.  Make SuperPool a concrete class
220  * with SuperAlloc instance in ctor.  Replace HeapPool by HeapAlloc.
221  */
222 
223 // Types forward.
224 class GroupPool;
225 
226 class SuperPool {
227 public:
228   // Type of i-value, used to reference both pages and records.
229   typedef Uint32 PtrI;
230 
231   // Page entry.
232   struct PageEnt {
233     PageEnt();
234     Uint16 m_pageType;          // zero if not in record pool
235     Uint16 m_useCount;          // used records on the page
236     PtrI m_freeRecI;            // first free record on the page
237     PtrI m_nextPageI;
238     PtrI m_prevPageI;
239   };
240 
241   // Doubly-linked list of page entries.
242   struct PageList {
243     PageList();
244     PageList(PtrI pageI);
245     PtrI m_headPageI;
246     PtrI m_tailPageI;
247     Uint32 m_pageCount;
248   };
249 
250   // Constructor.  Gives page size in bytes (must be power of 2) and
251   // number of bits to use for page index "ip" in i-value.
252   SuperPool(Uint32 pageSize, Uint32 pageBits);
253 
254   // Destructor.
255   virtual ~SuperPool() = 0;
256 
257   // Move all pages from second list to end of first list.
258   void movePages(PageList& pl1, PageList& pl2);
259 
260   // Add page to beginning of page list.
261   void addHeadPage(PageList& pl, PtrI pageI);
262 
263   // Add page to end of page list.
264   void addTailPage(PageList& pl, PtrI pageI);
265 
266   // Remove any page from page list.
267   void removePage(PageList& pl, PtrI pageI);
268 
269   // Translate i-value ("ri" ignored) to page entry.
270   PageEnt& getPageEnt(PtrI pageI);
271 
272   // Translate i-value ("ri" ignored) to page address.
273   void* getPageP(PtrI pageI);
274 
275   // Translate page address to i-value.  Address must be page-aligned to
276   // memory root.  Returns RNIL if "ip" range exceeded.
277   PtrI getPageI(void* pageP);
278 
279   // Record pool info.
280   struct RecInfo {
281     RecInfo(GroupPool& gp, Uint32 recSize);
282     GroupPool& m_groupPool;
283     Uint32 m_recSize;
284     Uint16 m_recType;
285     Uint16 m_maxPerPage;
286     PtrI m_freeRecI;            // first free record on current page
287     Uint32 m_useCount;          // used records excluding current page
288     PageList m_pageList[3];     // 0-free 1-busy 2-full
289     Uint16 m_hyX;               // hysteresis fraction x/y in "pp3"
290     Uint16 m_hyY;
291   };
292 
293   // Translate i-value to record address.
294   void* getRecP(PtrI recI, RecInfo& ri);
295 
296   // Count records on page free list.
297   Uint32 getFreeCount(RecInfo& ri, PtrI freeRecPtrI);
298 
299   // Compute total number of pages in pool.
300   Uint32 getRecPageCount(RecInfo& ri);
301 
302   // Compute total number of records (used or not) in pool.
303   Uint32 getRecTotCount(RecInfo& ri);
304 
305   // Compute total number of used records in pool.
306   Uint32 getRecUseCount(RecInfo& ri);
307 
308   // Compute record pool page list index (0,1,2).
309   Uint32 getRecPageList(RecInfo& ri, PageEnt& pe);
310 
311   // Add current page.
312   void addCurrPage(RecInfo& ri, PtrI pageI);
313 
314   // Remove current page.
315   void removeCurrPage(RecInfo& ri);
316 
317   // Get page with some free records and make it current.  Takes head of
318   // used or free list, or else gets free page from group pool.
319   bool getAvailPage(RecInfo& ri);
320 
321   // Get free page from group pool and add it to record pool free list.
322   // This is an expensive subroutine of getAvailPage(RecInfo&):
323   PtrI getFreePage(RecInfo& ri);
324 
325   // Get free detached (not on list) page from group pool.
326   PtrI getFreePage(GroupPool& gp);
327 
328   // Get free detached page from super pool.
329   PtrI getFreePage();
330 
331   // Get new free detached page from the implementation.
332   virtual PtrI getNewPage() = 0;
333 
334   // Initialize free list etc.  Subroutine of getFreePage(RecInfo&).
335   void initFreePage(RecInfo& ri, PtrI pageI);
336 
337   // Release record which is not on current page.
338   void releaseNotCurrent(RecInfo& ri, PtrI recI);
339 
340   // Free pages from record pool according to page policy.
341   void freeRecPages(RecInfo& ri);
342 
343   // Free all pages in record pool.
344   void freeAllRecPages(RecInfo& ri, bool force);
345 
346   // Set pool size parameters in pages.  Call allocMemory() for changes
347   // (such as extra mallocs) to take effect.
348   void setInitPages(Uint32 initPages);
349   void setIncrPages(Uint32 incrPages);
350   void setMaxPages(Uint32 maxPages);
351 
352   // Get number of pages reserved by all groups.
353   Uint32 getGpMinPages();
354 
355   // Get number of pages reserved to a group.
356   Uint32 getMinPages(GroupPool& gp);
357 
358   // Get max number of pages a group can try to allocate.
359   Uint32 getMaxPages(GroupPool& gp);
360 
361   // Allocate more memory according to current parameters.  Returns
362   // false if no new memory was allocated.   Otherwise returns true,
363   // even if the amount allocated was less than requested.
364   virtual bool allocMemory() = 0;
365 
366   // Debugging.
367   void verify(RecInfo& ri);
368   void verifyPageList(PageList& pl);
369 
370   // Super pool parameters.
371   const Uint32 m_pageSize;
372   const Uint16 m_pageBits;
373   const Uint16 m_recBits;
374   const Uint32 m_recMask;
375   // Implementation must set up these 3 pointers.
376   void* m_memRoot;
377   PageEnt* m_pageEnt;
378   Uint8* m_pageType;
379   // Free page list.
380   PageList m_freeList;
381   // Free pages and sizes.
382   Uint32 m_initPages;
383   Uint32 m_incrPages;
384   Uint32 m_maxPages;
385   Uint32 m_totPages;
386   Uint32 m_typeCount;
387   // Reserved and allocated by group pools.
388   Uint32 m_groupMinPct;
389   Uint32 m_groupMinPages;
390   Uint32 m_groupTotPages;
391 };
392 
393 inline SuperPool::PageEnt&
getPageEnt(PtrI pageI)394 SuperPool::getPageEnt(PtrI pageI)
395 {
396   Uint32 ip = pageI >> m_recBits;
397   return m_pageEnt[ip];
398 }
399 
400 inline void*
getPageP(PtrI ptrI)401 SuperPool::getPageP(PtrI ptrI)
402 {
403   Int32 ip = (Int32)ptrI >> m_recBits;
404   return (Uint8*)m_memRoot + ip * (my_ptrdiff_t)m_pageSize;
405 }
406 
407 inline void*
getRecP(PtrI ptrI,RecInfo & ri)408 SuperPool::getRecP(PtrI ptrI, RecInfo& ri)
409 {
410   Uint32 ip = ptrI >> m_recBits;
411   assert(m_pageType[ip] == (ri.m_recType & 0xFF));
412   Uint32 ir = ptrI & m_recMask;
413   return (Uint8*)getPageP(ptrI) + ir * ri.m_recSize;
414 }
415 
416 /*
417  * GroupPool - subset of a super pool pages (concrete class)
418  */
419 
420 class GroupPool {
421 public:
422   // Types.
423   typedef SuperPool::PageList PageList;
424 
425   // Constructor.
426   GroupPool(SuperPool& sp);
427 
428   // Destructor.
429   ~GroupPool();
430 
431   // Set minimum pct reserved in super pool.
432   void setMinPct(Uint32 resPct);
433 
434   // Set minimum pages reserved in super pool.
435   void setMinPages(Uint32 resPages);
436 
437   SuperPool& m_superPool;
438   Uint32 m_minPct;
439   Uint32 m_minPages;
440   Uint32 m_totPages;
441   PageList m_freeList;
442 };
443 
444 /*
445  * RecordPool -  record pool using one super pool instance (template)
446  */
447 
448 template <class T>
449 class RecordPool {
450 public:
451   // Constructor.
452   RecordPool(GroupPool& gp);
453 
454   // Destructor.
455   ~RecordPool();
456 
457   // Update pointer ptr.p according to i-value ptr.i.
458   void getPtr(Ptr<T>& ptr);
459 
460   // Allocate record from the pool.
461   bool seize(Ptr<T>& ptr);
462 
463   // Return record to the pool.
464   void release(Ptr<T>& ptr);
465 
466   // todo variants of basic methods
467 
468   // Return all pages to group pool.  The force flag is required if
469   // there are any used records.
470   void freeAllRecPages(bool force);
471 
472   SuperPool& m_superPool;
473   SuperPool::RecInfo m_recInfo;
474 };
475 
476 template <class T>
477 inline
RecordPool(GroupPool & gp)478 RecordPool<T>::RecordPool(GroupPool& gp) :
479   m_superPool(gp.m_superPool),
480   m_recInfo(gp, sizeof(T))
481 {
482 }
483 
484 template <class T>
485 inline
~RecordPool()486 RecordPool<T>::~RecordPool()
487 {
488   freeAllRecPages(true);
489 }
490 
491 template <class T>
492 inline void
getPtr(Ptr<T> & ptr)493 RecordPool<T>::getPtr(Ptr<T>& ptr)
494 {
495   void* recP = m_superPool.getRecP(ptr.i, m_recInfo);
496   ptr.p = static_cast<T*>(recP);
497 }
498 
499 template <class T>
500 inline bool
seize(Ptr<T> & ptr)501 RecordPool<T>::seize(Ptr<T>& ptr)
502 {
503   SuperPool& sp = m_superPool;
504   SuperPool::RecInfo& ri = m_recInfo;
505   Uint32 recMask = sp.m_recMask;
506   // get current page
507   if ((ri.m_freeRecI & recMask) != recMask ||
508       sp.getAvailPage(ri)) {
509     SuperPool::PtrI recI = ri.m_freeRecI;
510     void* recP = sp.getRecP(recI, ri);
511     ri.m_freeRecI = *(Uint32*)recP;
512     ptr.i = recI;
513     ptr.p = static_cast<T*>(recP);
514     return true;
515   }
516   ptr.i = RNIL;
517   ptr.p = 0;
518   return false;
519 }
520 
521 template <class T>
522 inline void
release(Ptr<T> & ptr)523 RecordPool<T>::release(Ptr<T>& ptr)
524 {
525   SuperPool& sp = m_superPool;
526   SuperPool::RecInfo& ri = m_recInfo;
527   SuperPool::PtrI recI = ptr.i;
528   Uint32 recMask = sp.m_recMask;
529   // check if current page
530   if ((recI & ~recMask) == (ri.m_freeRecI & ~recMask)) {
531     void* recP = sp.getRecP(recI, ri);
532     *(Uint32*)recP = ri.m_freeRecI;
533     ri.m_freeRecI = recI;
534   } else {
535     sp.releaseNotCurrent(ri, recI);
536   }
537   ptr.i = RNIL;
538   ptr.p = 0;
539 }
540 
541 template <class T>
542 inline void
freeAllRecPages(bool force)543 RecordPool<T>::freeAllRecPages(bool force)
544 {
545   SuperPool& sp = m_superPool;
546   sp.freeAllRecPages(m_recInfo, force);
547 }
548 
549 /*
550  * HeapPool - SuperPool on heap (concrete class)
551  *
552  * A super pool based on malloc with memory root on the heap.  This
553  * pool type has 2 realistic uses:
554  *
555  * - a small pool with only initial malloc and pageBits set to match
556  * - the big pool from which all heap allocations are done
557  *
558  * A smart malloc may break "ip" limit by using different VM areas for
559  * different sized requests.  For this reason malloc is done in units of
560  * increment size if possible.  Memory root is set to the page-aligned
561  * address from first page malloc.
562  */
563 
564 class HeapPool : public SuperPool {
565 public:
566   // Describes malloc area.  The areas are kept in singly linked list.
567   // There is a list head and pointers to current and last area.
568   struct Area {
569     Area();
570     Area* m_nextArea;
571     PtrI m_firstPageI;
572     Uint32 m_currPage;
573     void* m_memory;     // from malloc
574     void* m_pages;      // page-aligned pages
575     Uint32 m_numPages;  // number of pages
576   };
577 
578   // Constructor.
579   HeapPool(Uint32 pageSize, Uint32 pageBits);
580 
581   // Destructor.
582   virtual ~HeapPool();
583 
584   // Get new page from current area.
585   virtual PtrI getNewPage();
586 
587   // Allocate fixed arrays.
588   bool allocInit();
589 
590   // Allocate array of aligned pages.
591   bool allocArea(Area* ap, Uint32 tryPages);
592 
593   // Allocate memory.
allocMemory()594   virtual bool allocMemory() { return allocMemoryImpl();}
595   bool allocMemoryImpl();
596 
597   // List of malloc areas.
598   Area m_areaHead;
599   Area* m_currArea;
600   Area* m_lastArea;
601 };
602 
603 
604 #undef JAM_FILE_ID
605 
606 #endif
607