1
2 /******************************************************************************
3 *
4 * This file is part of meryl-utility, a collection of miscellaneous code
5 * used by Meryl, Canu and others.
6 *
7 * This software is based on:
8 * 'Canu' v2.0 (https://github.com/marbl/canu)
9 * which is based on:
10 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
11 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
12 *
13 * Except as indicated otherwise, this is a 'United States Government Work',
14 * and is released in the public domain.
15 *
16 * File 'README.licenses' in the root directory of this distribution
17 * contains full conditions and disclaimers.
18 */
19
20 #ifndef LIBBITS_H
21 #define LIBBITS_H
22
23 #include "types.H"
24 #include "arrays.H"
25 #include "files.H"
26
27 #include <algorithm>
28 #include <atomic>
29
30
31 // Writing in the middle of data is toublesome.
32 // This code currently will not split an object across two allocated blocks,
33 // so if you want to rewrite in the middle, you need to make sure it
34 // doesn't span a previously written block. E.g., two writes
35 // of 32 bits each could be in different blocks, and trying
36 // to overwrite with a single 64-bit write could result in the first
37 // block getting truncated (to the current position) and the write
38 // replacing the first 64-bits in the second block, not just the 32-bits expected.
39 //
40 // And don't even think of overwriting any of the variable length data.
41
42
43 inline
44 char *
45 displayWord(uint64 word, char *buffer=NULL) {
46 static char b[65];
47
48 if (buffer == NULL)
49 buffer = b;
50
51 memset(buffer, 'x', 64);
52
53 for (uint32 ii=0; ii<64; ii++)
54 buffer[ii] = (word & (uint64)1 << (63 - ii)) ? '1' : '0';
55
56 buffer[64] = 0;
57
58 return(buffer);
59 };
60
61
62
63 // Generate a bit mask on the low (0x000fff) or high bits (0xfff000).
64 //
65 // Algorithm:
66 // - set the return value to all 1's
67 // - shift left or right to keep the desired numBits in the word
68 // - reset to all 0's if the numBits is zero
69 // (if zero, 'r & -0' == 'r & 0000..000)
70 // (if not zero, 'r & -1' == 'r & 1111..111)
71 // - reset to all 1's if the numBits is larger than the number of bits in the word
72 //
73 template<typename uintType>
74 uintType
buildLowBitMask(uint32 numBits)75 buildLowBitMask(uint32 numBits) {
76 uintType r;
77
78 r = ~((uintType)0);
79 r >>= 8 * sizeof(uintType) - numBits;
80 r &= -(uintType)(numBits != 0);
81 r |= -(uintType)(numBits > 8 * sizeof(uintType));
82
83 return(r);
84 }
85
86 template<typename uintType>
87 uintType
buildHighBitMask(uint32 numBits)88 buildHighBitMask(uint32 numBits) {
89 uintType r;
90
91 r = ~((uintType)0);
92 r <<= 8 * sizeof(uintType) - numBits;
93 r &= -(uintType)(numBits != 0);
94 r |= -(uintType)(numBits > 8 * sizeof(uintType));
95
96 return(r);
97 }
98
99
100
101 // Return bits in a word:
102 // Keeping the rightmost 64-n bits (mask out the leftmost n bits)
103 // Keeping the leftmost 64-n bits (mask out the rigthmost n bits)
104 //
clearLeftBits(uint64 v,uint32 l)105 inline uint64 clearLeftBits (uint64 v, uint32 l) { if (l >= 64) return(0); return(v & (uint64max >> l)); };
saveLeftBits(uint64 v,uint32 l)106 inline uint64 saveLeftBits (uint64 v, uint32 l) { if (l == 0) return(0); return(v & (uint64max << (64 - l))); };
clearRightBits(uint64 v,uint32 r)107 inline uint64 clearRightBits (uint64 v, uint32 r) { if (r >= 64) return(0); return(v & (uint64max << r)); };
saveRightBits(uint64 v,uint32 r)108 inline uint64 saveRightBits (uint64 v, uint32 r) { if (r == 0) return(0); return(v & (uint64max >> (64 - r))); };
109
clearMiddleBits(uint64 v,uint32 l,uint32 r)110 inline uint64 clearMiddleBits(uint64 v, uint32 l, uint32 r) { return( saveRightBits(v, r) | saveLeftBits(v, l)); };
saveMiddleBits(uint64 v,uint32 l,uint32 r)111 inline uint64 saveMiddleBits(uint64 v, uint32 l, uint32 r) { return(clearRightBits(v, r) & clearLeftBits(v, l)); };
112
clearLeftBits(uint128 v,uint32 l)113 inline uint128 clearLeftBits (uint128 v, uint32 l) { if (l >= 128) return(0); return(v & (uint128max >> l)); };
saveLeftBits(uint128 v,uint32 l)114 inline uint128 saveLeftBits (uint128 v, uint32 l) { if (l == 0) return(0); return(v & (uint128max << (128 - l))); };
clearRightBits(uint128 v,uint32 r)115 inline uint128 clearRightBits (uint128 v, uint32 r) { if (r >= 128) return(0); return(v & (uint128max << r)); };
saveRightBits(uint128 v,uint32 r)116 inline uint128 saveRightBits (uint128 v, uint32 r) { if (r == 0) return(0); return(v & (uint128max >> (128 - r))); };
117
clearMiddleBits(uint128 v,uint32 l,uint32 r)118 inline uint128 clearMiddleBits(uint128 v, uint32 l, uint32 r) { return( saveRightBits(v, r) | saveLeftBits(v, l)); };
saveMiddleBits(uint128 v,uint32 l,uint32 r)119 inline uint128 saveMiddleBits(uint128 v, uint32 l, uint32 r) { return(clearRightBits(v, r) & clearLeftBits(v, l)); };
120
121
122
123 // Freed, Edwin E. 1983. "Binary Magic Number" Dr. Dobbs Journal Vol. 78 (April) pp. 24-37
124 // Reverse the bits in a word,
125 // Count the number of set bits in a words
126 // and more.
127 //
128 inline
129 uint64
reverseBits64(uint64 x)130 reverseBits64(uint64 x) {
131 x = ((x >> 1) & 0x5555555555555555llu) | ((x << 1) & 0xaaaaaaaaaaaaaaaallu);
132 x = ((x >> 2) & 0x3333333333333333llu) | ((x << 2) & 0xccccccccccccccccllu);
133 x = ((x >> 4) & 0x0f0f0f0f0f0f0f0fllu) | ((x << 4) & 0xf0f0f0f0f0f0f0f0llu);
134 x = ((x >> 8) & 0x00ff00ff00ff00ffllu) | ((x << 8) & 0xff00ff00ff00ff00llu);
135 x = ((x >> 16) & 0x0000ffff0000ffffllu) | ((x << 16) & 0xffff0000ffff0000llu);
136 x = ((x >> 32) & 0x00000000ffffffffllu) | ((x << 32) & 0xffffffff00000000llu);
137 return(x);
138 }
139
140 inline
141 uint32
reverseBits32(uint32 x)142 reverseBits32(uint32 x) {
143 x = ((x >> 1) & 0x55555555lu) | ((x << 1) & 0xaaaaaaaalu);
144 x = ((x >> 2) & 0x33333333lu) | ((x << 2) & 0xcccccccclu);
145 x = ((x >> 4) & 0x0f0f0f0flu) | ((x << 4) & 0xf0f0f0f0lu);
146 x = ((x >> 8) & 0x00ff00fflu) | ((x << 8) & 0xff00ff00lu);
147 x = ((x >> 16) & 0x0000fffflu) | ((x << 16) & 0xffff0000lu);
148 return(x);
149 }
150
151
152 inline
153 uint64
uint64Swap(uint64 x)154 uint64Swap(uint64 x) {
155 x = ((x >> 8) & 0x00ff00ff00ff00ffllu) | ((x << 8) & 0xff00ff00ff00ff00llu);
156 x = ((x >> 16) & 0x0000ffff0000ffffllu) | ((x << 16) & 0xffff0000ffff0000llu);
157 x = ((x >> 32) & 0x00000000ffffffffllu) | ((x << 32) & 0xffffffff00000000llu);
158 return(x);
159 }
160
161 inline
162 uint32
uint32Swap(uint32 x)163 uint32Swap(uint32 x) {
164 x = ((x >> 8) & 0x00ff00fflu) | ((x << 8) & 0xff00ff00lu);
165 x = ((x >> 16) & 0x0000fffflu) | ((x << 16) & 0xffff0000lu);
166 return(x);
167 }
168
169 inline
170 uint16
uint16Swap(uint16 x)171 uint16Swap(uint16 x) {
172 x = ((x >> 8) & 0x00ff) | ((x << 8) & 0xff00);
173 return(x);
174 }
175
176
177 inline
178 uint32
countNumberOfSetBits32(uint32 x)179 countNumberOfSetBits32(uint32 x) {
180 x = ((x >> 1) & 0x55555555lu) + (x & 0x55555555lu);
181 x = ((x >> 2) & 0x33333333lu) + (x & 0x33333333lu);
182 x = ((x >> 4) & 0x0f0f0f0flu) + (x & 0x0f0f0f0flu);
183 x = ((x >> 8) & 0x00ff00fflu) + (x & 0x00ff00fflu);
184 x = ((x >> 16) & 0x0000fffflu) + (x & 0x0000fffflu);
185 return(x);
186 }
187
188 inline
189 uint64
countNumberOfSetBits64(uint64 x)190 countNumberOfSetBits64(uint64 x) {
191 x = ((x >> 1) & 0x5555555555555555llu) + (x & 0x5555555555555555llu);
192 x = ((x >> 2) & 0x3333333333333333llu) + (x & 0x3333333333333333llu);
193 x = ((x >> 4) & 0x0f0f0f0f0f0f0f0fllu) + (x & 0x0f0f0f0f0f0f0f0fllu);
194 x = ((x >> 8) & 0x00ff00ff00ff00ffllu) + (x & 0x00ff00ff00ff00ffllu);
195 x = ((x >> 16) & 0x0000ffff0000ffffllu) + (x & 0x0000ffff0000ffffllu);
196 x = ((x >> 32) & 0x00000000ffffffffllu) + (x & 0x00000000ffffffffllu);
197 return(x);
198 }
199
200
201 // Return the number of bits needed to represent 'x'.
202 // It's really floor(log_2(x)) + 1.
203 // Note that x=0 returns 0.
204 //
205 inline
206 uint32
countNumberOfBits32(uint32 x)207 countNumberOfBits32(uint32 x) {
208 x |= x >> 1;
209 x |= x >> 2;
210 x |= x >> 4;
211 x |= x >> 8;
212 x |= x >> 16;
213 return(countNumberOfSetBits32(x));
214 }
215
216 inline
217 uint64
countNumberOfBits64(uint64 x)218 countNumberOfBits64(uint64 x) {
219 x |= x >> 1;
220 x |= x >> 2;
221 x |= x >> 4;
222 x |= x >> 8;
223 x |= x >> 16;
224 x |= x >> 32;
225 return(countNumberOfSetBits64(x));
226 }
227
228
229
230
231 //#if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
232 //#define BUILTIN_POPCOUNT
233 //#endif
234
235 #ifdef BUILTIN_POPCOUNT
236
237 inline
238 uint32
countNumberOfSetBits32(uint32 x)239 countNumberOfSetBits32(uint32 x) {
240 return(__builtin_popcount(x));
241 }
242
243 inline
244 uint64
countNumberOfSetBits64(uint64 x)245 countNumberOfSetBits64(uint64 x) {
246 return(__builtin_popcountll(x));
247 }
248
249 #endif
250
251
252
253 // Expand a 2-bit packed word into a 3-bit packed word.
254 // input aabbccdd
255 // output 0aa0bb0cc0dd
256 // Handy if you want to print such a number as octal.
257 //
258 inline
259 uint64
expandTo3(uint64 v)260 expandTo3(uint64 v) {
261 uint64 o = 0;
262
263 o = (v & 0x0000000000000003llu) << 0;
264 o |= (v & 0x000000000000000cllu) << 1;
265 o |= (v & 0x0000000000000030llu) << 2;
266 o |= (v & 0x00000000000000c0llu) << 3;
267 o |= (v & 0x0000000000000300llu) << 4;
268 o |= (v & 0x0000000000000c00llu) << 5;
269 o |= (v & 0x0000000000003000llu) << 6;
270 o |= (v & 0x000000000000c000llu) << 7;
271 o |= (v & 0x0000000000030000llu) << 8;
272 o |= (v & 0x00000000000c0000llu) << 9;
273 o |= (v & 0x0000000000300000llu) << 10;
274 o |= (v & 0x0000000000c00000llu) << 11;
275 o |= (v & 0x0000000003000000llu) << 12;
276 o |= (v & 0x000000000c000000llu) << 13;
277 o |= (v & 0x0000000030000000llu) << 14;
278 o |= (v & 0x00000000c0000000llu) << 15;
279 o |= (v & 0x0000000300000000llu) << 16;
280 o |= (v & 0x0000000c00000000llu) << 17;
281 o |= (v & 0x0000003000000000llu) << 18;
282 o |= (v & 0x000000c000000000llu) << 19;
283 o |= (v & 0x0000030000000000llu) << 20;
284 // (v & 0x00000c0000000000llu) << 21; // This overflows.
285
286 assert((v & 0xffffc0000000000llu) == 0);
287
288 return(o);
289 }
290
291
292 // Compress a 3-bit packed word into a 2-bit packed word, dropping the high bit.
293 inline
294 uint64
compressTo2(uint64 v)295 compressTo2(uint64 v) {
296 uint64 o = 0;
297
298 o = (v & 0x0000000000000003llu) >> 0;
299 o |= (v & 0x0000000000000018llu) >> 1;
300 o |= (v & 0x00000000000000c0llu) >> 2;
301 o |= (v & 0x0000000000000600llu) >> 3;
302 o |= (v & 0x0000000000003000llu) >> 4;
303 o |= (v & 0x0000000000018000llu) >> 5;
304 o |= (v & 0x00000000000c0000llu) >> 6;
305 o |= (v & 0x0000000000600000llu) >> 7;
306 o |= (v & 0x0000000003000000llu) >> 8;
307 o |= (v & 0x0000000018000000llu) >> 9;
308 o |= (v & 0x00000000c0000000llu) >> 10;
309 o |= (v & 0x0000000600000000llu) >> 11;
310 o |= (v & 0x0000003000000000llu) >> 12;
311 o |= (v & 0x0000018000000000llu) >> 13;
312 o |= (v & 0x00000c0000000000llu) >> 14;
313 o |= (v & 0x0000600000000000llu) >> 15;
314 o |= (v & 0x0003000000000000llu) >> 16;
315 o |= (v & 0x0018000000000000llu) >> 17;
316 o |= (v & 0x00c0000000000000llu) >> 18;
317 o |= (v & 0x0600000000000000llu) >> 19;
318 o |= (v & 0x3000000000000000llu) >> 20;
319
320 assert((o & 0xffffc0000000000llu) == 0);
321
322 return(o);
323 }
324
325
326
327
328 class bitArray {
329 public:
330 bitArray(uint64 maxNumBits=0) {
331 _maxBitSet = 0;
332 _maxBitAvail = maxNumBits;
333 _bits = NULL;
334
335 if (maxNumBits > 0)
336 allocate(maxNumBits);
337 };
338
~bitArray(void)339 ~bitArray(void) {
340 delete [] _bits;
341 };
342
isAllocated(void)343 bool isAllocated(void) {
344 return(_bits != NULL);
345 }
346
allocate(uint64 maxNumBits)347 void allocate(uint64 maxNumBits) {
348 if (isAllocated() == true)
349 return;
350
351 _maxBitSet = 0;
352 _maxBitAvail = maxNumBits;
353 _bits = new uint64 [_maxBitAvail / 64 + 1];
354
355 clear();
356 };
357
clear(void)358 void clear(void) {
359 memset(_bits, 0, sizeof(uint64) * (_maxBitAvail / 64 + 1));
360 };
361
getBit(uint64 position)362 bool getBit(uint64 position) {
363 uint64 w = (position / 64);
364 uint64 b = 63 - (position % 64);
365
366 if (_maxBitAvail <= position)
367 fprintf(stderr, "getBit()-- ERROR: position=" F_U64 " > maximum available=" F_U64 "\n",
368 position, _maxBitAvail);
369 assert(position < _maxBitAvail);
370
371 return((_bits[w] >> b) & 0x00000001);
372 };
373
setBit(uint64 position,bool value)374 void setBit(uint64 position, bool value) {
375 uint64 w = (position / 64);
376 uint64 b = 63 - (position % 64);
377 uint64 m = ((uint64)1) << b;
378
379 //fprintf(stderr, "SET pos %9" F_U64P " word %2" F_U64P " bit %2" F_U64P " value %c 0x%016" F_X64P " -> ",
380 // position, w, b, (value) ? '1' : '0', _bits[w]);
381
382 if (_maxBitAvail <= position)
383 fprintf(stderr, "setBit()-- ERROR: position=" F_U64 " > maximum available=" F_U64 "\n",
384 position, _maxBitAvail);
385 assert(position < _maxBitAvail);
386
387 _bits[w] &= ~m;
388 _bits[w] |= ((uint64)value) << b;
389
390 //fprintf(stderr, "0x%016" F_X64P "\n", _bits[w]);
391 };
392
393 // Returns state of bit before flipping.
flipBit(uint64 position)394 bool flipBit(uint64 position) {
395 uint64 w = (position / 64);
396 uint64 b = 63 - (position % 64);
397 uint64 m = ((uint64)1) << b;
398
399 if (_maxBitAvail <= position)
400 fprintf(stderr, "flipBit()-- ERROR: position=" F_U64 " > maximum available=" F_U64 "\n",
401 position, _maxBitAvail);
402 assert(position < _maxBitAvail);
403
404 uint64 v = _bits[w] & m;
405
406 //fprintf(stderr, "FLIP w %lu b %lu m 0x%016lx v 0x%016lx FROM 0x%016lx", w, b, m, v, _bits[w]);
407
408 _bits[w] ^= m;
409
410 //fprintf(stderr, " TO 0x%016lx\n", _bits[w]);
411
412 return(v >> b);
413 };
414
415 private:
416 uint64 _maxBitSet;
417 uint64 _maxBitAvail;
418 uint64 *_bits;
419 };
420
421
422
423 ////////////////////////////////////////
424 //
425 // wordArray - An array that efficiently stores non-machine-word size
426 // integer words by packing the bits into machine-size words. The array is
427 // variable length but not sparse - accessing element 1,000,000 will
428 // allocate elements 0 through 999,999.
429 //
430 // The size, in bits, of each element is set at construction time. All
431 // elements must be the same size.
432 //
433 // Words of at most 128 bits in size can be stored.
434 //
435 // The elements are stored in a set of fixed-size blocks. The block size
436 // can also be set at construction time. Note that this is specified IN
437 // BITS. The default size is 64 KB per block. Decrease this if you know
438 // you only need a few KB to store all values, or if you are storing several
439 // GB of data. There is no real performance loss/gain; it just adjusts the
440 // number of blocks allocated. There might be a slight degradation in
441 // performance of the memory management system if millions of blocks are
442 // allocated.
443 //
444 class wordArray {
445 public:
446 wordArray(uint32 valueWidth, uint64 segmentsSizeInBits, bool useLocks);
447 ~wordArray();
448
449 void clear(void); // Reset the array to zero, doesn't deallocate space.
450
451 void allocate(uint64 nElements); // Pre-allocate space for nElements.
452
453 uint128 get(uint64 eIdx); // Get the value of element eIdx.
454 void set(uint64 eIdx, uint128 v); // Set the value of element eIdx to v.
455
456 public:
457 void show(void); // Dump the wordArray to the screen; debugging.
458
459 private:
460 void setLock(uint64 seg, uint64 lockW1, uint64 lockW2);
461 void relLock(uint64 seg, uint64 lockW1, uint64 lockW2);
462 void setNval(uint32 eIdx);
463
464 private:
465 uint64 _valueWidth = 0; // Width of the values stored.
466 uint128 _valueMask = 0; // Mask the low _valueWidth bits
467 uint64 _segmentSize = 0; // Size, in bits, of each block of data.
468
469 uint64 _valuesPerSegment = 0; // Number of values in each block.
470
471 uint64 _wordsPerSegment = 0; // Number of 128-bit words in each segment
472 uint64 _wordsPerLock = 0; // How many words are covered by each lock.
473 uint64 _locksPerSegment = 0; // Number of locks per segment
474
475 uint64 _numValues = 0; // Number of values stored in the array.
476 std::atomic_flag _numValuesLock; // Lock on the above.
477
478 uint64 _segmentsLen = 0; // Number of blocks in use.
479 uint64 _segmentsMax = 0; // Number of block pointers allocated.
480 uint128 **_segments = nullptr; // List of blocks allocated.
481
482 std::atomic_flag **_segLocks = nullptr; // Locks on pieces of the segments.
483 };
484
485
486
487 class stuffedBits {
488 public:
489 stuffedBits(uint64 nBits=16 * 1024 * 1024 * 8);
490 stuffedBits(const char *inputName);
491 stuffedBits(FILE *inFile);
492 stuffedBits(readBuffer *B);
493 //stuffedBits(stuffedBits &that); // Untested.
494 ~stuffedBits();
495
496 // Debugging.
497
displayWord(uint64 w)498 char *displayWord(uint64 w) {
499 return(::displayWord(_data[w]));
500 };
501
502 // Files.
503
504 void dumpToBuffer(writeBuffer *B);
505 bool loadFromBuffer(readBuffer *B);
506
507 void dumpToFile(FILE *F);
508 bool loadFromFile(FILE *F);
509
510 // Management of the read/write head.
511
512 void setPosition(uint64 position, uint64 length = 0);
513 uint64 getPosition(void);
514
515 uint64 getLength(void);
516
517 void byteAlign(void);
518
519 // SINGLE BITS
520
521 bool getBit(void); // get a bit.
522 bool testBit(void); // get a bit, but don't move position.
523 void setBit(bool on=true); // set a bit.
524
525 // UNARY CODED DATA
526
527 uint64 getUnary(void);
528 uint64 *getUnary(uint64 number, uint64 *values);
529
530 uint32 setUnary(uint64 value);
531 uint32 setUnary(uint64 number, uint64 *values);
532
533 // BINARY CODED DATA
534
535 uint64 getBinary(uint32 width);
536 uint64 *getBinary(uint32 width, uint64 number, uint64 *values=NULL);
537
538 uint32 setBinary(uint32 width, uint64 value);
539 uint32 setBinary(uint32 width, uint64 number, uint64 *values);
540
541 // ELIAS GAMMA CODED DATA
542
543 uint64 getEliasGamma(void);
544 uint64 *getEliasGamma(uint64 number, uint64 *values=NULL);
545
546 uint32 setEliasGamma(uint64 value);
547 uint32 setEliasGamma(uint64 number, uint64 *values);
548
549 // ELIAS DELTA CODED DATA
550
551 uint64 getEliasDelta(void);
552 uint64 *getEliasDelta(uint64 number, uint64 *values=NULL);
553
554 uint32 setEliasDelta(uint64 value);
555 uint32 setEliasDelta(uint64 number, uint64 *values);
556
557 // ELIAS OMEGA CODED DATA - the omega code looks hard to implement - the
558 // encoding and decoding streams are backwards from each other. The idea
559 // is:
560 //
561 // push the binary representation of the value onto a stack.
562 // set value to one less than the number of bits emitted last.
563 // loop until the value is 1.
564 //
565 // The stream is constructed by emitting the words on the stack, and
566 // terminating the stream with an extra '0'.
567 //
568 #if 0
569 uint64 getEliasOmega(void);
570 uint64 *getEliasOmega(uint64 number, uint64 *values=NULL);
571
572 uint32 setEliasOmega(uint64 value);
573 uint32 setEliasOmega(uint64 number, uint64 *values);
574 #endif
575
576 // GOLOMB CODED DATA
577 //
578 // Pick m. For m == power_of_two, this is RICE CODED DATA.
579 //
580 // q = floow(n/m).
581 // r = n-qm
582 // c = ceil(log_2 m)
583 //
584 // Unary encode q, binary encode r.
585 //
586 // The first 2^c-m values are encoded as c-1 bit values, starting with 00...00,
587 // The rest as c-bit numbers, ending with 11...11
588 //
589
590
591 // FIBONACCI CODED DATA
592 //
593 // A Fibonacci number is F(n) = F(n-1) + F(n-2), wher F(0) = F(1) = 1.
594 //
595 // The Zeckendorf representation of a number encodes it such that no
596 // two consecurive Fibonacci numbers are used. From the definition
597 // of a Fibonacci number, any pattern "100" can be replaced with "011".
598 // A number encoded after this transofmration is in the Fibonacci
599 // representation ("Zeckendorf representation" seems to be a real thing,
600 // I just made up "Fibonacci representation" - the two terms seem to
601 // be used synonymously in the real world).
602 //
603 // Once encoded, it's added to the bit stream reversed.
604 //
605 // For the Zeckendorf representation, a single 1-bit is added, terminating
606 // the number with the last '1' bit of data, followed immediately by
607 // another '1' bit. (Because, by definition, there are no two adjacent
608 // set bits in the encoded number).
609 //
610 // For the Fibonacci representation, we need to append two '0' bits.
611 // (Because, by definition, there are no two adjacent unset bits in the
612 // representation). BUT, this representation saves at most one bit
613 // (replacing 100 at the start of the string by 011), but the savings
614 // is lost by the extra stop bit we need.
615 //
616 uint64 getZeckendorf(void);
617 uint64 *getZeckendorf(uint64 number, uint64 *values=NULL);
618
619 uint32 setZeckendorf(uint64 value);
620 uint32 setZeckendorf(uint64 number, uint64 *values);
621
622 // Old meryl uses preDecrement() when using compressed bucket counting.
623 // Nothing else in canu uses these, and they're painful, so left unimplemented.
624 #if 0
625 uint64 preIncrementBinary(uint64 width, uint64 position);
626 uint64 postIncrementBinary(uint64 width, uint64 position);
627 uint64 preDecrementBinary(uint64 width, uint64 position);
628 uint64 postDecrementBinary(uint64 width, uint64 position);
629 #endif
630
631
632 private:
633
634 // For writing, update the length of the block to the maximum of where we're at now and the existing length.
635 //
updateLen(void)636 void updateLen(void) {
637 _dataBlockLen[_dataBlk] = std::max(_dataPos, _dataBlockLen[_dataBlk]);
638 };
639
640 // For both reading and writing, move to the next word if we're at the end of the current one.
641 //
updateBit(void)642 void updateBit(void) {
643 if (_dataBit == 0) {
644 _dataWrd += 1;
645 _dataBit = 64;
646 }
647 };
648
649 // For reading operations, move to the next block if we're at the end of the current one.
650 // For writing operations, this is done before the write, in ensureSpace().
651 //
652 // Should be done before any reading operation. It isn't (strictly) needed at the end
653 // of a read. The next read will just do it at the start, and the next write
654 // only cares that there is enough space in the current block for the write.
655 //
updateBlk(uint64 readLength)656 void updateBlk(uint64 readLength) {
657
658 assert(_dataBit != 0);
659 assert(_dataBit <= 64);
660
661 // If in this block, nothing to do.
662
663 //fprintf(stderr, "updateBlk()-- at _dataPos " F_U64 " _dataBlockLen[%u] = " F_U64 "\n", _dataPos, _dataBlk, _dataBlockLen[_dataBlk]);
664
665 if (_dataPos + readLength <= _dataBlockLen[_dataBlk])
666 return;
667
668 // Otherwise, we MUST be at the end of the current block. If not,
669 // whatever we're trying to read is in the next block (or we're trying
670 // to read something longer than what is here).
671
672 assert(_dataPos == _dataBlockLen[_dataBlk]);
673
674 _dataBlk += 1;
675
676 if (_dataBlk >= _dataBlocksLen)
677 fprintf(stderr, "ERROR: _dataBlk = %lu _dataBlocksLen = %u\n", _dataBlk, _dataBlocksLen);
678 assert(_dataBlk < _dataBlocksLen);
679
680 _dataPos = 0;
681 _data = _dataBlocks[_dataBlk];
682
683 _dataWrd = 0;
684 _dataBit = 64;
685 }
686
clearBlock(void)687 void clearBlock(void) {
688 for (uint64 ii=0; ii<_dataBlockLenMaxW; ii++)
689 _data[ii] = 0;
690 };
691
692 // For writing operations, make sure there is enough space for the write in this block.
693 //
ensureSpace(uint64 spaceNeeded)694 void ensureSpace(uint64 spaceNeeded) {
695
696 assert(_dataBit != 0);
697 assert(_dataBit <= 64);
698
699 // If enough space in the current block, just return.
700
701 if (_dataPos + spaceNeeded < _dataBlockLenMaxB)
702 return;
703
704 // Othwerwise, terminate the current block.
705
706 _dataBlockLen[_dataBlocksLen - 1] = _dataPos;
707
708 // Move to the new block.
709
710 _dataBlocksLen++;
711
712 if (_dataBlocksLen >= _dataBlocksMax) {
713 setArraySize(_dataBlocks, _dataBlocksLen, _dataBlocksMax, _dataBlocksLen + 128);
714 setArraySize(_dataBlockBgn, _dataBlocksLen, _dataBlocksMax, _dataBlocksLen + 128);
715 setArraySize(_dataBlockLen, _dataBlocksLen, _dataBlocksMax, _dataBlocksLen + 128);
716 }
717
718 assert(spaceNeeded <= _dataBlockLenMaxB);
719
720 _dataPos = 0;
721 _data = _dataBlocks[_dataBlocksLen - 1] = new uint64 [_dataBlockLenMaxW];
722
723 clearBlock();
724
725 _dataBlockBgn[_dataBlocksLen - 1] = _dataBlockBgn[_dataBlocksLen - 2] + _dataBlockLen[_dataBlocksLen - 2];
726 _dataBlockLen[_dataBlocksLen - 1] = 0;
727
728 _dataBlk += 1;
729 _dataWrd = 0;
730 _dataBit = 64;
731 };
732
bitsToWords(uint64 bits)733 uint64 bitsToWords(uint64 bits) {
734 return(bits / 64 + ((bits % 64) ? 1 : 0));
735 };
736
737 uint64 _dataBlockLenMaxB; // Allocated length of each block (in BITS).
738 uint64 _dataBlockLenMaxW; // Allocated length of each block (in WORDS).
739
740 uint32 _dataBlocksLen; // Number of allocated data blocks.
741 uint32 _dataBlocksMax; // Number of blocks we can allocate.
742
743 uint64 *_dataBlockBgn; // Starting position, in the global file, of this block.
744 uint64 *_dataBlockLen; // Length of this block.
745 uint64 **_dataBlocks; // Just piles of bits. Nothing interesting here.
746
747 uint64 _dataPos; // Position in this block, in BITS.
748 uint64 *_data; // Pointer to the currently active data block.
749
750 uint64 _dataBlk; // Active data block.
751 uint64 _dataWrd; // Active word in the active data block.
752 uint64 _dataBit; // Active bit in the active word in the active data block (aka, number of bits left in this word)
753
754 uint64 _fibData[93]; // A pile of Fibonacci numbers.
755 };
756
757
758 // Implementations.
759
760 #define BITS_IMPLEMENTATIONS
761
762 #include "bits-wordArray.H"
763
764 #undef BITS_IMPLEMENTATIONS
765
766
767 #endif // LIBBITS_H
768