1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #ifndef SQREAD_H
19 #define SQREAD_H
20
21 // DO NOT INCLUDE THIS FILE DIRECTLY, include sqStore.H.
22
23 #include "arrays.H"
24 #include "sequence.H"
25
26 class sqRead;
27 class sqLibrary;
28
29 class sqStore;
30 class sqStoreBlobWriter;
31
32 class sqCache;
33
34 // Even though we can store up to 4GB blob files, we artificially limit it to 1 GB
35 // for (presumed) better caching on object storage systems. Though there are
36 // 65k files allowed, pieces that stream through the store (correction, RED, OEA)
37 // run out of file handles well before that.
38
39 const uint64 AS_BLOBFILE_MAX_SIZE = 1024 * 1024 * 1024;
40
41
42
43 // The default version is set either by the user explicitly, or by the store
44 // when it is opened. It should never be unset.
45 //
46 // If the store is created (new sqStore(...)) with no default set, it will
47 // examine what reads are present and return the 'latest' available (raw,
48 // corrected, trimmed, in that order). If the homopolymer compression flag
49 // exists, compressed reads will be returned.
50 //
51 // If the default version is set before the store is opened, exactly those
52 // reads will be returned.
53 //
54 // The need for sqRead_normal arises when the store is set to return
55 // homopoly compressed reads by default (if file 'homopolymerCompression'
56 // exists). sqRead_normal prevents sqStore_loadMetadata from enabling
57 // homopoly compression. It is otherwise not used (and so gets a quite
58 // bogus value).
59 //
60 // The 'exclude' function works ONLY AFTER the store is opened - since there
61 // is no default set we can't remove a flag yet.
62 //
63
64 typedef uint32 sqRead_which;
65
66 const sqRead_which sqRead_unset = 0x0000; // Nothing specified, query the store to decide what to return.
67 const sqRead_which sqRead_raw = 0x0001; // Use noisy sequence.
68 const sqRead_which sqRead_corrected = 0x0002; // Use corrected sequence.
69 const sqRead_which sqRead_normal = 0x0080; // Return normal uncompressed bases.
70 const sqRead_which sqRead_compressed = 0x0004; // Return compressed bases.
71 const sqRead_which sqRead_trimmed = 0x0008; // Return trimmed bases.
72 const sqRead_which sqRead_largest = 0x0010; // Used to size an array.
73
74 extern
75 sqRead_which sqRead_defaultVersion;
76
77 static
78 const
79 char *
toString(sqRead_which w)80 toString(sqRead_which w) {
81 const sqRead_which c = sqRead_compressed;
82 const sqRead_which t = sqRead_trimmed;
83 const sqRead_which ct = sqRead_compressed | sqRead_trimmed;
84
85 w &= ~sqRead_normal; // Disable flag so we report the normal uncompressed string.
86
87 switch (w) {
88 case sqRead_unset: return("unset"); break;
89
90 case sqRead_raw: return("raw"); break;
91 case sqRead_raw | c: return("raw-compressed"); break;
92 case sqRead_raw | t: return("raw-trimmed"); break;
93 case sqRead_raw | ct: return("raw-compressed-trimmed"); break;
94
95 case sqRead_corrected: return("corrected"); break;
96 case sqRead_corrected | c: return("corrected-compressed"); break;
97 case sqRead_corrected | t: return("corrected-trimmed"); break;
98 case sqRead_corrected | ct: return("corrected-compressed-trimmed"); break;
99
100 case sqRead_compressed: return("compressed"); break;
101 case sqRead_trimmed: return("trimmed"); break;
102
103 default: return("undefined-mode"); break;
104 }
105
106 return("undefined-mode");
107 }
108
109 static
110 void
sqRead_setDefaultVersion(sqRead_which v)111 sqRead_setDefaultVersion(sqRead_which v) {
112 sqRead_defaultVersion = v;
113 }
114
115 static
116 void
sqRead_setDefaultVersionInclude(sqRead_which v)117 sqRead_setDefaultVersionInclude(sqRead_which v) {
118 sqRead_defaultVersion |= v;
119 }
120
121 static
122 void
sqRead_setDefaultVersionExclude(sqRead_which v)123 sqRead_setDefaultVersionExclude(sqRead_which v) {
124 sqRead_defaultVersion &= ~v;
125 }
126
127 static // Return true of the default version has mode 'v' set.
128 bool
sqRead_defaultIs(sqRead_which v)129 sqRead_defaultIs(sqRead_which v) {
130 return((sqRead_defaultVersion & v) == v);
131 }
132
133 static // Return true of the default version has mode 'v' not set.
134 bool
sqRead_defaultIsNot(sqRead_which v)135 sqRead_defaultIsNot(sqRead_which v) {
136 return((sqRead_defaultVersion & v) == sqRead_unset);
137 }
138
139 static
140 const
141 char *
sqRead_getDefaultVersion(void)142 sqRead_getDefaultVersion(void) {
143 return(toString(sqRead_defaultVersion));
144 }
145
146
147
148
149
150
151
152
153 // On disk sequence metadata. Sequence data itself is in the blobs.
154 //
155 // In general, you should not be directly using this class:
156 // sqReadSeq_length() ALWAYS returns the untrimmed length of the read.
157 //
158 // sqReadSeq_clearBgn() and sqReadSeq_clearEnd() will both return zero for
159 // a read with no clear range set.
160 //
161 // There are too many places where encapsulation needs to be broken to
162 // make it truly private. For example:
163 // Marking reads as ignore in sqStoreCreate (to get rid of extra coverage).
164 // Loading clear ranges.
165 //
166 class sqReadSeq {
167 public:
sqReadSeq_initialize(void)168 void sqReadSeq_initialize(void) {
169 _seqValid = 0;
170 _unused1 = 0;
171 _seqLength = 0;
172
173 _ignoreU = 0;
174 _ignoreT = 0;
175 _clearBgn = 0;
176
177 _trimmed = 0;
178 _unused2 = 0;
179 _clearEnd = 0;
180 };
181
182 private:
sqReadSeq_length(void)183 uint32 sqReadSeq_length(void) { return(_seqLength); }; // ALWAYS untrimmed length.
184
sqReadSeq_clearBgn(void)185 uint32 sqReadSeq_clearBgn(void) { assert(_trimmed); return(_clearBgn); }; // NOT valid unless trimmed.
sqReadSeq_clearEnd(void)186 uint32 sqReadSeq_clearEnd(void) { assert(_trimmed); return(_clearEnd); };
187
188 private:
sqReadSeq_valid(void)189 bool sqReadSeq_valid(void) { return(_seqValid); }; // True if there is data.
sqReadSeq_trimmed(void)190 bool sqReadSeq_trimmed(void) { return(_trimmed); }; // True if the clear range is set.
191
sqReadSeq_ignoreU(void)192 bool sqReadSeq_ignoreU(void) { return(_ignoreU); }; // True if this read should be ignored.
sqReadSeq_ignoreT(void)193 bool sqReadSeq_ignoreT(void) { return(_ignoreU | _ignoreT); }; // True if the trimmed version should be ignored.
194
195
196 // Call ONLY for initializing with a newly added sequence.
197 // Do NOT call for trimmed reads.
198 // The only caller should be sqReadDataWriter::sqReadDataWriter_writeBlob().
199 // If you're thinking you want to call this, think again.
200 private:
sqReadSeq_setLength(char * bases,uint32 basesLen,bool doCompress)201 void sqReadSeq_setLength(char *bases, uint32 basesLen, bool doCompress) {
202 uint32 sl = 0;
203
204 assert(bases[basesLen] == 0);
205
206 if (doCompress == false)
207 sl = basesLen;
208 else
209 sl = homopolyCompress(bases, basesLen);
210
211 assert(_seqValid == 0);
212
213 _seqValid = 1;
214 _unused1 = 0;
215 _seqLength = sl;
216
217 _ignoreU = 0;
218 _ignoreT = 0;
219 _clearBgn = 0;
220
221 _trimmed = 0;
222 _unused2 = 0;
223 _clearEnd = sl;
224 };
225
226
227 // These are public, but unless the store is opened for appending, changes
228 // will be lost.
229 public:
sqReadSeq_setAllClear(void)230 void sqReadSeq_setAllClear(void) {
231 _clearBgn = 0;
232 _clearEnd = _seqLength;
233 _trimmed = true;
234
235 _ignoreT |= _ignoreU; // If the untrimmed is ignored, ignore the trimmed too.
236 };
237
238 void sqReadSeq_setClearRange(uint32 bgn, uint32 end, bool set=true) {
239 _clearBgn = bgn;
240 _clearEnd = end;
241 _trimmed = set;
242
243 _ignoreT |= _ignoreU; // If the untrimmed is ignored, ignore the trimmed too.
244 };
245
246 private:
247 // Only access from sqStore_setIgnored().
sqReadSeq_setIgnoreU(void)248 void sqReadSeq_setIgnoreU(void) {
249 _ignoreU = true;
250 _ignoreT |= true; // Also set ignoreT if untrimmed is ignored.
251 };
252
sqReadSeq_setIgnoreT(void)253 void sqReadSeq_setIgnoreT(void) {
254 _ignoreT = true;
255 };
256
257
258 private:
259 // The data are logically out of order, so we can fit things into three 32-bit words.
260
261 uint32 _seqValid : 1; // The sequence for this record is present in the blob.
262 uint32 _unused1 : 1; // (unused)
263 uint32 _seqLength : 30; // The length of the sequence stored in the blob.
264
265 uint32 _ignoreU : 1; // Ignore both the untrimmed and trimmed versions.
266 uint32 _ignoreT : 1; // Ignore only the trimmed version.
267 uint32 _clearBgn : 30; //
268
269 uint32 _trimmed : 1; // The trim points are valid.
270 uint32 _unused2 : 1; // (unused)
271 uint32 _clearEnd : 30; //
272
273 // Friends are only allowed to access the methods, not directly the data!
274
275 friend class sqRead;
276 friend class sqStore;
277 friend class sqStoreInfo;
278 friend class sqReadDataWriter;
279 };
280
281
282
283 // On disk read metadata, in particular, the pointer to the blob data.
284 class sqReadMeta {
285 public:
286 void sqReadMeta_initialize(uint32 readID = 0,
287 uint32 libraryID = 0) {
288 _readID = readID; assert(_readID == readID);
289 _libraryID = libraryID; assert(_libraryID == libraryID);
290 _assignment = 0;
291 _assignmentScore = 0;
292
293 _unused = 0;
294 _mSegm = 0;
295 _mByte = 0;
296 };
297
sqRead_readID(void)298 uint32 sqRead_readID(void) { return(_readID); };
sqRead_libraryID(void)299 uint32 sqRead_libraryID(void) { return(_libraryID); };
300
sqRead_assignment(void)301 uint32 sqRead_assignment(void) { return(_assignment); };
sqRead_assignmentScore(void)302 uint32 sqRead_assignmentScore(void) { return(_assignmentScore); };
303
sqRead_mSegm(void)304 uint64 sqRead_mSegm(void) { return(_mSegm); };
sqRead_mByte(void)305 uint64 sqRead_mByte(void) { return(_mByte); };
306
sqRead_setPosition(uint64 mSegm,uint64 mByte)307 void sqRead_setPosition(uint64 mSegm,
308 uint64 mByte) {
309 _mSegm = mSegm; assert(_mSegm == mSegm); // Check that mSegm and mByte are in
310 _mByte = mByte; assert(_mByte == mByte); // the range that can be stored.
311 };
312
313 private:
314 uint64 _readID : 30; // 1 billion
315 uint64 _libraryID : 12; // 4 thousand
316 uint64 _assignment : 15; // 32 thousand
317 uint64 _assignmentScore : 7; // 128
318 #if 30 + 12 + 15 + 7 != 64
319 #error sqReadMeta bits #1 are the wrong size.
320 #endif
321
322 uint64 _unused : 8; // -- FOR FUTURE USE
323 uint64 _mSegm : 16; // 64 thousand files - Pointer to the blobs file we are in.
324 uint64 _mByte : 40; // 1 TB - Pointer to the blob data in the blobs file.
325 #if 8 + 16 + 40 != 64
326 #error sqReadMeta bits #2 are the wrong size.
327 #endif
328 };
329
330
331
332 // In core read representation. Only instantiated as needed, and sequence data is only loaded
333 // as requested.
334 //
335 // This is really the old sqReadData structure. It just got promoted.
336 //
337 class sqRead {
338 public:
~sqRead()339 ~sqRead() {
340 delete [] _metaA;
341 delete [] _rseqA;
342
343 delete [] _blob;
344
345 delete [] _name;
346 delete [] _rawBases;
347 delete [] _corBases;
348 delete [] _retBases;
349 };
350
sqRead_readID(void)351 uint32 sqRead_readID(void) { return(_meta->sqRead_readID()); };
sqRead_libraryID(void)352 uint32 sqRead_libraryID(void) { return(_meta->sqRead_libraryID()); };
sqRead_library(void)353 sqLibrary *sqRead_library(void) { return(_library); };
sqRead_name(void)354 char *sqRead_name(void) { return(_name); };
355
356 // Like sqStore, return 0 for reads we shouldn't use.
357 uint32 sqRead_length (sqRead_which w=sqRead_defaultVersion) {
358 sqReadSeq *seq = sqRead_getSeq(w);
359
360 if (w & sqRead_trimmed)
361 return(((seq == NULL) ||
362 (seq->sqReadSeq_trimmed() == false) ||
363 (seq->sqReadSeq_valid() == false) ||
364 (seq->sqReadSeq_ignoreT() == true)) ? 0 : (seq->sqReadSeq_clearEnd() - seq->sqReadSeq_clearBgn()));
365
366 else
367 return(((seq == NULL) ||
368 (seq->sqReadSeq_valid() == false) ||
369 (seq->sqReadSeq_ignoreU() == true)) ? 0 : (seq->sqReadSeq_length()));
370 };
371
372 uint32 sqRead_clearBgn(sqRead_which w=sqRead_defaultVersion) {
373 sqReadSeq *seq = sqRead_getSeq(w);
374
375 if (seq == NULL)
376 return(0);
377
378 if (w & sqRead_trimmed)
379 return(seq->sqReadSeq_clearBgn());
380 else
381 return(0);
382 };
383
384 uint32 sqRead_clearEnd(sqRead_which w=sqRead_defaultVersion) {
385 sqReadSeq *seq = sqRead_getSeq(w);
386
387 if (seq == NULL)
388 return(0);
389
390 if (w & sqRead_trimmed)
391 return(seq->sqReadSeq_clearBgn());
392 else
393 return(seq->sqReadSeq_length());
394 };
395
396 char *sqRead_sequence(sqRead_which w=sqRead_defaultVersion) {
397 bool comp = ((w & sqRead_compressed) == sqRead_unset) ? false : true;
398 bool trim = ((w & sqRead_trimmed) == sqRead_unset) ? false : true;
399
400 // Figure out which bases to return, either raw or corrected, and the
401 // length of that sequence. 'bases' must be a valid pointer (we're
402 // either loading raw or corrected bases, but the string can be length
403 // zero. In all cases, though, it must be NUL terminated.
404
405 sqReadSeq *seq = sqRead_getSeq(w);
406 char *bases = NULL;
407 uint32 basesLen = 0;
408
409 if (w & sqRead_raw) {
410 bases = _rawBases;
411 basesLen = _rawU->sqReadSeq_length();
412 }
413
414 if (w & sqRead_corrected) {
415 bases = _corBases;
416 basesLen = _corU->sqReadSeq_length();
417 }
418
419 assert(bases != NULL);
420 assert(bases[basesLen] == 0);
421
422 // If neither compressed or trimmed, just return the sequence we already have.
423 //
424 if ((comp == false) &&
425 (trim == false)) {
426 return(bases);
427 }
428
429 // If not compressed but trimmed, copy the trimmed bases into
430 // temporary space and return that.
431 //
432 if ((comp == false) &&
433 (trim == true)) {
434 uint32 bgn = seq->sqReadSeq_clearBgn(); // Only valid if trimmed, do not make global!
435 uint32 end = seq->sqReadSeq_clearEnd();
436
437 resizeArray(_retBases, 0, _retBasesAlloc, end - bgn + 1, _raAct::doNothing);
438
439 memmove(_retBases, bases + bgn, end - bgn);
440 _retBases[end-bgn] = 0;
441
442 return(_retBases);
443 }
444
445 // Otherwise, we need to homopolymer compress the sequence. It'll be no
446 // longer than the uncompressed sequence, so we can allocate that much,
447 // instead of tracking down the actual length.
448
449 resizeArray(_retBases, 0, _retBasesAlloc, basesLen + 1, _raAct::doNothing);
450
451 homopolyCompress(bases, basesLen, _retBases);
452
453 // Trim the read if needed.
454 //
455 if (trim == true) {
456 uint32 bgn = seq->sqReadSeq_clearBgn(); // Only valid if trimmed, do not make global!
457 uint32 end = seq->sqReadSeq_clearEnd();
458
459 memmove(_retBases, _retBases + bgn, end - bgn);
460 _retBases[end-bgn] = 0;
461 }
462
463 // But either way, we return the same thing.
464 return(_retBases);
465 };
466
467 private:
468 void sqRead_fetchBlob(readBuffer *B);
469 void sqRead_decodeBlob(void);
470
471 private:
sqRead_getSeq(sqRead_which w)472 sqReadSeq *sqRead_getSeq(sqRead_which w) {
473
474 if (w == sqRead_unset) // If w is somehow set to the unset state,
475 w = sqRead_defaultVersion; // go back to using the global default.
476
477 bool isRaw = ((w & sqRead_raw) == sqRead_raw); // Return the sqReadSeq object
478 bool isCor = ((w & sqRead_corrected) == sqRead_corrected); // appropriate for the supplied
479 bool isTrm = ((w & sqRead_trimmed) == sqRead_trimmed); // sqRead_which.
480 bool isCmp = ((w & sqRead_compressed) == sqRead_compressed);
481
482 if ((isRaw == true) && (isCmp == false)) { return(_rawU); };
483 if ((isRaw == true) && (isCmp == true)) { return(_rawC); };
484
485 if ((isCor == true) && (isCmp == false)) { return(_corU); };
486 if ((isCor == true) && (isCmp == true)) { return(_corC); };
487
488 fprintf(stderr, "sqRead_getSeq()-- Unknown which '%s'\n", toString(w));
489
490 assert(0);
491 return(NULL);
492 };
493
494 private:
495 sqReadMeta *_meta = nullptr; // Pointers to arrays owned by sqStore.
496 sqReadSeq *_rawU = nullptr;
497 sqReadSeq *_rawC = nullptr;
498 sqReadSeq *_corU = nullptr;
499 sqReadSeq *_corC = nullptr;
500
501 sqReadMeta *_metaA = nullptr; // When loading without a store, these
502 sqReadSeq *_rseqA = nullptr; // hold the data for the above pointers.
503
504 sqLibrary *_library = nullptr;
505
506 bool _blobLoaded = false;
507
508 char _blobName[4] = {0};
509 uint32 _blobLen = 0;
510 uint32 _blobMax = 0;
511 uint8 *_blob = nullptr;
512
513 uint32 _nameAlloc = 0;
514 char *_name = nullptr;
515
516 uint32 _rawBasesAlloc = 0; // The raw sequence, as loaded from disk.
517 char *_rawBases = nullptr;
518
519 uint32 _corBasesAlloc = 0; // The corrected sequence, as loaded from disk.
520 char *_corBases = nullptr;
521
522 sqRead_which _retFlags = sqRead_unset; // Remember what the returned sequence is.
523 uint32 _retBasesAlloc = 0; // Scratch space for computing trimmed and
524 char *_retBases = nullptr; // compressed sequences to return to the user.
525
526 friend class sqStore;
527 friend class sqCache;
528 friend class sqReadDataWriter;
529 };
530
531
532
533
534 class sqReadDataWriter {
535 public:
536 sqReadDataWriter(sqReadMeta *meta=NULL,
537 sqReadSeq *rawu=NULL,
538 sqReadSeq *rawc=NULL,
539 sqReadSeq *coru=NULL,
540 sqReadSeq *corc=NULL) {
541 _meta = meta;
542 _rawU = rawu;
543 _rawC = rawc;
544 _corU = coru;
545 _corC = corc;
546
547 _charMap['A'] = _charMap['a'] = 'A';
548 _charMap['C'] = _charMap['c'] = 'C';
549 _charMap['G'] = _charMap['g'] = 'G';
550 _charMap['T'] = _charMap['t'] = 'T';
551 _charMap['U'] = _charMap['u'] = 'T'; // Map U to T.
552 _charMap['N'] = _charMap['n'] = 'N';
553 };
554
~sqReadDataWriter()555 ~sqReadDataWriter() {
556 delete [] _name;
557 delete [] _rawBases;
558 delete [] _corBases;
559 };
560
561 public:
562 void sqReadDataWriter_importData(sqRead *read);
563
564 void sqReadDataWriter_setName(const char *N);
565 void sqReadDataWriter_setRawBases(const char *S, uint32 Slen);
566 void sqReadDataWriter_setCorrectedBases(const char *S, uint32 Slen);
567
568 void sqReadDataWriter_writeBlob(writeBuffer *buffer);
569
sqReadDataWriter_getRawLength(bool compressed)570 uint32 sqReadDataWriter_getRawLength(bool compressed) {
571 return((compressed == false) ? _rawU->sqReadSeq_length() : _rawC->sqReadSeq_length());
572 };
573
sqReadDataWriter_getCorrectedLength(bool compressed)574 uint32 sqReadDataWriter_getCorrectedLength(bool compressed) {
575 return((compressed == false) ? _corU->sqReadSeq_length() : _corC->sqReadSeq_length());
576 };
577
578 private:
579 sqReadMeta *_meta = nullptr; // Pointer to the read metadata.
580 sqReadSeq *_rawU = nullptr; // In sqStoreCreate, these pointers are
581 sqReadSeq *_rawC = nullptr; // back to the data in sqStore itself.
582 sqReadSeq *_corU = nullptr;
583 sqReadSeq *_corC = nullptr;
584
585 uint32 _nameAlloc = 0;
586 uint32 _nameLen = 0;
587 char *_name = nullptr;
588
589 uint32 _rawBasesAlloc = 0; // The raw sequence. The length is the
590 uint32 _rawBasesLen = 0; // length of the array, not of the string.
591 char *_rawBases = nullptr;
592
593 uint32 _corBasesAlloc = 0; // The corrected sequence. The length is the
594 uint32 _corBasesLen = 0; // length of the array, not of the string.
595 char *_corBases = nullptr;
596
597 char _charMap[256] = { 0 };
598
599 friend class sqStore;
600 friend class sqStoreBlobWriter;
601 };
602
603
604
605 #endif // SQREAD_H
606